diff --git a/config/arch.aarch64 b/config/arch.aarch64
index 866b7e69ba..eac9e22b8c 100644
--- a/config/arch.aarch64
+++ b/config/arch.aarch64
@@ -11,7 +11,7 @@
   fi
 
 # TARGET_CPU:
-# generic cortex-a35 cortex-a53 cortex-a57 cortex-a72
+# generic cortex-a35 cortex-a53 cortex-a57 cortex-a72 cortex-a76
 # exynos-m1 qdf24xx thunderx xgene1 cortex-a57.cortex-a53
 # cortex-a72.cortex-a53
 
@@ -21,6 +21,10 @@
       TARGET_SUBARCH=aarch64
       TARGET_VARIANT=armv8-a
       ;;
+    cortex-a76)
+      TARGET_SUBARCH=aarch64
+      TARGET_VARIANT=armv8.2-a
+      ;;
   esac
 
   TARGET_GCC_ARCH=${TARGET_SUBARCH/-}
diff --git a/config/noobs/partition_setup.sh b/config/noobs/partition_setup.sh
index 36e8cb6e92..c0dd86349a 100755
--- a/config/noobs/partition_setup.sh
+++ b/config/noobs/partition_setup.sh
@@ -66,7 +66,7 @@ fi
 
 # create bootloader configuration
   echo "creating bootloader configuration..."
-  echo "boot=$id1 disk=$id2 quiet" > $MOUNTPOINT/cmdline.txt
+  echo "boot=$id1 disk=$id2 quiet @EXTRA_CMDLINE@" > $MOUNTPOINT/cmdline.txt
 
 # cleanup mountpoint
   umount $MOUNTPOINT
diff --git a/licenses/FLIRC.txt b/licenses/FLIRC.txt
new file mode 100644
index 0000000000..d135582636
--- /dev/null
+++ b/licenses/FLIRC.txt
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2023 Flirc Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above
+ *       copyright notice, this list of conditions and the following
+ *       disclaimer in the documentation and/or other materials
+ *       provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY FLIRC INC. ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL FLIRC INC. OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ * DAMAGE.
+ *
+ * The views and conclusions contained in the software and documentation
+ * are those of the authors and should not be interpreted as representing
+ * official policies, either expressed or implied, of Flirc Inc.
+ */
diff --git a/licenses/HIDAPI-orig.txt b/licenses/HIDAPI-orig.txt
new file mode 100644
index 0000000000..e3f3380829
--- /dev/null
+++ b/licenses/HIDAPI-orig.txt
@@ -0,0 +1,9 @@
+ HIDAPI - Multi-Platform library for
+ communication with HID devices.
+
+ Copyright 2009, Alan Ott, Signal 11 Software.
+ All Rights Reserved.
+ 
+ This software may be used by anyone for any reason so
+ long as the copyright notice in the source files
+ remains intact.
diff --git a/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk b/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk
index eaafbf45c7..64dc139cac 100644
--- a/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk
+++ b/packages/addons/addon-depends/dotnet-runtime-depends/aspnet6-runtime/package.mk
@@ -2,7 +2,7 @@
 # Copyright (C) 2022-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="aspnet6-runtime"
-PKG_VERSION="6.0.20"
+PKG_VERSION="6.0.24"
 PKG_LICENSE="MIT"
 PKG_SITE="https://dotnet.microsoft.com/"
 PKG_DEPENDS_TARGET="toolchain"
@@ -11,16 +11,16 @@ PKG_TOOLCHAIN="manual"
 
 case "${ARCH}" in
   "aarch64")
-    PKG_SHA256="dd1898babdba27c57338b17afd4513a53025dec0985047d030336aab65532e26"
-    PKG_URL="https://download.visualstudio.microsoft.com/download/pr/a8a1a993-ddd9-4bcd-8386-d9defcf0fd29/4b471f72c8253fa1462ea923d0fe39a2/aspnetcore-runtime-6.0.20-linux-arm64.tar.gz"
+    PKG_SHA256="ee6b660b3c8b3fb88eb64690ac78a47752dae68c21647fccdc5f810bc68829ab"
+    PKG_URL="https://download.visualstudio.microsoft.com/download/pr/d562ba2b-8e2c-48e5-9853-f8616a9cb4e4/f4e251ba67b718083c28017e3b0c6349/aspnetcore-runtime-6.0.24-linux-arm64.tar.gz"
     ;;
   "arm")
-    PKG_SHA256="f26a0f36339056d65522254c4bf333c940abc3dee907d4219a64cc1456b63fe3"
-    PKG_URL="https://download.visualstudio.microsoft.com/download/pr/872ccb13-fbc4-4d75-9d8f-be3fec5581ef/add2199206c438835b7b48a6d061b023/aspnetcore-runtime-6.0.20-linux-arm.tar.gz"
+    PKG_SHA256="634b0ecd7312e8a46adedcbff6e1b23e514fa153f7135a6b9f6aefb5851f9d88"
+    PKG_URL="https://download.visualstudio.microsoft.com/download/pr/9c00fe25-e1e0-4390-9061-77d07e95356f/09886ffeaed522c3fa8803e879ce070c/aspnetcore-runtime-6.0.24-linux-arm.tar.gz"
     ;;
   "x86_64")
-    PKG_SHA256="88afcf5b6434c6a4ee12488d8bc13f84c15191712d12eb9646cf3642b9c01e86"
-    PKG_URL="https://download.visualstudio.microsoft.com/download/pr/972dc929-4c16-4456-a7c8-64014f80678d/a3b62252f98a0d7e0c0a9a01ede18776/aspnetcore-runtime-6.0.20-linux-x64.tar.gz"
+    PKG_SHA256="022dc914af7490bcd2d885edeb5d4c1faa4b771b503b8059d5181f130191cf2c"
+    PKG_URL="https://download.visualstudio.microsoft.com/download/pr/8f5a65c0-9bc8-497d-9ce2-4658c461dc55/b6c01c3cd060552d987501ba6bbde09f/aspnetcore-runtime-6.0.24-linux-x64.tar.gz"
     ;;
 esac
 PKG_SOURCE_NAME="aspnetcore-runtime_${PKG_VERSION}_${ARCH}.tar.gz"
diff --git a/packages/addons/addon-depends/hidapi/package.mk b/packages/addons/addon-depends/hidapi/package.mk
new file mode 100644
index 0000000000..bee13ef94b
--- /dev/null
+++ b/packages/addons/addon-depends/hidapi/package.mk
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2023-present Team LibreELEC (https://libreelec.tv)
+
+PKG_NAME="hidapi"
+PKG_VERSION="0.14.0"
+PKG_SHA256="a5714234abe6e1f53647dd8cba7d69f65f71c558b7896ed218864ffcf405bcbd"
+PKG_LICENSE="HIDAPI-orig"
+PKG_SITE="http://libusb.info/"
+PKG_URL="https://github.com/libusb/hidapi/archive/refs/tags/hidapi-${PKG_VERSION}.tar.gz"
+PKG_DEPENDS_TARGET="toolchain libusb"
+PKG_LONGDESC="HIDAPI is a multi-platform library which allows an application to interface with USB and Bluetooth HID-Class devices."
+PKG_TOOLCHAIN="cmake"
diff --git a/packages/addons/addon-depends/libzip/package.mk b/packages/addons/addon-depends/libzip/package.mk
index 919470a2d8..ac6129091a 100644
--- a/packages/addons/addon-depends/libzip/package.mk
+++ b/packages/addons/addon-depends/libzip/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="libzip"
-PKG_VERSION="1.9.2"
-PKG_SHA256="c93e9852b7b2dc931197831438fee5295976ee0ba24f8524a8907be5c2ba5937"
+PKG_VERSION="1.10.1"
+PKG_SHA256="dc3c8d5b4c8bbd09626864f6bcf93de701540f761d76b85d7c7d710f4bd90318"
 PKG_LICENSE="GPL"
 PKG_SITE="https://libzip.org/"
 PKG_URL="https://libzip.org/download/${PKG_NAME}-${PKG_VERSION}.tar.xz"
@@ -16,6 +16,7 @@ PKG_CMAKE_OPTS_TARGET="-DENABLE_COMMONCRYPTO=OFF \
                        -DENABLE_MBEDTLS=OFF \
                        -DENABLE_OPENSSL=OFF \
                        -DENABLE_WINDOWS_CRYPTO=OFF \
+                       -DENABLE_ZSTD=OFF \
                        -DBUILD_TOOLS=OFF \
                        -DBUILD_REGRESS=OFF \
                        -DBUILD_EXAMPLES=OFF \
diff --git a/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk b/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk
index 97927441d6..229961b233 100644
--- a/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk
+++ b/packages/addons/addon-depends/multimedia-tools-depends/depends/libmediainfo/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="libmediainfo"
-PKG_VERSION="22.12"
-PKG_SHA256="0fc6d32f06d6ce5e144074d2e57e0db8dfa4e38e752d3123ada27ccaf89634bc"
+PKG_VERSION="23.07"
+PKG_SHA256="60456c8b2ab8769a6081d96fd7be86db4fe32520e4a022397cb22cacf47ce820"
 PKG_LICENSE="GPL"
 PKG_SITE="https://mediaarea.net/en/MediaInfo/Download/Source"
 PKG_URL="https://mediaarea.net/download/source/libmediainfo/${PKG_VERSION}/libmediainfo_${PKG_VERSION}.tar.xz"
diff --git a/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk b/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk
index 603bfbbbaf..bb354ad42e 100644
--- a/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk
+++ b/packages/addons/addon-depends/multimedia-tools-depends/depends/libzen/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="libzen"
-PKG_VERSION="0.4.40"
-PKG_SHA256="0c2e1c7302b3ee260d34b52e4b16ab655bdf021db8c14653e418aced46eb24a7"
+PKG_VERSION="0.4.41"
+PKG_SHA256="933bad3b7ecd29dc6bdc88a83645c83dfd098c15b0b90d6177a37fa1536704e8"
 PKG_LICENSE="GPL"
 PKG_SITE="https://mediaarea.net/en/MediaInfo/"
 PKG_URL="https://mediaarea.net/download/source/libzen/${PKG_VERSION}/libzen_${PKG_VERSION}.tar.xz"
diff --git a/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk b/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk
index 67d98b8d09..1d371399b4 100644
--- a/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk
+++ b/packages/addons/addon-depends/multimedia-tools-depends/mediainfo/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="mediainfo"
-PKG_VERSION="22.12"
-PKG_SHA256="932b82739f738e7db603cf5bb170720731a9e7c61d145c2a54aabb3cd0b753bc"
+PKG_VERSION="23.07"
+PKG_SHA256="b6d7da9e29995fd34a22100825b843e74c32c7bc67adb01166b1beedea49f5d0"
 PKG_LICENSE="GPL"
 PKG_SITE="https://mediaarea.net/en/MediaInfo/Download/Source"
 PKG_URL="https://mediaarea.net/download/source/mediainfo/${PKG_VERSION}/mediainfo_${PKG_VERSION}.tar.xz"
diff --git a/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk b/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk
index d7bcdc1797..774181eb02 100644
--- a/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk
+++ b/packages/addons/addon-depends/multimedia-tools-depends/mpv-drmprime/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="mpv-drmprime"
-PKG_VERSION="0.35.1"
-PKG_SHA256="41df981b7b84e33a2ef4478aaf81d6f4f5c8b9cd2c0d337ac142fc20b387d1a9"
+PKG_VERSION="0.36.0"
+PKG_SHA256="29abc44f8ebee013bb2f9fe14d80b30db19b534c679056e4851ceadf5a5e8bf6"
 PKG_LICENSE="GPL"
 PKG_SITE="https://mpv.io/"
 PKG_URL="https://github.com/mpv-player/mpv/archive/v${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/addon-depends/opus/package.mk b/packages/addons/addon-depends/opus/package.mk
index 06881c6bca..572f9019aa 100644
--- a/packages/addons/addon-depends/opus/package.mk
+++ b/packages/addons/addon-depends/opus/package.mk
@@ -2,11 +2,11 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="opus"
-PKG_VERSION="1.3.1"
-PKG_SHA256="65b58e1e25b2a114157014736a3d9dfeaad8d41be1c8179866f144a2fb44ff9d"
+PKG_VERSION="1.4"
+PKG_SHA256="c9b32b4253be5ae63d1ff16eea06b94b5f0f2951b7a02aceef58e3a3ce49c51f"
 PKG_LICENSE="BSD"
 PKG_SITE="http://www.opus-codec.org"
-PKG_URL="https://archive.mozilla.org/pub/opus/${PKG_NAME}-${PKG_VERSION}.tar.gz"
+PKG_URL="https://github.com/xiph/opus/releases/download/v${PKG_VERSION}/${PKG_NAME}-${PKG_VERSION}.tar.gz"
 PKG_DEPENDS_TARGET="toolchain"
 PKG_LONGDESC="Codec designed for interactive speech and audio transmission over the Internet."
 PKG_TOOLCHAIN="configure"
diff --git a/packages/addons/addon-depends/pngquant/package.mk b/packages/addons/addon-depends/pngquant/package.mk
index 1e8f8ff1d5..e92d89f636 100644
--- a/packages/addons/addon-depends/pngquant/package.mk
+++ b/packages/addons/addon-depends/pngquant/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pngquant"
-PKG_VERSION="2.17.0"
-PKG_SHA256="a27cf0e64db499ccb3ddae9b36036e881f78293e46ec27a9e7a86a3802fcda66"
+PKG_VERSION="2.18.0"
+PKG_SHA256="e72194b52b36f040deaec49a1ddd5dcd8d4feecc3a5fe6c5e9589a9707b233d4"
 PKG_LICENSE="GPLv3"
 PKG_SITE="https://pngquant.org"
 PKG_URL="https://pngquant.org/pngquant-${PKG_VERSION}-src.tar.gz"
diff --git a/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk b/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk
index 33b1d61bfb..13b6596201 100644
--- a/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk
+++ b/packages/addons/addon-depends/rsyslog-depends/libfastjson/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="libfastjson"
-PKG_VERSION="0.99.9"
-PKG_SHA256="a330e1bdef3096b7ead53b4bad1a6158f19ba9c9ec7c36eda57de7729d84aaee"
+PKG_VERSION="1.2304.0"
+PKG_SHA256="ef30d1e57a18ec770f90056aaac77300270c6203bbe476f4181cc83a2d5dc80c"
 PKG_LICENSE="GPL"
 PKG_SITE="https://www.rsyslog.com/tag/libfastjson"
 PKG_URL="https://download.rsyslog.com/libfastjson/${PKG_NAME}-${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/addon-depends/system-tools-depends/bottom/package.mk b/packages/addons/addon-depends/system-tools-depends/bottom/package.mk
index e724304e57..ade916dd76 100644
--- a/packages/addons/addon-depends/system-tools-depends/bottom/package.mk
+++ b/packages/addons/addon-depends/system-tools-depends/bottom/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2020-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="bottom"
-PKG_VERSION="0.9.3"
-PKG_SHA256="53a1466c3d2ed8f38401e8929cf2da796e703e4d70339d215f855b2304c07f72"
+PKG_VERSION="0.9.4"
+PKG_SHA256="199123ef354bcabaa8a2e3b7b477b324f5b647d503a2599d08296733846eea6e"
 PKG_LICENSE="MIT"
 PKG_SITE="https://github.com/ClementTsang/bottom"
 PKG_URL="https://github.com/ClementTsang/bottom/archive/${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk b/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk
index cfb9a6647b..806c1d07ff 100644
--- a/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk
+++ b/packages/addons/addon-depends/system-tools-depends/depends/libmtp/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="libmtp"
-PKG_VERSION="1.1.20"
-PKG_SHA256="c9191dac2f5744cf402e08641610b271f73ac21a3c802734ec2cedb2c6bc56d0"
+PKG_VERSION="1.1.21"
+PKG_SHA256="c4ffa5ab8c8f48c91b0047f2e253c101c418d5696a5ed65c839922a4280872a7"
 PKG_LICENSE="GPL"
 PKG_SITE="http://libmtp.sourceforge.net/"
 PKG_URL="${SOURCEFORGE_SRC}/project/${PKG_NAME}/${PKG_NAME}/${PKG_VERSION}/${PKG_NAME}-${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/addon-depends/system-tools-depends/mc/package.mk b/packages/addons/addon-depends/system-tools-depends/mc/package.mk
index 40d0e8c829..49b78d3082 100644
--- a/packages/addons/addon-depends/system-tools-depends/mc/package.mk
+++ b/packages/addons/addon-depends/system-tools-depends/mc/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="mc"
-PKG_VERSION="4.8.29"
-PKG_SHA256="01d8a3b94f58180cca5bf17257b5078d1fd6fd27a9b5c0e970ec767549540ad4"
+PKG_VERSION="4.8.30"
+PKG_SHA256="5ebc3cb2144b970c5149fda556c4ad50b78780494696cdf2d14a53204c95c7df"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.midnight-commander.org"
 PKG_URL="http://ftp.midnight-commander.org/mc-${PKG_VERSION}.tar.xz"
diff --git a/packages/addons/addon-depends/system-tools-depends/screen/package.mk b/packages/addons/addon-depends/system-tools-depends/screen/package.mk
index 2c8abfd88a..abac4b4d47 100644
--- a/packages/addons/addon-depends/system-tools-depends/screen/package.mk
+++ b/packages/addons/addon-depends/system-tools-depends/screen/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="screen"
-PKG_VERSION="4.9.0"
-PKG_SHA256="f9335281bb4d1538ed078df78a20c2f39d3af9a4e91c57d084271e0289c730f4"
+PKG_VERSION="4.9.1"
+PKG_SHA256="26cef3e3c42571c0d484ad6faf110c5c15091fbf872b06fa7aa4766c7405ac69"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.gnu.org/software/screen/"
 PKG_URL="http://ftpmirror.gnu.org/screen/${PKG_NAME}-${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk b/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk
index 6f8efef13c..ecb78ff7c1 100644
--- a/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk
+++ b/packages/addons/addon-depends/system-tools-depends/smartmontools/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="smartmontools"
-PKG_VERSION="7.3"
-PKG_SHA256="a544f8808d0c58cfb0e7424ca1841cb858a974922b035d505d4e4c248be3a22b"
+PKG_VERSION="7.4"
+PKG_SHA256="e9a61f641ff96ca95319edfb17948cd297d0cd3342736b2c49c99d4716fb993d"
 PKG_LICENSE="GPL"
 PKG_SITE="https://www.smartmontools.org"
 PKG_URL="https://downloads.sourceforge.net/sourceforge/smartmontools/smartmontools-${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk b/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk
index 5dda3fe52b..dd5abf908b 100644
--- a/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk
+++ b/packages/addons/addon-depends/system-tools-depends/stress-ng/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="stress-ng"
-PKG_VERSION="0.16.02"
-PKG_SHA256="71ac375826cc58dcbcf5f1609959ed1a5afd71192c52025b5cb273baa3df2317"
+PKG_VERSION="0.16.04"
+PKG_SHA256="3453719508e9e02c57a736c154408538372d078be7dcf8e0165d37a821cdba45"
 PKG_LICENSE="GPLv2"
 PKG_SITE="https://github.com/ColinIanKing/stress-ng"
 PKG_URL="https://github.com/ColinIanKing/stress-ng/archive/refs/tags/V${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/addon-depends/system-tools-depends/stress-ng/patches/stress-ng-0001-workaround-cross-compilation-issue.patch b/packages/addons/addon-depends/system-tools-depends/stress-ng/patches/stress-ng-0001-workaround-cross-compilation-issue.patch
deleted file mode 100644
index cc4a558133..0000000000
--- a/packages/addons/addon-depends/system-tools-depends/stress-ng/patches/stress-ng-0001-workaround-cross-compilation-issue.patch
+++ /dev/null
@@ -1,11 +0,0 @@
---- a/Makefile.config	2023-07-13 07:58:21.000000000 +0000
-+++ b/Makefile.config	2023-07-16 05:05:35.766646855 +0000
-@@ -311,7 +311,7 @@
- compiler: configdir
- 	@echo "checking compiler ..."
- 	@$(CC) test/test-compiler.c -o test/test-compiler
--	@echo "" > $(CONFIGS)/$$(./test/test-compiler)
-+	@echo "" > $(CONFIGS)/HAVE_COMPILER_GCC
- 	@rm -f test/test-compiler
- 	$(call check,test-glibc,HAVE_GLIBC,using glibc)
- 
diff --git a/packages/addons/service/librespot/changelog.txt b/packages/addons/service/librespot/changelog.txt
index df2757fa99..ae4ccbae71 100644
--- a/packages/addons/service/librespot/changelog.txt
+++ b/packages/addons/service/librespot/changelog.txt
@@ -2,3 +2,5 @@
 - update librespot to githash 03b547d (2023-04-16)
 2
 - update librespot to githash c964102 (2023-05-14)
+3
+- update librespot to githash f037e46 (2023-07-19)
diff --git a/packages/addons/service/librespot/package.mk b/packages/addons/service/librespot/package.mk
index bcaf03d544..0059e8f4b2 100644
--- a/packages/addons/service/librespot/package.mk
+++ b/packages/addons/service/librespot/package.mk
@@ -3,10 +3,10 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="librespot"
-PKG_VERSION="c964102a349589d644baef5f43a566d6d1e151f1"
-PKG_VERSION_DATE="2023-05-14"
-PKG_SHA256="3bc6782d8796253040d995043fba4e6f6c71ff902da110b50398071e082b1930"
-PKG_REV="2"
+PKG_VERSION="f037e46aee631837a0553ccfdbc7866752fd0f5d"
+PKG_VERSION_DATE="2023-07-19"
+PKG_SHA256="72ec541fda77ea6a2132dd67f68a89437bfc13513481d5abbdde535976fc60c3"
+PKG_REV="3"
 PKG_ARCH="any"
 PKG_LICENSE="MIT"
 PKG_SITE="https://github.com/librespot-org/librespot/"
diff --git a/packages/addons/service/mariadb/changelog.txt b/packages/addons/service/mariadb/changelog.txt
index 3a0ffce00b..7c7ea4e5f2 100644
--- a/packages/addons/service/mariadb/changelog.txt
+++ b/packages/addons/service/mariadb/changelog.txt
@@ -1,3 +1,6 @@
+2
+- mariadb: update to 10.11.5
+
 1
 - include mariadb-upgrade and mariadb-check
   copy mariadb* binaries and make symbolic links to mysql*
diff --git a/packages/addons/service/mariadb/package.mk b/packages/addons/service/mariadb/package.mk
index b25ec2b9ad..c88e713af2 100644
--- a/packages/addons/service/mariadb/package.mk
+++ b/packages/addons/service/mariadb/package.mk
@@ -2,9 +2,9 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="mariadb"
-PKG_VERSION="10.11.2"
-PKG_REV="1"
-PKG_SHA256="1c89dee0caed0f68bc2a1d203eb98a123150e6a179f6ee0f1fc0ba3f08dc71dc"
+PKG_VERSION="10.11.5"
+PKG_REV="2"
+PKG_SHA256="4c9484048d4d0c71dd076ab33fc2a9ce8510bdf762886de0d63fe52496f3dbbb"
 PKG_LICENSE="GPL2"
 PKG_SITE="https://mariadb.org"
 PKG_URL="https://downloads.mariadb.com/MariaDB/${PKG_NAME}-${PKG_VERSION}/source/${PKG_NAME}-${PKG_VERSION}.tar.gz"
diff --git a/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch b/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch
index dc8c6d842d..721ab3735d 100644
--- a/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch
+++ b/packages/addons/service/mariadb/patches/mariadb-0001-disable-plugin-auth-pam.patch
@@ -11,7 +11,7 @@ diff --git a/cmake/build_configurations/mysql_release.cmake b/cmake/build_config
 index 37a6c45..e2a4ba8 100644
 --- a/cmake/build_configurations/mysql_release.cmake
 +++ b/cmake/build_configurations/mysql_release.cmake
-@@ -124,7 +124,7 @@ ENDIF()
+@@ -147,7 +147,7 @@ ENDIF()
  
  IF(UNIX)
    SET(WITH_EXTRA_CHARSETS all CACHE STRING "")
@@ -19,7 +19,7 @@ index 37a6c45..e2a4ba8 100644
 +  SET(PLUGIN_AUTH_PAM NO CACHE BOOL "")
  
    IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-     IF(NOT IGNORE_AIO_CHECK)
+     FIND_PACKAGE(URING)
 -- 
 2.7.4
 
diff --git a/packages/addons/service/net-snmp/changelog.txt b/packages/addons/service/net-snmp/changelog.txt
index 927b2407d4..df8d025e2c 100644
--- a/packages/addons/service/net-snmp/changelog.txt
+++ b/packages/addons/service/net-snmp/changelog.txt
@@ -1 +1,2 @@
-initial release
+1
+- net-snmp: update to 5.9.4
diff --git a/packages/addons/service/net-snmp/package.mk b/packages/addons/service/net-snmp/package.mk
index 64c06edf4d..d529317bad 100644
--- a/packages/addons/service/net-snmp/package.mk
+++ b/packages/addons/service/net-snmp/package.mk
@@ -2,9 +2,9 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="net-snmp"
-PKG_VERSION="5.9.3"
-PKG_SHA256="2097f29b7e1bf3f1300b4bae52fa2308d0bb8d5d3998dbe02f9462a413a2ef0a"
-PKG_REV="0"
+PKG_VERSION="5.9.4"
+PKG_SHA256="8b4de01391e74e3c7014beb43961a2d6d6fa03acc34280b9585f4930745b0544"
+PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="BSD"
 PKG_SITE="http://www.net-snmp.org"
diff --git a/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch b/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch
index 45dfcf9f09..ce6bf1136b 100644
--- a/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch
+++ b/packages/addons/service/net-snmp/patches/net-snmp-0002-net-snmp-create-v3-user.in.patch
@@ -1,14 +1,14 @@
 --- net-snmp-5.9/net-snmp-create-v3-user.in	2020-08-14 21:41:47.000000000 +0000
 +++ net-snmp-5.9/net-snmp-create-v3-user.in	2021-01-14 07:04:26.196982169 +0000
-@@ -5,10 +5,8 @@
+@@ -3,10 +3,8 @@
  # this shell script is designed to add new SNMPv3 users
  # to Net-SNMP config file.
  
--if @PSCMD@ | egrep ' snmpd *$' > /dev/null 2>&1 ; then
--    echo "Apparently at least one snmpd demon is already running."
+-if @PSCMD@ | @EGREP@ ' snmpd *$' > /dev/null 2>&1 ; then
+-    echo "Apparently at least one snmpd daemon is already running."
 -    echo "You must stop them in order to use this command."
 -    exit 1
-+if @PSCMD@ | egrep 'snmpd'> /dev/null 2>&1 ; then
++if @PSCMD@ | @EGREP@ 'snmpd' > /dev/null 2>&1 ; then
 +    systemctl stop service.net-snmp.service
  fi
  
diff --git a/packages/addons/service/net-snmp/patches/net-snmp-0003-config.sub.patch b/packages/addons/service/net-snmp/patches/net-snmp-0003-config.sub.patch
deleted file mode 100644
index 0da33713be..0000000000
--- a/packages/addons/service/net-snmp/patches/net-snmp-0003-config.sub.patch
+++ /dev/null
@@ -1,34 +0,0 @@
-diff -ur net-snmp-5.7.3/config.sub net-snmp-5.7.3.new/config.sub
---- net-snmp-5.7.3/config.sub	2014-12-08 21:23:22.000000000 +0100
-+++ net-snmp-5.7.3.new/config.sub	2017-11-09 18:39:31.638689732 +0100
-@@ -247,10 +247,11 @@ case $basic_machine in
- 	# Some are omitted here because they have special meanings below.
- 	1750a | 580 \
- 	| a29k \
-+	| aarch64 | aarch64_be \
- 	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
- 	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
- 	| am33_2.0 \
--	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
-+	| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
- 	| bfin \
- 	| c4x | clipper \
- 	| d10v | d30v | dlx | dsp16xx \
-@@ -339,6 +340,7 @@ case $basic_machine in
- 	# Recognize the basic CPU types with company name.
- 	580-* \
- 	| a29k-* \
-+	| aarch64-* | aarch64_be-* \
- 	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
- 	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
- 	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
-@@ -1171,6 +1173,9 @@ case $basic_machine in
- 		basic_machine=hppa1.1-winbond
- 		os=-proelf
- 		;;
-+	x64)
-+		basic_machine=x86_64-pc
-+		;;
- 	xbox)
- 		basic_machine=i686-pc
- 		os=-mingw32
diff --git a/packages/addons/service/nextpvr/changelog.txt b/packages/addons/service/nextpvr/changelog.txt
index 17d0b79bb9..99dd46be22 100644
--- a/packages/addons/service/nextpvr/changelog.txt
+++ b/packages/addons/service/nextpvr/changelog.txt
@@ -1,3 +1,6 @@
+5
+- download NextPVR 6.1.5
+
 4
 - download NextPVR 6.1.4
 
diff --git a/packages/addons/service/nextpvr/package.mk b/packages/addons/service/nextpvr/package.mk
index ab1c5a05cf..d04a05de10 100644
--- a/packages/addons/service/nextpvr/package.mk
+++ b/packages/addons/service/nextpvr/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2021-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="nextpvr"
-PKG_VERSION="6.1.4~Nexus"
-PKG_ADDON_VERSION="6.1.4~4"
+PKG_VERSION="6.1.5~Nexus"
+PKG_ADDON_VERSION="6.1.5~5"
 PKG_REV="0"
 PKG_ARCH="any"
 PKG_LICENSE="NextPVR"
diff --git a/packages/addons/service/rsyslog/changelog.txt b/packages/addons/service/rsyslog/changelog.txt
index 927b2407d4..ac32f50133 100755
--- a/packages/addons/service/rsyslog/changelog.txt
+++ b/packages/addons/service/rsyslog/changelog.txt
@@ -1 +1,3 @@
-initial release
+1
+- libfastjson: update to 1.2304.0
+- rsyslog: update to 8.2308.0
diff --git a/packages/addons/service/rsyslog/package.mk b/packages/addons/service/rsyslog/package.mk
index 369f3de094..b9969cdac7 100644
--- a/packages/addons/service/rsyslog/package.mk
+++ b/packages/addons/service/rsyslog/package.mk
@@ -2,9 +2,9 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="rsyslog"
-PKG_VERSION="8.2302.0"
-PKG_SHA256="25415f85b662615ce3c83077d53758029e8743cb5929044bfd3564e3d626a3b9"
-PKG_REV="0"
+PKG_VERSION="8.2308.0"
+PKG_SHA256="02086b9121e872cea69e5d0f6c8e2d8ebff33234b3cad5503665378d3af2e3c9"
+PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="https://github.com/rsyslog"
diff --git a/packages/addons/tools/btrfs-progs/changelog.txt b/packages/addons/tools/btrfs-progs/changelog.txt
index a195358cf9..a514ca0bf4 100644
--- a/packages/addons/tools/btrfs-progs/changelog.txt
+++ b/packages/addons/tools/btrfs-progs/changelog.txt
@@ -1,2 +1,5 @@
+2
+- btrfs-progs: update to 6.3.3
+
 1
 - btrfs-progs: update to 6.2.2
diff --git a/packages/addons/tools/btrfs-progs/package.mk b/packages/addons/tools/btrfs-progs/package.mk
index b2e32b4e51..aade248a60 100644
--- a/packages/addons/tools/btrfs-progs/package.mk
+++ b/packages/addons/tools/btrfs-progs/package.mk
@@ -2,9 +2,9 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="btrfs-progs"
-PKG_VERSION="6.2.2"
-PKG_SHA256="140d3d98f2cba4c7f05c16aec3038f044e11555a40c27a5006185c99a10c7ca2"
-PKG_REV="1"
+PKG_VERSION="6.3.3"
+PKG_SHA256="0e55374e448ad4d8876db9c676669bedc16cb763e2493b14c245df8c5d00064b"
+PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="https://btrfs.wiki.kernel.org/index.php/Main_Page"
diff --git a/packages/addons/tools/dotnet-runtime/changelog.txt b/packages/addons/tools/dotnet-runtime/changelog.txt
index 87c91c385a..de8e35617e 100644
--- a/packages/addons/tools/dotnet-runtime/changelog.txt
+++ b/packages/addons/tools/dotnet-runtime/changelog.txt
@@ -1,3 +1,9 @@
+7
+- aspnet6-runtime: update to 6.0.24
+
+6
+- aspnet6-runtime: update to 6.0.21
+
 5
 - aspnet6-runtime: update to 6.0.20
 
diff --git a/packages/addons/tools/dotnet-runtime/package.mk b/packages/addons/tools/dotnet-runtime/package.mk
index d8a2408414..839ab52aa4 100644
--- a/packages/addons/tools/dotnet-runtime/package.mk
+++ b/packages/addons/tools/dotnet-runtime/package.mk
@@ -2,7 +2,7 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="dotnet-runtime"
-PKG_REV="5"
+PKG_REV="7"
 PKG_ARCH="any"
 PKG_LICENSE="MIT"
 PKG_SITE="https://dotnet.microsoft.com/"
diff --git a/packages/addons/tools/flirc_util/changelog.txt b/packages/addons/tools/flirc_util/changelog.txt
new file mode 100644
index 0000000000..927b2407d4
--- /dev/null
+++ b/packages/addons/tools/flirc_util/changelog.txt
@@ -0,0 +1 @@
+initial release
diff --git a/packages/addons/tools/flirc_util/icon/icon.png b/packages/addons/tools/flirc_util/icon/icon.png
new file mode 100644
index 0000000000..8976d44345
Binary files /dev/null and b/packages/addons/tools/flirc_util/icon/icon.png differ
diff --git a/packages/addons/tools/flirc_util/package.mk b/packages/addons/tools/flirc_util/package.mk
new file mode 100644
index 0000000000..7b3febddf9
--- /dev/null
+++ b/packages/addons/tools/flirc_util/package.mk
@@ -0,0 +1,36 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2023-present Team LibreELEC (https://libreelec.tv)
+
+PKG_NAME="flirc_util"
+PKG_VERSION="8d3c86e8bb419ad44297c1b186f0cdc7dfcac915" # 30/10/2023
+PKG_SHA256="fc460e6ce5477cb6b83c90a5f8b2ebb9876ed23cdd813a6a4a0fdc3730052a2b"
+PKG_LICENSE="FLIRC"
+PKG_SITE="http://www.flirc.tv"
+PKG_URL="https://github.com/flirc/sdk/archive/${PKG_VERSION}.tar.gz"
+PKG_DEPENDS_TARGET="toolchain hidapi libusb"
+PKG_SECTION="tools"
+PKG_SHORTDESC="CLI utility for flirc IR receivers"
+PKG_LONGDESC="Command-Line utility for configuring flirc IR receivers"
+PKG_TOOLCHAIN="manual"
+
+PKG_IS_ADDON="yes"
+PKG_ADDON_NAME="flirc_util"
+PKG_ADDON_TYPE="xbmc.python.script"
+
+make_target() { # build step: runs make for the flirc_util goal from inside the cli/ subdirectory
+  cd cli # the relative ../libs path passed below is resolved from this directory
+  make VERBOSE="1" \
+       CONFIG="release" \
+       HOSTOS="LIBREELEC" \
+       MACHINE="Linux_${TARGET_ARCH}" \
+       BUILDDIR_ROOT="${PKG_BUILD}/build" \
+       BUILDDIR="${PKG_BUILD}/build" \
+       LSEARCH+=" -L../libs/Linux_${TARGET_ARCH}" \
+       flirc_util # make goal; addon() copies the result from ${PKG_BUILD}/build. NOTE(review): LSEARCH presumably feeds the linker search path - confirm in the upstream Makefile
+}
+
+addon() { # stage the add-on payload under ${ADDON_BUILD}/${PKG_ADDON_ID}
+  mkdir -p ${ADDON_BUILD}/${PKG_ADDON_ID}/{bin,lib}
+  cp -P ${PKG_BUILD}/build/flirc_util ${ADDON_BUILD}/${PKG_ADDON_ID}/bin/ # binary written to the BUILDDIR that make_target passes to make
+  cp -P $(get_install_dir hidapi)/usr/lib/libhidapi-hidraw.so* ${ADDON_BUILD}/${PKG_ADDON_ID}/lib/ # -P copies any symlinks matched by the glob as links instead of following them
+}
diff --git a/packages/addons/tools/flirc_util/source/default.py b/packages/addons/tools/flirc_util/source/default.py
new file mode 100644
index 0000000000..e833d8f43e
--- /dev/null
+++ b/packages/addons/tools/flirc_util/source/default.py
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+# Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
+
+import xbmcgui
+import subprocess  # NOTE(review): not referenced anywhere in this file
+
+# Show an OK dialog (empty heading) telling the user this addon is console-only.
+xbmcgui.Dialog().ok('', 'This is a console-only addon')
diff --git a/packages/addons/tools/multimedia-tools/changelog.txt b/packages/addons/tools/multimedia-tools/changelog.txt
index 51b810aade..d609040653 100644
--- a/packages/addons/tools/multimedia-tools/changelog.txt
+++ b/packages/addons/tools/multimedia-tools/changelog.txt
@@ -1,3 +1,9 @@
+2
+- libmediainfo: update to 23.07
+- libzen: update to 0.4.41
+- mediainfo: update to 23.07
+- mpv-drmprime: update to 0.36.0
+
 1
 - mpg123: update to 1.31.3
 - mpv-drmprime: update to 0.35.1
diff --git a/packages/addons/tools/multimedia-tools/package.mk b/packages/addons/tools/multimedia-tools/package.mk
index 9b39cb0ffb..5002a1df58 100644
--- a/packages/addons/tools/multimedia-tools/package.mk
+++ b/packages/addons/tools/multimedia-tools/package.mk
@@ -3,7 +3,7 @@
 
 PKG_NAME="multimedia-tools"
 PKG_VERSION="1.0"
-PKG_REV="1"
+PKG_REV="2"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="https://libreelec.tv"
diff --git a/packages/addons/tools/system-tools/changelog.txt b/packages/addons/tools/system-tools/changelog.txt
index 9863d92238..fd1e9a7826 100644
--- a/packages/addons/tools/system-tools/changelog.txt
+++ b/packages/addons/tools/system-tools/changelog.txt
@@ -1,3 +1,11 @@
+4
+- bottom: update to 0.9.4
+- libmtp: update to 1.1.21
+- mc: update to 4.8.30
+- screen: update to 4.9.1
+- smartmontools: update to 7.4
+- stress-ng: update to 0.16.04
+
 3
 - file: update to 5.45
 - libssh2: dont build shared library
diff --git a/packages/addons/tools/system-tools/package.mk b/packages/addons/tools/system-tools/package.mk
index f7c2397afc..13e8dc4efe 100644
--- a/packages/addons/tools/system-tools/package.mk
+++ b/packages/addons/tools/system-tools/package.mk
@@ -3,7 +3,7 @@
 
 PKG_NAME="system-tools"
 PKG_VERSION="1.0"
-PKG_REV="3"
+PKG_REV="4"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="https://libreelec.tv"
diff --git a/packages/graphics/bcm2835-driver/package.mk b/packages/graphics/bcm2835-driver/package.mk
index 105de7f03d..7676ae51f5 100644
--- a/packages/graphics/bcm2835-driver/package.mk
+++ b/packages/graphics/bcm2835-driver/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="bcm2835-driver"
-PKG_VERSION="543692d23dff7075915bc9c7e34abb3fe28e1c46"
-PKG_SHA256="838aa79b842fc10030c6a8b8d7f8dc6b6b08ed7ac762a5488d3d64f1bc51a4ec"
+PKG_VERSION="fdb9eafae4b83e553593937eae8e77b0193903c3"
+PKG_SHA256="ae590baa29a507fa50b5beae46643519a2d2e012945668cfc7393f9275793c78"
 PKG_LICENSE="nonfree"
 PKG_SITE="http://www.broadcom.com"
 PKG_URL="${DISTRO_SRC}/${PKG_NAME}-${PKG_VERSION}.tar.xz"
diff --git a/packages/graphics/mesa/package.mk b/packages/graphics/mesa/package.mk
index 6fec8873f8..5bca413245 100644
--- a/packages/graphics/mesa/package.mk
+++ b/packages/graphics/mesa/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="mesa"
-PKG_VERSION="23.1.7"
-PKG_SHA256="409641eadf0ed1c7794797a6f5a0b0195b5580b282166e5ec5629c6bcda6acd3"
+PKG_VERSION="23.2.1"
+PKG_SHA256="64de0616fc2d801f929ab1ac2a4f16b3e2783c4309a724c8a259b20df8bbc1cc"
 PKG_LICENSE="OSS"
 PKG_SITE="http://www.mesa3d.org/"
 PKG_URL="https://mesa.freedesktop.org/archive/mesa-${PKG_VERSION}.tar.xz"
@@ -31,6 +31,12 @@ PKG_MESON_OPTS_TARGET="-Dgallium-drivers=${GALLIUM_DRIVERS// /,} \
                        -Dselinux=false \
                        -Dosmesa=false"
 
+# Per-device meson switches: RPi5 gets -Ddraw-use-llvm=false, everything else an empty dri-drivers list
+case "${DEVICE}" in
+  RPi5) PKG_MESON_OPTS_TARGET+=" -Ddraw-use-llvm=false" ;;
+  *)    PKG_MESON_OPTS_TARGET+=" -Ddri-drivers=" ;;
+esac
+
 if [ "${DISPLAYSERVER}" = "x11" ]; then
   PKG_DEPENDS_TARGET+=" xorgproto libXext libXdamage libXfixes libXxf86vm libxcb libX11 libxshmfence libXrandr"
   export X11_INCLUDES=
diff --git a/packages/linux/package.mk b/packages/linux/package.mk
index 6241e5bec6..1e7c5905b3 100644
--- a/packages/linux/package.mk
+++ b/packages/linux/package.mk
@@ -23,8 +23,8 @@ case "${LINUX}" in
     PKG_PATCH_DIRS="default"
     ;;
   raspberrypi)
-    PKG_VERSION="431319d91b8584c9f28b195ab9a97d7e78905aeb" # 6.1.42
-    PKG_SHA256="1608f0741e02416e9af9cd0232e21a60c33db94dd0681d8f26909c2a7216038c"
+    PKG_VERSION="3fdb0eb8be5803a16fc308b8441cdcdeafbb944e" # 6.1.62
+    PKG_SHA256="a44b7d642fc2d80a8cfd05d0b461ecbff019680932754b3f9190bb22be9b8fe2"
     PKG_URL="https://github.com/raspberrypi/linux/archive/${PKG_VERSION}.tar.gz"
     PKG_SOURCE_NAME="linux-${LINUX}-${PKG_VERSION}.tar.gz"
     ;;
@@ -45,8 +45,8 @@ case "${LINUX}" in
    PKG_GIT_CLONE_BRANCH="sdm845-5.19.16"
    ;;
   *)
-    PKG_VERSION="6.1.42"
-    PKG_SHA256="aaf8261b551c8b76b81eab8780b446e88cea4d551ae517ac3a9b2dbdbd381ed3"
+    PKG_VERSION="6.1.58"
+    PKG_SHA256="ce987ed3d2f640b3a2a62a0a8573d538a36dfd3cc31e2d7a239ce5a16c1c21ad"
     PKG_URL="https://www.kernel.org/pub/linux/kernel/v${PKG_VERSION/.*/}.x/${PKG_NAME}-${PKG_VERSION}.tar.xz"
     PKG_PATCH_DIRS="default"
     ;;
diff --git a/packages/linux/patches/default/linux-022-ASoC-hdmi-codec-Fix-broken-channel-map-reporting.patch b/packages/linux/patches/default/linux-022-ASoC-hdmi-codec-Fix-broken-channel-map-reporting.patch
new file mode 100644
index 0000000000..3349cf4e1a
--- /dev/null
+++ b/packages/linux/patches/default/linux-022-ASoC-hdmi-codec-Fix-broken-channel-map-reporting.patch
@@ -0,0 +1,57 @@
+From 5e4400b24fc1f8ad41bccb6a6bdb54b961526556 Mon Sep 17 00:00:00 2001
+From: Matthias Reichl <hias@horus.com>
+Date: Thu, 7 Sep 2023 20:33:25 +0200
+Subject: [PATCH] ASoC: hdmi-codec: Fix broken channel map reporting
+
+Commit 4e0871333661 ("ASoC: hdmi-codec: fix channel info for
+compressed formats") accidentally changed hcp->chmap_idx from
+ca_id, the CEA channel allocation ID, to idx, the index to
+the table of channel mappings ordered by preference.
+
+This resulted in wrong channel maps being reported to userspace,
+eg for 5.1 "FL,FR,LFE,FC" was reported instead of the expected
+"FL,FR,LFE,FC,RL,RR":
+
+~ # speaker-test -c 6 -t sine
+...
+ 0 - Front Left
+ 3 - Front Center
+ 1 - Front Right
+ 2 - LFE
+ 4 - Unknown
+ 5 - Unknown
+
+~ # amixer cget iface=PCM,name='Playback Channel Map' | grep ': values'
+  : values=3,4,8,7,0,0,0,0
+
+Switch this back to ca_id in case of PCM audio so the correct channel
+map is reported again and set it to HDMI_CODEC_CHMAP_IDX_UNKNOWN in
+case of non-PCM audio so the PCM channel map control returns "Unknown"
+channels (value 0).
+
+Fixes: 4e0871333661 ("ASoC: hdmi-codec: fix channel info for compressed formats")
+Cc: stable@vger.kernel.org
+Signed-off-by: Matthias Reichl <hias@horus.com>
+---
+ sound/soc/codecs/hdmi-codec.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/sound/soc/codecs/hdmi-codec.c b/sound/soc/codecs/hdmi-codec.c
+index 13689e718d36f..09eef6042aad6 100644
+--- a/sound/soc/codecs/hdmi-codec.c
++++ b/sound/soc/codecs/hdmi-codec.c
+@@ -531,7 +531,10 @@ static int hdmi_codec_fill_codec_params(struct snd_soc_dai *dai,
+ 	hp->sample_rate = sample_rate;
+ 	hp->channels = channels;
+ 
+-	hcp->chmap_idx = idx;
++	if (pcm_audio)
++		hcp->chmap_idx = ca_id;
++	else
++		hcp->chmap_idx = HDMI_CODEC_CHMAP_IDX_UNKNOWN;
+ 
+ 	return 0;
+ }
+-- 
+2.39.2
+
diff --git a/packages/linux/patches/raspberrypi/linux-062-imon_pad_ignore_diagonal.patch b/packages/linux/patches/raspberrypi/linux-062-imon_pad_ignore_diagonal.patch
deleted file mode 100644
index 677de3ed7f..0000000000
--- a/packages/linux/patches/raspberrypi/linux-062-imon_pad_ignore_diagonal.patch
+++ /dev/null
@@ -1,21 +0,0 @@
-diff -Naur linux-3.16.1/drivers/media/rc/imon.c linux-3.16.1.patch/drivers/media/rc/imon.c
---- linux-3.16.1/drivers/media/rc/imon.c	2014-08-14 04:36:35.000000000 +0200
-+++ linux-3.16.1.patch/drivers/media/rc/imon.c	2014-08-15 13:57:16.587620642 +0200
-@@ -1344,6 +1344,17 @@
- 			}
- 		} else {
- 			/*
-+			 * For users without stabilized, just ignore any value getting
-+			 * to close to the diagonal.
-+			 */
-+			if ((abs(rel_y) < 2 && abs(rel_x) < 2) ||
-+				abs(abs(rel_y) - abs(rel_x)) < 2 ) {
-+				spin_lock_irqsave(&ictx->kc_lock, flags);
-+				ictx->kc = KEY_UNKNOWN;
-+				spin_unlock_irqrestore(&ictx->kc_lock, flags);
-+				return;
-+			}
-+			/*
- 			 * Hack alert: instead of using keycodes, we have
- 			 * to use hard-coded scancodes here...
- 			 */
diff --git a/packages/linux/patches/raspberrypi/linux-999.02-0001-pm-disable-async-suspend-resume-by-default.patch b/packages/linux/patches/raspberrypi/linux-999.02-0001-pm-disable-async-suspend-resume-by-default.patch
deleted file mode 100644
index 16ac49bee6..0000000000
--- a/packages/linux/patches/raspberrypi/linux-999.02-0001-pm-disable-async-suspend-resume-by-default.patch
+++ /dev/null
@@ -1,25 +0,0 @@
-From c314d9af9d774c052bea324e1a140ccdba0ca070 Mon Sep 17 00:00:00 2001
-From: Stefan Saraev <stefan@saraev.ca>
-Date: Tue, 8 Apr 2014 14:02:53 +0300
-Subject: [PATCH] pm: disable async suspend/resume by default
-
----
- kernel/power/main.c |    2 +-
- 1 files changed, 1 insertions(+), 1 deletions(-)
-
-diff --git a/kernel/power/main.c b/kernel/power/main.c
-index 1d1bf63..361db93 100644
---- a/kernel/power/main.c
-+++ b/kernel/power/main.c
-@@ -46,7 +46,7 @@ int pm_notifier_call_chain(unsigned long val)
- }
- 
- /* If set, devices may be suspended and resumed asynchronously. */
--int pm_async_enabled = 1;
-+int pm_async_enabled = 0;
- 
- static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
- 			     char *buf)
--- 
-1.7.2.5
-
diff --git a/packages/mediacenter/LibreELEC-settings/package.mk b/packages/mediacenter/LibreELEC-settings/package.mk
index c17a1f41d5..3e8f0e16d8 100644
--- a/packages/mediacenter/LibreELEC-settings/package.mk
+++ b/packages/mediacenter/LibreELEC-settings/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="LibreELEC-settings"
-PKG_VERSION="9a334c0857fe5ccf84af272f42dc6f6cd5c72e4b"
-PKG_SHA256="71be076033ae4bcb9012a12c2fc47b0805b0e40db2e812e19613643bbcba978c"
+PKG_VERSION="b920d5d83a8a7445d121d2f920169444111bf93c"
+PKG_SHA256="d8147068b6172250d98d41fafd7d6dbaa286074932b537214bf0dab95fe9e99a"
 PKG_LICENSE="GPL"
 PKG_SITE="https://libreelec.tv"
 PKG_URL="https://github.com/LibreELEC/service.libreelec.settings/archive/${PKG_VERSION}.tar.gz"
diff --git a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
index f0858dcf47..0d3068821e 100644
--- a/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/inputstream.adaptive/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="inputstream.adaptive"
-PKG_VERSION="20.3.11-Nexus"
-PKG_SHA256="ed266d2a51efcd0952cfacc8549350282dce07f7c0e885eeb41d662f123e12a6"
+PKG_VERSION="20.3.14-Nexus"
+PKG_SHA256="59573a0d97bb665e0ada35b44f77e9bf9232adc669d0d44beccf727145d36aff"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
index 4ad6338385..a9c24da1b1 100644
--- a/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/peripheral.joystick/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="peripheral.joystick"
-PKG_VERSION="20.1.10-Nexus"
-PKG_SHA256="b72277358df77ed79a0e7f3ae7e9799d02692fb30408cf6e5325ce7e5a34f597"
+PKG_VERSION="20.1.13-Nexus"
+PKG_SHA256="9fabd0cbb54f6f4acfa16a5fa2c13e37d121bb0774e0b62ad3604a72c99b95e5"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
index 226a0bc8d8..ad59e7fd5b 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.hts/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.hts"
-PKG_VERSION="20.6.2-Nexus"
-PKG_SHA256="e77bd87f1f4d1abc06e32d0347a0bb635bc129bec43a07864cb8a9e6b0e4d374"
+PKG_VERSION="20.6.5-Nexus"
+PKG_SHA256="bd58fc85543447f918ee567192c87a3beb3f6e2c3fc116abe1f584514d202ada"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
index 2a65a47d54..8f22e1c472 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.iptvsimple/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.iptvsimple"
-PKG_VERSION="20.11.0-Nexus"
-PKG_SHA256="c2014b11dd928a1d4789279d7f3ce25af8af4047194e8406f9dfd99d16fe2ee2"
+PKG_VERSION="20.13.0-Nexus"
+PKG_SHA256="9edf800d7d5e755c92e9e8f6d3771a74cf3fec23b3aaec5b8535f1a579941a5b"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
index 46eaf927f6..bb90b2652d 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.mythtv/package.mk
@@ -3,9 +3,9 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.mythtv"
-PKG_VERSION="20.3.2-Nexus"
-PKG_SHA256="b1ad428bec882d3e852240cbef2378803635b530545a08421ff3baf0611a29e7"
-PKG_REV="2"
+PKG_VERSION="20.5.6-Nexus"
+PKG_SHA256="321559f9f46a2588bdcfe9be6d6e7439911e548a92e7820dedde6cabccbe72fd"
+PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="https://github.com/janbar/pvr.mythtv"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
index 299412a646..37606931c9 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.nextpvr/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.nextpvr"
-PKG_VERSION="20.4.2-Nexus"
-PKG_SHA256="1ce85447426ddf6d443a5e3444145a2d3af65ce73d9fb583e42cd8afc9d599a5"
+PKG_VERSION="20.4.3-Nexus"
+PKG_SHA256="752dff532a277797f3fefc1ced7fea6efb8d92982d9040c4080c1e6dbab203a0"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk
index 95b5ecfe32..5b99456089 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.plutotv/package.mk
@@ -2,9 +2,9 @@
 # Copyright (C) 2021-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.plutotv"
-PKG_VERSION="20.3.0-Nexus"
-PKG_SHA256="d38a6bf4debc442849d01faedadcccb1b07debe850cd3c9a5789508233d22256"
-PKG_REV="5"
+PKG_VERSION="20.3.1a-Nexus"
+PKG_SHA256="11505556200029a48a293e97f94e0469a5f78580d0e56d5d0a1da05d61f0f5b5"
+PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
 PKG_SITE="https://github.com/kodi-pvr/pvr.plutotv"
diff --git a/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk b/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk
index 5a9aeb7b8f..42a2278b09 100644
--- a/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk
+++ b/packages/mediacenter/kodi-binary-addons/pvr.waipu/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="pvr.waipu"
-PKG_VERSION="20.9.0-Nexus"
-PKG_SHA256="3a0a1dffe1bb8711dd6747b02a51aee6a0bce40ca6822cc2ab7f04b4b5acb82f"
+PKG_VERSION="20.10.2-Nexus"
+PKG_SHA256="2326c3ed0e57ef8020c1041ea6f25fa325c619588c24c71b5963d8dda1c97604"
 PKG_REV="1"
 PKG_ARCH="any"
 PKG_LICENSE="GPL"
diff --git a/packages/mediacenter/kodi/config/70-libinput-ignore-power-button.rules b/packages/mediacenter/kodi/config/70-libinput-ignore-power-button.rules
new file mode 100644
index 0000000000..775d814c1f
--- /dev/null
+++ b/packages/mediacenter/kodi/config/70-libinput-ignore-power-button.rules
@@ -0,0 +1,15 @@
+# Ignore power button input devices in libinput so logind can handle them
+ACTION=="remove", GOTO="end"
+SUBSYSTEM!="input", GOTO="end"
+KERNEL!="event*", GOTO="end"
+
+IMPORT{parent}="KEY"
+
+# match devices that only generate KEY_POWER (code 116) events
+ENV{KEY}=="10000000000000 0", ENV{LIBINPUT_IGNORE_DEVICE}="1"
+
+# 32bit systems report the bitmap in 32bit chunks
+ENV{KEY}=="100000 0 0 0", ENV{LIBINPUT_IGNORE_DEVICE}="1"
+
+LABEL="end"
+
diff --git a/packages/mediacenter/kodi/package.mk b/packages/mediacenter/kodi/package.mk
index 23236911d8..b94b348c21 100644
--- a/packages/mediacenter/kodi/package.mk
+++ b/packages/mediacenter/kodi/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="kodi"
-PKG_VERSION="20.2-Nexus"
-PKG_SHA256="4e81abf81172812bc8891f69a7a80a2b846298cecaae7b5009725e28a3040c23"
+PKG_VERSION="618d1e35d89f1c49c2a37f5d233319f3f3bbe01b"
+PKG_SHA256="2831ca6c004dde11105c14337b58e89b24cd735596510f1ab85d15f6234a835c"
 PKG_LICENSE="GPL"
 PKG_SITE="http://www.kodi.tv"
 PKG_URL="https://github.com/xbmc/xbmc/archive/${PKG_VERSION}.tar.gz"
@@ -408,6 +408,12 @@ post_makeinstall_target() {
   mkdir -p ${INSTALL}/usr/cache/libreelec
     cp ${PKG_DIR}/config/network_wait ${INSTALL}/usr/cache/libreelec
 
+  # GBM: install udev rule to ignore the power button in libinput/kodi so logind can handle it
+  if [ "${DISPLAYSERVER}" = "no" ]; then
+    mkdir -p ${INSTALL}/usr/lib/udev/rules.d/
+    cp ${PKG_DIR}/config/70-libinput-ignore-power-button.rules ${INSTALL}/usr/lib/udev/rules.d/
+  fi
+
   # update addon manifest
   ADDON_MANIFEST=${INSTALL}/usr/share/kodi/system/addon-manifest.xml
   xmlstarlet ed -L -d "/addons/addon[text()='service.xbmc.versioncheck']" ${ADDON_MANIFEST}
diff --git a/packages/mediacenter/kodi/patches/kodi-995.21-keymaps-change-remote-poweroff-action-to-show-shutdo.patch b/packages/mediacenter/kodi/patches/kodi-995.21-keymaps-change-remote-poweroff-action-to-show-shutdo.patch
new file mode 100644
index 0000000000..fe611fb14f
--- /dev/null
+++ b/packages/mediacenter/kodi/patches/kodi-995.21-keymaps-change-remote-poweroff-action-to-show-shutdo.patch
@@ -0,0 +1,26 @@
+From 5604be6a6701e0bd68cb36fadb05cecba57f7887 Mon Sep 17 00:00:00 2001
+From: Matthias Reichl <hias@horus.com>
+Date: Fri, 22 Sep 2023 23:41:51 +0200
+Subject: [PATCH] keymaps: change remote poweroff action to show shutdown menu
+
+Signed-off-by: Matthias Reichl <hias@horus.com>
+---
+ system/keymaps/remote.xml | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/system/keymaps/remote.xml b/system/keymaps/remote.xml
+index c122b99188..baebf679c0 100644
+--- a/system/keymaps/remote.xml
++++ b/system/keymaps/remote.xml
+@@ -50,7 +50,7 @@
+       <volumeplus>VolumeUp</volumeplus>
+       <volumeminus>VolumeDown</volumeminus>
+       <mute>Mute</mute>
+-      <power>ShutDown()</power>
++      <power>ActivateWindow(ShutdownMenu)</power>
+       <myvideo>ActivateWindow(Videos)</myvideo>
+       <mymusic>ActivateWindow(Music)</mymusic>
+       <mypictures>ActivateWindow(Pictures)</mypictures>
+-- 
+2.39.2
+
diff --git a/packages/multimedia/ffmpeg/package.mk b/packages/multimedia/ffmpeg/package.mk
index 80a7390f88..3c155232c5 100644
--- a/packages/multimedia/ffmpeg/package.mk
+++ b/packages/multimedia/ffmpeg/package.mk
@@ -64,6 +64,13 @@ if [ "${V4L2_SUPPORT}" = "yes" -a ! "${DEVICE}" = "Switch" ]; then
 
-  if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" -o "${DEVICE}" = "iMX8" -o "${DEVICE}" = "RPi4" ]; then
+  if [ "${PROJECT}" = "Allwinner" -o "${PROJECT}" = "Rockchip" -o "${DEVICE}" = "iMX8" ]; then
     PKG_V4L2_REQUEST="yes"
+  elif [ "${PROJECT}" = "RPi" ] && [ "${DEVICE}" = "RPi4" -o "${DEVICE}" = "RPi5" ]; then
+    # RPi4/RPi5: keep the V4L2 request API but turn off the h264/mpeg2/vp8/vp9 request hwaccels
+    PKG_V4L2_REQUEST="yes"
+    PKG_FFMPEG_HWACCEL="--disable-hwaccel=h264_v4l2request \
+                        --disable-hwaccel=mpeg2_v4l2request \
+                        --disable-hwaccel=vp8_v4l2request \
+                        --disable-hwaccel=vp9_v4l2request"
   else
     PKG_V4L2_REQUEST="no"
   fi
diff --git a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
index 27c1326476..cd84890c43 100755
--- a/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
+++ b/packages/multimedia/ffmpeg/patches/rpi/ffmpeg-001-rpi.patch
@@ -1,53 +1,70710 @@
-From 504df93cfe5416b394755e79b7b81ee0119cf09c Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 26 Apr 2021 12:34:50 +0100
-Subject: [PATCH 001/136] Add pi configs and scripts
-
----
- pi-util/BUILD.txt                  |  59 ++++++++
- pi-util/NOTES.txt                  |  69 +++++++++
- pi-util/TESTMESA.txt               |  82 +++++++++++
- pi-util/clean_usr_libs.sh          |  26 ++++
- pi-util/conf_arm64_native.sh       |  45 ++++++
- pi-util/conf_h265.2016.csv         | 195 ++++++++++++++++++++++++++
- pi-util/conf_h265.2016_HEVC_v1.csv | 147 ++++++++++++++++++++
- pi-util/conf_h265.csv              | 144 +++++++++++++++++++
- pi-util/conf_native.sh             | 108 +++++++++++++++
- pi-util/ffconf.py                  | 215 +++++++++++++++++++++++++++++
- pi-util/ffperf.py                  | 128 +++++++++++++++++
- pi-util/genpatch.sh                |  35 +++++
- pi-util/make_array.py              |  23 +++
- pi-util/mkinst.sh                  |   5 +
- pi-util/patkodi.sh                 |   9 ++
- pi-util/perfcmp.py                 | 101 ++++++++++++++
- pi-util/qem.sh                     |   9 ++
- pi-util/v3dusage.py                | 128 +++++++++++++++++
- 18 files changed, 1528 insertions(+)
- create mode 100644 pi-util/BUILD.txt
- create mode 100644 pi-util/NOTES.txt
- create mode 100644 pi-util/TESTMESA.txt
- create mode 100755 pi-util/clean_usr_libs.sh
- create mode 100644 pi-util/conf_arm64_native.sh
- create mode 100644 pi-util/conf_h265.2016.csv
- create mode 100644 pi-util/conf_h265.2016_HEVC_v1.csv
- create mode 100644 pi-util/conf_h265.csv
- create mode 100755 pi-util/conf_native.sh
- create mode 100755 pi-util/ffconf.py
- create mode 100755 pi-util/ffperf.py
- create mode 100755 pi-util/genpatch.sh
- create mode 100755 pi-util/make_array.py
- create mode 100755 pi-util/mkinst.sh
- create mode 100644 pi-util/patkodi.sh
- create mode 100755 pi-util/perfcmp.py
- create mode 100755 pi-util/qem.sh
- create mode 100755 pi-util/v3dusage.py
-
+diff --git a/configure b/configure
+index 4ba72bf84b..f2fc33e89b 100755
+--- a/configure
++++ b/configure
+@@ -207,6 +207,7 @@ External library support:
+   --disable-bzlib          disable bzlib [autodetect]
+   --disable-coreimage      disable Apple CoreImage framework [autodetect]
+   --enable-chromaprint     enable audio fingerprinting with chromaprint [no]
++  --disable-epoxy          disable epoxy [autodetect]
+   --enable-frei0r          enable frei0r video filtering [no]
+   --enable-gcrypt          enable gcrypt, needed for rtmp(t)e support
+                            if openssl, librtmp or gmp is not used [no]
+@@ -279,6 +280,7 @@ External library support:
+                            if openssl, gnutls or mbedtls is not used [no]
+   --enable-libtwolame      enable MP2 encoding via libtwolame [no]
+   --enable-libuavs3d       enable AVS3 decoding via libuavs3d [no]
++  --disable-libudev        disable libudev [autodetect]
+   --enable-libv4l2         enable libv4l2/v4l-utils [no]
+   --enable-libvidstab      enable video stabilization using vid.stab [no]
+   --enable-libvmaf         enable vmaf filter via libvmaf [no]
+@@ -340,12 +342,17 @@ External library support:
+   --enable-libmfx          enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
+   --enable-libnpp          enable Nvidia Performance Primitives-based code [no]
+   --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
++  --enable-rpi             enable other rpi specific stuff [no]
++  --enable-sand            enable sand video formats [rpi]
++  --enable-vout-drm        enable the vout_drm module - for internal testing only [no]
++  --enable-vout-egl        enable the vout_egl module - for internal testing only [no]
+   --disable-nvdec          disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
+   --disable-nvenc          disable Nvidia video encoding code [autodetect]
+   --enable-omx             enable OpenMAX IL code [no]
+   --enable-omx-rpi         enable OpenMAX IL code for Raspberry Pi [no]
+   --enable-rkmpp           enable Rockchip Media Process Platform code [no]
+   --disable-v4l2-m2m       disable V4L2 mem2mem code [autodetect]
++  --enable-v4l2-request    enable V4L2 request API code [no]
+   --disable-vaapi          disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
+   --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
+   --disable-videotoolbox   disable VideoToolbox code [autodetect]
+@@ -1703,7 +1710,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
+     avfoundation
+     bzlib
+     coreimage
++    epoxy
+     iconv
++    libudev
+     libxcb
+     libxcb_shm
+     libxcb_shape
+@@ -1868,7 +1877,10 @@ HWACCEL_LIBRARY_LIST="
+     mmal
+     omx
+     opencl
++    v4l2_request
+     vulkan
++    rpi4_8
++    rpi4_10
+ "
+ 
+ DOCUMENT_LIST="
+@@ -1884,12 +1896,17 @@ FEATURE_LIST="
+     gray
+     hardcoded_tables
+     omx_rpi
++    rpi
+     runtime_cpudetect
+     safe_bitstream_reader
++    sand
+     shared
+     small
+     static
+     swscale_alpha
++    vout_drm
++    vout_egl
++    v4l2_req_hevc_vx
+ "
+ 
+ # this list should be kept in linking order
+@@ -1930,6 +1947,7 @@ SUBSYSTEM_LIST="
+     pixelutils
+     network
+     rdft
++    rpi
+ "
+ 
+ # COMPONENT_LIST needs to come last to ensure correct dependency checking
+@@ -2416,9 +2434,11 @@ CONFIG_EXTRA="
+     rangecoder
+     riffdec
+     riffenc
++    rpi
+     rtpdec
+     rtpenc_chain
+     rv34dsp
++    sand
+     scene_sad
+     sinewin
+     snappy
+@@ -2750,6 +2770,8 @@ hap_decoder_select="snappy texturedsp"
+ hap_encoder_deps="libsnappy"
+ hap_encoder_select="texturedspenc"
+ hevc_decoder_select="atsc_a53 bswapdsp cabac golomb hevcparse videodsp"
++hevc_rpi_decoder_deps="rpi"
++hevc_rpi_decoder_select="hevc_decoder sand"
+ huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
+ huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
+ hymt_decoder_select="huffyuv_decoder"
+@@ -2920,6 +2942,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext"
+ dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
+ ffnvcodec_deps_any="libdl LoadLibrary"
+ nvdec_deps="ffnvcodec"
++v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
+ vaapi_x11_deps="xlib"
+ videotoolbox_hwaccel_deps="videotoolbox pthreads"
+ videotoolbox_hwaccel_extralibs="-framework QuartzCore"
+@@ -2961,6 +2984,12 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
+ hevc_dxva2_hwaccel_select="hevc_decoder"
+ hevc_nvdec_hwaccel_deps="nvdec"
+ hevc_nvdec_hwaccel_select="hevc_decoder"
++hevc_v4l2request_hwaccel_deps="v4l2_request"
++hevc_v4l2request_hwaccel_select="hevc_decoder"
++hevc_rpi4_10_hwaccel_deps="rpi"
++hevc_rpi4_10_hwaccel_select="hevc_decoder"
++hevc_rpi4_8_hwaccel_deps="rpi"
++hevc_rpi4_8_hwaccel_select="hevc_decoder"
+ hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
+ hevc_vaapi_hwaccel_select="hevc_decoder"
+ hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
+@@ -3438,8 +3467,13 @@ sndio_indev_deps="sndio"
+ sndio_outdev_deps="sndio"
+ v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_indev_suggest="libv4l2"
++v4l2_outdev_deps="libdrm"
+ v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
+ v4l2_outdev_suggest="libv4l2"
++vout_drm_outdev_deps="libdrm"
++vout_egl_outdev_deps="xlib epoxy"
++vout_rpi_outdev_deps="rpi"
++vout_rpi_outdev_select="sand"
+ vfwcap_indev_deps="vfw32 vfwcap_defines"
+ xcbgrab_indev_deps="libxcb"
+ xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
+@@ -3658,6 +3692,7 @@ tonemap_vaapi_filter_deps="vaapi VAProcFilterParameterBufferHDRToneMapping"
+ tonemap_opencl_filter_deps="opencl const_nan"
+ transpose_opencl_filter_deps="opencl"
+ transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
++unsand_filter_select="sand"
+ unsharp_opencl_filter_deps="opencl"
+ uspp_filter_deps="gpl avcodec"
+ vaguedenoiser_filter_deps="gpl"
+@@ -6155,6 +6190,12 @@ check_func_headers glob.h glob
+ enabled xlib &&
+     check_lib xlib "X11/Xlib.h X11/extensions/Xvlib.h" XvGetPortAttribute -lXv -lX11 -lXext
+ 
++enabled libudev &&
++    check_pkg_config libudev libudev libudev.h udev_new
++
++enabled epoxy &&
++    check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
++
+ check_headers direct.h
+ check_headers dirent.h
+ check_headers dxgidebug.h
+@@ -6492,11 +6533,12 @@ enabled mbedtls           && { check_pkg_config mbedtls mbedtls mbedtls/x509_crt
+                                check_lib mbedtls mbedtls/ssl.h mbedtls_ssl_init -lmbedtls -lmbedx509 -lmbedcrypto ||
+                                die "ERROR: mbedTLS not found"; }
+ enabled mediacodec        && { enabled jni || die "ERROR: mediacodec requires --enable-jni"; }
+-enabled mmal              && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
++( enabled rpi ||
++  enabled mmal )          && { check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host ||
+                                { ! enabled cross_compile &&
+                                  add_cflags -isystem/opt/vc/include/ -isystem/opt/vc/include/interface/vmcs_host/linux -isystem/opt/vc/include/interface/vcos/pthreads -fgnu89-inline &&
+                                  add_ldflags -L/opt/vc/lib/ &&
+-                                 check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host; } ||
++                                 check_lib mmal interface/mmal/mmal.h mmal_port_connect -lmmal_core -lmmal_util -lmmal_vc_client -lbcm_host -lvcos -lvcsm -lvchostif -lvchiq_arm; } ||
+                                die "ERROR: mmal not found" &&
+                                check_func_headers interface/mmal/mmal.h "MMAL_PARAMETER_VIDEO_MAX_NUM_CALLBACKS"; }
+ enabled openal            && { { for al_extralibs in "${OPENAL_LIBS}" "-lopenal" "-lOpenAL32"; do
+@@ -6537,8 +6579,16 @@ enabled rkmpp             && { require_pkg_config rkmpp rockchip_mpp  rockchip/r
+                                { enabled libdrm ||
+                                  die "ERROR: rkmpp requires --enable-libdrm"; }
+                              }
++enabled v4l2_request      && { enabled libdrm ||
++                               die "ERROR: v4l2-request requires --enable-libdrm"; } &&
++                             { enabled libudev ||
++                               die "ERROR: v4l2-request requires libudev"; }
+ enabled vapoursynth       && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
+ 
++enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
++
++enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
++                    { enabled xlib  || die "ERROR: vout_egl requires xlib"; }
+ 
+ if enabled gcrypt; then
+     GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
+@@ -6618,6 +6668,10 @@ if enabled v4l2_m2m; then
+     check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
+ fi
+ 
++check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
++check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
++disable v4l2_req_hevc_vx
++
+ check_headers sys/videoio.h
+ test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
+ 
+@@ -7105,6 +7159,9 @@ check_deps $CONFIG_LIST       \
+ enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86"
+ enabled avresample && warn "Building with deprecated library libavresample"
+ 
++# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done
++enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx
++
+ case $target_os in
+ haiku)
+     disable memalign
+diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
+index dec012a299..8aa13007f9 100644
+--- a/fftools/ffmpeg.c
++++ b/fftools/ffmpeg.c
+@@ -2189,8 +2189,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame)
+                        ifilter->channel_layout != frame->channel_layout;
+         break;
+     case AVMEDIA_TYPE_VIDEO:
+-        need_reinit |= ifilter->width  != frame->width ||
+-                       ifilter->height != frame->height;
++        need_reinit |= ifilter->width  != av_frame_cropped_width(frame) ||
++                       ifilter->height != av_frame_cropped_height(frame);
+         break;
+     }
+ 
+@@ -2201,6 +2201,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame)
+         (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
+         need_reinit = 1;
+ 
++    if (no_cvt_hw && fg->graph)
++        need_reinit = 0;
++
+     if (need_reinit) {
+         ret = ifilter_parameters_from_frame(ifilter, frame);
+         if (ret < 0)
+@@ -2469,8 +2472,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_
+         decoded_frame->top_field_first = ist->top_field_first;
+ 
+     ist->frames_decoded++;
+-
+-    if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
++    if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
+         err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
+         if (err < 0)
+             goto fail;
+@@ -2674,7 +2676,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo
+         case AVMEDIA_TYPE_VIDEO:
+             ret = decode_video    (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt,
+                                    &decode_failed);
+-            if (!repeating || !pkt || got_output) {
++            // Pi: when no_cvt_hw is set, got_output alone must not advance the DTS.
++            // V4L2 H264 decode has long latency and sometimes emits a long run of
++            // output frames without consuming input; incrementing DTS then is wrong.
++            // The original condition may be correct in other cases, so the change
++            // is limited to the no_cvt_hw case that is known to cause problems.
++            if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
+                 if (pkt && pkt->duration) {
+                     duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
+                 } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
+@@ -2898,6 +2905,16 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat
+         } else {
+             const HWAccel *hwaccel = NULL;
+             int i;
++
++            if (no_cvt_hw) { /* -no_cvt_hw: try the codec's first hw config before the hwaccels[] table */
++                config = avcodec_get_hw_config(s->codec, 0); /* NULL when the codec exposes no hw config */
++                if (config && config->methods == AV_CODEC_HW_CONFIG_METHOD_INTERNAL) {
++                    av_log(s, AV_LOG_DEBUG, "no_cvt_hw so accepting pix_fmt %d with codec internal hwaccel\n", *p);
++                    ist->hwaccel_pix_fmt = *p; /* accept the offered format as-is and stop scanning */
++                    break;
++                }
++            }
++
+             for (i = 0; hwaccels[i].name; i++) {
+                 if (hwaccels[i].pix_fmt == *p) {
+                     hwaccel = &hwaccels[i];
+@@ -2993,6 +3010,15 @@ static int init_input_stream(int ist_index, char *error, int error_len)
+             return ret;
+         }
+ 
++#if CONFIG_HEVC_RPI_DECODER
++        ret = -1;
++        if (strcmp(codec->name, "hevc_rpi") == 0 &&
++            (ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
++            ist->dec = codec = avcodec_find_decoder_by_name("hevc");
++            av_log(NULL, AV_LOG_INFO, "Failed to open hevc_rpi - trying hevc\n");
++        }
++        if (ret < 0)
++#endif
+         if ((ret = avcodec_open2(ist->dec_ctx, codec, &ist->decoder_opts)) < 0) {
+             if (ret == AVERROR_EXPERIMENTAL)
+                 abort_codec_experimental(codec, 0);
+diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
+index 606f2afe0c..448cd2e009 100644
+--- a/fftools/ffmpeg.h
++++ b/fftools/ffmpeg.h
+@@ -61,6 +61,7 @@ enum HWAccelID {
+     HWACCEL_GENERIC,
+     HWACCEL_VIDEOTOOLBOX,
+     HWACCEL_QSV,
++    HWACCEL_RPI,
+ };
+ 
+ typedef struct HWAccel {
+@@ -611,6 +612,7 @@ extern int video_sync_method;
+ extern float frame_drop_threshold;
+ extern int do_benchmark;
+ extern int do_benchmark_all;
++extern int no_cvt_hw;
+ extern int do_deinterlace;
+ extern int do_hex_dump;
+ extern int do_pkt_dump;
+diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
+index 4ab769c07b..5cdc3a7b6c 100644
+--- a/fftools/ffmpeg_filter.c
++++ b/fftools/ffmpeg_filter.c
+@@ -1160,8 +1160,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame)
+ 
+     ifilter->format = frame->format;
+ 
+-    ifilter->width               = frame->width;
+-    ifilter->height              = frame->height;
++    ifilter->width               = av_frame_cropped_width(frame);
++    ifilter->height              = av_frame_cropped_height(frame);
+     ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
+ 
+     ifilter->sample_rate         = frame->sample_rate;
+diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c
+index fc4a5d31d6..cc69dce40e 100644
+--- a/fftools/ffmpeg_hw.c
++++ b/fftools/ffmpeg_hw.c
+@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type)
+     char *name;
+     size_t index_pos;
+     int index, index_limit = 1000;
++    if (!type_name)
++        return NULL;
+     index_pos = strlen(type_name);
+     name = av_malloc(index_pos + 4);
+     if (!name)
+diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
+index 807e783422..456d4f349b 100644
+--- a/fftools/ffmpeg_opt.c
++++ b/fftools/ffmpeg_opt.c
+@@ -133,12 +133,22 @@ static const char *const opt_name_enc_time_bases[]            = {"enc_time_base"
+     }\
+ }
+ 
++#if CONFIG_RPI
++static int rpi_init(AVCodecContext *avctx) {
++    return 0;
++}
++#endif
++
+ const HWAccel hwaccels[] = {
+ #if CONFIG_VIDEOTOOLBOX
+     { "videotoolbox", videotoolbox_init, HWACCEL_VIDEOTOOLBOX, AV_PIX_FMT_VIDEOTOOLBOX },
+ #endif
+ #if CONFIG_LIBMFX
+     { "qsv",   qsv_init,   HWACCEL_QSV,   AV_PIX_FMT_QSV },
++#endif
++#if CONFIG_RPI
++    {  "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_8 },
++    {  "rpi", rpi_init, HWACCEL_RPI, AV_PIX_FMT_RPI4_10 },
+ #endif
+     { 0 },
+ };
+@@ -158,6 +168,7 @@ float frame_drop_threshold = 0;
+ int do_deinterlace    = 0;
+ int do_benchmark      = 0;
+ int do_benchmark_all  = 0;
++int no_cvt_hw         = 0;
+ int do_hex_dump       = 0;
+ int do_pkt_dump       = 0;
+ int copy_ts           = 0;
+@@ -3499,6 +3510,8 @@ const OptionDef options[] = {
+         "add timings for benchmarking" },
+     { "benchmark_all",  OPT_BOOL | OPT_EXPERT,                       { &do_benchmark_all },
+       "add timings for each task" },
++    { "no_cvt_hw",      OPT_BOOL | OPT_EXPERT,                       { &no_cvt_hw },
++      "do not auto-convert hw frames to sw" },
+     { "progress",       HAS_ARG | OPT_EXPERT,                        { .func_arg = opt_progress },
+       "write program-readable progress information", "url" },
+     { "stdin",          OPT_BOOL | OPT_EXPERT,                       { &stdin_interaction },
+diff --git a/libavcodec/Makefile b/libavcodec/Makefile
+index 33a280cf69..e93c842047 100644
+--- a/libavcodec/Makefile
++++ b/libavcodec/Makefile
+@@ -19,6 +19,7 @@ HEADERS = ac3_parser.h                                                  \
+           mediacodec.h                                                  \
+           packet.h                                                      \
+           qsv.h                                                         \
++          rpi_zc.h                                                      \
+           vaapi.h                                                       \
+           vdpau.h                                                       \
+           version.h                                                     \
+@@ -140,6 +141,7 @@ OBJS-$(CONFIG_QSVDEC)                  += qsvdec.o
+ OBJS-$(CONFIG_QSVENC)                  += qsvenc.o
+ OBJS-$(CONFIG_RANGECODER)              += rangecoder.o
+ OBJS-$(CONFIG_RDFT)                    += rdft.o
++OBJS-$(CONFIG_RPI)                     += rpi_qpu.o rpi_mailbox.o rpi_zc.o
+ OBJS-$(CONFIG_RV34DSP)                 += rv34dsp.o
+ OBJS-$(CONFIG_SHARED)                  += log2_tab.o reverse.o
+ OBJS-$(CONFIG_SINEWIN)                 += sinewin.o
+@@ -154,7 +156,10 @@ OBJS-$(CONFIG_VIDEODSP)                += videodsp.o
+ OBJS-$(CONFIG_VP3DSP)                  += vp3dsp.o
+ OBJS-$(CONFIG_VP56DSP)                 += vp56dsp.o
+ OBJS-$(CONFIG_VP8DSP)                  += vp8dsp.o
+-OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
++OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
++                                          weak_link.o v4l2_req_dmabufs.o
++OBJS-$(CONFIG_V4L2_REQUEST)            += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
++					  v4l2_req_devscan.o weak_link.o
+ OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
+ OBJS-$(CONFIG_WMV2DSP)                 += wmv2dsp.o
+ 
+@@ -403,6 +408,14 @@ OBJS-$(CONFIG_HEVC_QSV_DECODER)        += qsvdec.o
+ OBJS-$(CONFIG_HEVC_QSV_ENCODER)        += qsvenc_hevc.o hevc_ps_enc.o       \
+                                           hevc_data.o
+ OBJS-$(CONFIG_HEVC_RKMPP_DECODER)      += rkmppdec.o
++OBJS-$(CONFIG_RPI)                     += rpi_mem.o \
++                                          rpi_mailbox.o rpi_zc.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER)        += rpi_hevcdec.o rpi_hevc_mvs.o \
++                                          rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o    \
++                                          rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o    \
++                                          rpi_hevc_shader.o rpi_hevc_shader_template.o       \
++                                          rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
++                                          rpi_hevc_sei.o rpi_hevc_data.o rpi_qpu.o rpi_mem.o
+ OBJS-$(CONFIG_HEVC_VAAPI_ENCODER)      += vaapi_encode_h265.o h265_profile_level.o
+ OBJS-$(CONFIG_HEVC_V4L2M2M_DECODER)    += v4l2_m2m_dec.o
+ OBJS-$(CONFIG_HEVC_V4L2M2M_ENCODER)    += v4l2_m2m_enc.o
+@@ -941,6 +954,10 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)       += dxva2_hevc.o
+ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
+ OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
+ OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec.o
++OBJS-$(CONFIG_HEVC_RPI4_8_HWACCEL)        += rpivid_hevc.o
++OBJS-$(CONFIG_HEVC_RPI4_10_HWACCEL)       += rpivid_hevc.o
++OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o
++OBJS-$(CONFIG_V4L2_REQ_HEVC_VX)           += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
+ OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
+ OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o h265_profile_level.o
+ OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
+@@ -1297,3 +1314,31 @@ $(SUBDIR)pcm.o: $(SUBDIR)pcm_tables.h
+ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
+ $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
+ endif
++
++ifdef CONFIG_HEVC_RPI_DECODER
++QASM_PY := ../local/bin/qasm.py
++VASMVIDCORE := ../local/bin/vasmvidcore_std
++
++ifneq ("$(wildcard $(QASM_PY))","")
++$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
++	$(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
++
++$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
++	$(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
++endif
++
++ifneq ("$(wildcard $(VASMVIDCORE))","")
++$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
++	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
++$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
++	$(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
++
++$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
++	python pi-util/make_array.py $<
++$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
++	python pi-util/make_array.py $<
++endif
++
++$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
++$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_hevc_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h # objects that depend on the generated shader header
++endif
+diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
+index 954461f81d..c8935f205e 100644
+--- a/libavcodec/aarch64/Makefile
++++ b/libavcodec/aarch64/Makefile
+@@ -44,10 +44,12 @@ NEON-OBJS-$(CONFIG_H264PRED)            += aarch64/h264pred_neon.o
+ NEON-OBJS-$(CONFIG_H264QPEL)            += aarch64/h264qpel_neon.o             \
+                                            aarch64/hpeldsp_neon.o
+ NEON-OBJS-$(CONFIG_HPELDSP)             += aarch64/hpeldsp_neon.o
+-NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/simple_idct_neon.o
++NEON-OBJS-$(CONFIG_IDCTDSP)             += aarch64/idctdsp_neon.o              \
++                                           aarch64/simple_idct_neon.o
+ NEON-OBJS-$(CONFIG_MDCT)                += aarch64/mdct_neon.o
+ NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
+ NEON-OBJS-$(CONFIG_PIXBLOCKDSP)         += aarch64/pixblockdsp_neon.o
++NEON-OBJS-$(CONFIG_VC1DSP)              += aarch64/vc1dsp_neon.o
+ NEON-OBJS-$(CONFIG_VP8DSP)              += aarch64/vp8dsp_neon.o
+ 
+ # decoders/encoders
+diff --git a/libavcodec/aarch64/idctdsp_init_aarch64.c b/libavcodec/aarch64/idctdsp_init_aarch64.c
+index 742a3372e3..eec21aa5a2 100644
+--- a/libavcodec/aarch64/idctdsp_init_aarch64.c
++++ b/libavcodec/aarch64/idctdsp_init_aarch64.c
+@@ -27,19 +27,29 @@
+ #include "libavcodec/idctdsp.h"
+ #include "idct.h"
+ 
++void ff_put_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_put_signed_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++void ff_add_pixels_clamped_neon(const int16_t *, uint8_t *, ptrdiff_t);
++
+ av_cold void ff_idctdsp_init_aarch64(IDCTDSPContext *c, AVCodecContext *avctx,
+                                      unsigned high_bit_depth)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+-    if (have_neon(cpu_flags) && !avctx->lowres && !high_bit_depth) {
+-        if (avctx->idct_algo == FF_IDCT_AUTO ||
+-            avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
+-            avctx->idct_algo == FF_IDCT_SIMPLENEON) {
+-            c->idct_put  = ff_simple_idct_put_neon;
+-            c->idct_add  = ff_simple_idct_add_neon;
+-            c->idct      = ff_simple_idct_neon;
+-            c->perm_type = FF_IDCT_PERM_PARTTRANS;
++    if (have_neon(cpu_flags)) {
++        if (!avctx->lowres && !high_bit_depth) {
++            if (avctx->idct_algo == FF_IDCT_AUTO ||
++                avctx->idct_algo == FF_IDCT_SIMPLEAUTO ||
++                avctx->idct_algo == FF_IDCT_SIMPLENEON) {
++                c->idct_put  = ff_simple_idct_put_neon;
++                c->idct_add  = ff_simple_idct_add_neon;
++                c->idct      = ff_simple_idct_neon;
++                c->perm_type = FF_IDCT_PERM_PARTTRANS;
++            }
+         }
++
++        c->add_pixels_clamped        = ff_add_pixels_clamped_neon;
++        c->put_pixels_clamped        = ff_put_pixels_clamped_neon;
++        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_neon;
+     }
+ }
+diff --git a/libavcodec/aarch64/idctdsp_neon.S b/libavcodec/aarch64/idctdsp_neon.S
+new file mode 100644
+index 0000000000..7f47611206
+--- /dev/null
++++ b/libavcodec/aarch64/idctdsp_neon.S
+@@ -0,0 +1,130 @@
++/*
++ * IDCT AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// Clamp 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit results
++//   x2 = row stride for results, bytes
++function ff_put_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64   // coefficient rows 0-3; x0 += 64
++        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x0]        // coefficient rows 4-7
++        sqxtun          v0.8b, v0.8h            // row 0: saturate signed 16-bit to unsigned 8-bit
++        sqxtun          v1.8b, v1.8h            // row 1
++        sqxtun          v2.8b, v2.8h            // row 2
++        sqxtun          v3.8b, v3.8h            // row 3
++        sqxtun          v4.8b, v4.8h            // row 4
++        st1             {v0.8b}, [x1], x2       // store row 0, x1 += stride
++        sqxtun          v0.8b, v5.8h            // row 5 (v0 is free once row 0 is stored)
++        st1             {v1.8b}, [x1], x2       // store row 1
++        sqxtun          v1.8b, v6.8h            // row 6
++        st1             {v2.8b}, [x1], x2       // store row 2
++        sqxtun          v2.8b, v7.8h            // row 7
++        st1             {v3.8b}, [x1], x2       // store row 3
++        st1             {v4.8b}, [x1], x2       // store row 4
++        st1             {v0.8b}, [x1], x2       // store row 5
++        st1             {v1.8b}, [x1], x2       // store row 6
++        st1             {v2.8b}, [x1]           // store row 7
++        ret
++endfunc
++
++// Clamp 16-bit signed block coefficients to signed 8-bit (biased by 128)
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit results
++//   x2 = row stride for results, bytes
++function ff_put_signed_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64   // coefficient rows 0-3; x0 += 64
++        movi            v4.8b, #128             // bias added to every narrowed byte
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]    // coefficient rows 4-7
++        sqxtn           v0.8b, v0.8h            // row 0: saturate signed 16-bit to signed 8-bit
++        sqxtn           v1.8b, v1.8h            // row 1
++        sqxtn           v2.8b, v2.8h            // row 2
++        sqxtn           v3.8b, v3.8h            // row 3
++        sqxtn           v5.8b, v16.8h           // row 4
++        add             v0.8b, v0.8b, v4.8b     // row 0: add the 128 bias (byte add, wraps modulo 256)
++        sqxtn           v6.8b, v17.8h           // row 5
++        add             v1.8b, v1.8b, v4.8b     // row 1 + bias
++        sqxtn           v7.8b, v18.8h           // row 6
++        add             v2.8b, v2.8b, v4.8b     // row 2 + bias
++        sqxtn           v16.8b, v19.8h          // row 7
++        add             v3.8b, v3.8b, v4.8b     // row 3 + bias
++        st1             {v0.8b}, [x1], x2       // store row 0, x1 += stride
++        add             v0.8b, v5.8b, v4.8b     // row 4 + bias
++        st1             {v1.8b}, [x1], x2       // store row 1
++        add             v1.8b, v6.8b, v4.8b     // row 5 + bias
++        st1             {v2.8b}, [x1], x2       // store row 2
++        add             v2.8b, v7.8b, v4.8b     // row 6 + bias
++        st1             {v3.8b}, [x1], x2       // store row 3
++        add             v3.8b, v16.8b, v4.8b    // row 7 + bias
++        st1             {v0.8b}, [x1], x2       // store row 4
++        st1             {v1.8b}, [x1], x2       // store row 5
++        st1             {v2.8b}, [x1], x2       // store row 6
++        st1             {v3.8b}, [x1]           // store row 7
++        ret
++endfunc
++
++// Add 16-bit signed block coefficients to unsigned 8-bit
++// On entry:
++//   x0 -> array of 64x 16-bit coefficients
++//   x1 -> 8-bit input and results
++//   x2 = row stride for 8-bit input and results, bytes
++function ff_add_pixels_clamped_neon, export=1
++        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #64   // coefficient rows 0-3; x0 += 64
++        mov             x3, x1                  // x3 = store pointer; x1 is advanced by the pixel loads
++        ld1             {v4.8b}, [x1], x2       // pixel row 0
++        ld1             {v5.8b}, [x1], x2       // pixel row 1
++        ld1             {v6.8b}, [x1], x2       // pixel row 2
++        ld1             {v7.8b}, [x1], x2       // pixel row 3
++        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0]    // coefficient rows 4-7
++        uaddw           v0.8h, v0.8h, v4.8b     // row 0: coefficients + zero-extended pixels
++        uaddw           v1.8h, v1.8h, v5.8b     // row 1 sum
++        uaddw           v2.8h, v2.8h, v6.8b     // row 2 sum
++        ld1             {v4.8b}, [x1], x2       // pixel row 4
++        uaddw           v3.8h, v3.8h, v7.8b     // row 3 sum
++        ld1             {v5.8b}, [x1], x2       // pixel row 5
++        sqxtun          v0.8b, v0.8h            // row 0: saturate sum to unsigned 8-bit
++        ld1             {v6.8b}, [x1], x2       // pixel row 6
++        sqxtun          v1.8b, v1.8h            // row 1
++        ld1             {v7.8b}, [x1]           // pixel row 7
++        sqxtun          v2.8b, v2.8h            // row 2
++        sqxtun          v3.8b, v3.8h            // row 3
++        uaddw           v4.8h, v16.8h, v4.8b    // row 4 sum
++        st1             {v0.8b}, [x3], x2       // store row 0, x3 += stride
++        uaddw           v0.8h, v17.8h, v5.8b    // row 5 sum
++        st1             {v1.8b}, [x3], x2       // store row 1
++        uaddw           v1.8h, v18.8h, v6.8b    // row 6 sum
++        st1             {v2.8b}, [x3], x2       // store row 2
++        uaddw           v2.8h, v19.8h, v7.8b    // row 7 sum
++        sqxtun          v4.8b, v4.8h            // row 4
++        sqxtun          v0.8b, v0.8h            // row 5
++        st1             {v3.8b}, [x3], x2       // store row 3
++        sqxtun          v1.8b, v1.8h            // row 6
++        sqxtun          v2.8b, v2.8h            // row 7
++        st1             {v4.8b}, [x3], x2       // store row 4
++        st1             {v0.8b}, [x3], x2       // store row 5
++        st1             {v1.8b}, [x3], x2       // store row 6
++        st1             {v2.8b}, [x3]           // store row 7
++        ret
++endfunc
+diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
+index 13dfd74940..a7976fd596 100644
+--- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
++++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
+@@ -21,10 +21,28 @@
+ #include "libavutil/attributes.h"
+ #include "libavutil/cpu.h"
+ #include "libavutil/aarch64/cpu.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+ 
+ #include "config.h"
+ 
++void ff_vc1_inv_trans_8x8_neon(int16_t *block);
++void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_inv_trans_8x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
++
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, ptrdiff_t stride, int pq);
++
+ void ff_put_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ void ff_avg_vc1_chroma_mc8_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+@@ -34,14 +52,90 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ 
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
++static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
++{
++    /* Dealing with starting and stopping, and removing escape bytes, are
++     * comparatively less time-sensitive, so are more clearly expressed using
++     * a C wrapper around the assembly inner loop. Note that we assume a
++     * little-endian machine that supports unaligned loads. */
++    int dsize = 0; /* bytes written to dst so far; this is the return value */
++    while (size >= 4) /* the escape test below reads 4 bytes at src */
++    {
++        int found = 0;
++        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* bytewise until dst is 8-byte aligned */
++        {
++            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 followed by a byte <= 03 */
++            if (!found)
++            {
++                *dst++ = *src++;
++                --size;
++                ++dsize;
++            }
++        }
++        if (!found)
++        {
++            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* helper's result is used as the byte count still left to process */
++            dst += skip;
++            src += skip;
++            size -= skip;
++            dsize += skip;
++            while (!found && size >= 4) /* bytewise scan from where the helper stopped */
++            {
++                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* same escape test as above */
++                if (!found)
++                {
++                    *dst++ = *src++;
++                    --size;
++                    ++dsize;
++                }
++            }
++        }
++        if (found)
++        {
++            *dst++ = *src++; /* keep the first 00 */
++            *dst++ = *src++; /* keep the second 00 */
++            ++src;           /* drop the 03 escape byte */
++            size -= 3;
++            dsize += 2;
++        }
++    }
++    while (size > 0) /* fewer than 4 bytes left: copy them unchanged */
++    {
++        *dst++ = *src++;
++        --size;
++        ++dsize;
++    }
++    return dsize;
++}
++
+ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
+ {
+     int cpu_flags = av_get_cpu_flags();
+ 
+     if (have_neon(cpu_flags)) {
++        dsp->vc1_inv_trans_8x8 = ff_vc1_inv_trans_8x8_neon;
++        dsp->vc1_inv_trans_8x4 = ff_vc1_inv_trans_8x4_neon;
++        dsp->vc1_inv_trans_4x8 = ff_vc1_inv_trans_4x8_neon;
++        dsp->vc1_inv_trans_4x4 = ff_vc1_inv_trans_4x4_neon;
++        dsp->vc1_inv_trans_8x8_dc = ff_vc1_inv_trans_8x8_dc_neon;
++        dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
++        dsp->vc1_inv_trans_4x8_dc = ff_vc1_inv_trans_4x8_dc_neon;
++        dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
++
++        dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
++        dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
++        dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
++        dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
++        dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++        dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
+         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+         dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+         dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++        dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
+     }
+ }
+diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
+new file mode 100644
+index 0000000000..9a96c2523c
+--- /dev/null
++++ b/libavcodec/aarch64/vc1dsp_neon.S
+@@ -0,0 +1,1546 @@
++/*
++ * VC1 AArch64 NEON optimisations
++ *
++ * Copyright (c) 2022 Ben Avison <bavison@riscosopen.org>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/aarch64/asm.S"
++
++// VC-1 8x8 inverse transform
++// On entry:
++//   x0 -> array of 16-bit inverse transform coefficients, in column-major order
++// On exit:
++//   array at x0 updated to hold transformed block; also now held in row-major order
++function ff_vc1_inv_trans_8x8_neon, export=1
++        ld1             {v1.16b, v2.16b}, [x0], #32
++        ld1             {v3.16b, v4.16b}, [x0], #32
++        ld1             {v5.16b, v6.16b}, [x0], #32
++        shl             v1.8h, v1.8h, #2        //         8/2 * src[0]
++        sub             x1, x0, #3*32
++        ld1             {v16.16b, v17.16b}, [x0]
++        shl             v7.8h, v2.8h, #4        //          16 * src[8]
++        shl             v18.8h, v2.8h, #2       //           4 * src[8]
++        shl             v19.8h, v4.8h, #4       //                        16 * src[24]
++        ldr             d0, .Lcoeffs_it8
++        shl             v5.8h, v5.8h, #2        //                                      8/2 * src[32]
++        shl             v20.8h, v6.8h, #4       //                                       16 * src[40]
++        shl             v21.8h, v6.8h, #2       //                                        4 * src[40]
++        shl             v22.8h, v17.8h, #4      //                                                      16 * src[56]
++        ssra            v20.8h, v19.8h, #2      //                         4 * src[24] + 16 * src[40]
++        mul             v23.8h, v3.8h, v0.h[0]  //                       6/2 * src[16]
++        sub             v19.8h, v19.8h, v21.8h  //                        16 * src[24] -  4 * src[40]
++        ssra            v7.8h, v22.8h, #2       //          16 * src[8]                               +  4 * src[56]
++        sub             v18.8h, v22.8h, v18.8h  //        -  4 * src[8]                               + 16 * src[56]
++        shl             v3.8h, v3.8h, #3        //                      16/2 * src[16]
++        mls             v20.8h, v2.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
++        ssra            v1.8h, v1.8h, #1        //        12/2 * src[0]
++        ssra            v5.8h, v5.8h, #1        //                                     12/2 * src[32]
++        mla             v7.8h, v4.8h, v0.h[2]   //          16 * src[8] + 15 * src[24]                +  4 * src[56]
++        shl             v21.8h, v16.8h, #3      //                                                    16/2 * src[48]
++        mls             v19.8h, v2.8h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
++        sub             v2.8h, v23.8h, v21.8h   // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
++        mla             v18.8h, v4.8h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
++        add             v4.8h, v1.8h, v5.8h     // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
++        sub             v1.8h, v1.8h, v5.8h     // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
++        mla             v3.8h, v16.8h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
++        mla             v7.8h, v6.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
++        add             v5.8h, v1.8h, v2.8h     // t6/2 = t2/2 + t4/2
++        sub             v16.8h, v1.8h, v2.8h    // t7/2 = t2/2 - t4/2
++        mla             v20.8h, v17.8h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
++        add             v21.8h, v1.8h, v2.8h    // t6/2 = t2/2 + t4/2
++        add             v22.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
++        mls             v19.8h, v17.8h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
++        sub             v17.8h, v4.8h, v3.8h    // t8/2 = t1/2 - t3/2
++        add             v23.8h, v4.8h, v3.8h    // t5/2 = t1/2 + t3/2
++        mls             v18.8h, v6.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
++        sub             v1.8h, v1.8h, v2.8h     // t7/2 = t2/2 - t4/2
++        sub             v2.8h, v4.8h, v3.8h     // t8/2 = t1/2 - t3/2
++        neg             v3.8h, v7.8h            // -t1
++        neg             v4.8h, v20.8h           // +t2
++        neg             v6.8h, v19.8h           // +t3
++        ssra            v22.8h, v7.8h, #1       // (t5 + t1) >> 1
++        ssra            v1.8h, v19.8h, #1       // (t7 - t3) >> 1
++        neg             v7.8h, v18.8h           // +t4
++        ssra            v5.8h, v4.8h, #1        // (t6 + t2) >> 1
++        ssra            v16.8h, v6.8h, #1       // (t7 + t3) >> 1
++        ssra            v2.8h, v18.8h, #1       // (t8 - t4) >> 1
++        ssra            v17.8h, v7.8h, #1       // (t8 + t4) >> 1
++        ssra            v21.8h, v20.8h, #1      // (t6 - t2) >> 1
++        ssra            v23.8h, v3.8h, #1       // (t5 - t1) >> 1
++        srshr           v3.8h, v22.8h, #2       // (t5 + t1 + 4) >> 3
++        srshr           v4.8h, v5.8h, #2        // (t6 + t2 + 4) >> 3
++        srshr           v5.8h, v16.8h, #2       // (t7 + t3 + 4) >> 3
++        srshr           v6.8h, v17.8h, #2       // (t8 + t4 + 4) >> 3
++        srshr           v2.8h, v2.8h, #2        // (t8 - t4 + 4) >> 3
++        srshr           v1.8h, v1.8h, #2        // (t7 - t3 + 4) >> 3
++        srshr           v7.8h, v21.8h, #2       // (t6 - t2 + 4) >> 3
++        srshr           v16.8h, v23.8h, #2      // (t5 - t1 + 4) >> 3
++        trn2            v17.8h, v3.8h, v4.8h
++        trn2            v18.8h, v5.8h, v6.8h
++        trn2            v19.8h, v2.8h, v1.8h
++        trn2            v20.8h, v7.8h, v16.8h
++        trn1            v21.4s, v17.4s, v18.4s
++        trn2            v17.4s, v17.4s, v18.4s
++        trn1            v18.4s, v19.4s, v20.4s
++        trn2            v19.4s, v19.4s, v20.4s
++        trn1            v3.8h, v3.8h, v4.8h
++        trn2            v4.2d, v21.2d, v18.2d
++        trn1            v20.2d, v17.2d, v19.2d
++        trn1            v5.8h, v5.8h, v6.8h
++        trn1            v1.8h, v2.8h, v1.8h
++        trn1            v2.8h, v7.8h, v16.8h
++        trn1            v6.2d, v21.2d, v18.2d
++        trn2            v7.2d, v17.2d, v19.2d
++        shl             v16.8h, v20.8h, #4      //                        16 * src[24]
++        shl             v17.8h, v4.8h, #4       //                                       16 * src[40]
++        trn1            v18.4s, v3.4s, v5.4s
++        trn1            v19.4s, v1.4s, v2.4s
++        shl             v21.8h, v7.8h, #4       //                                                      16 * src[56]
++        shl             v22.8h, v6.8h, #2       //           4 * src[8]
++        shl             v23.8h, v4.8h, #2       //                                        4 * src[40]
++        trn2            v3.4s, v3.4s, v5.4s
++        trn2            v1.4s, v1.4s, v2.4s
++        shl             v2.8h, v6.8h, #4        //          16 * src[8]
++        sub             v5.8h, v16.8h, v23.8h   //                        16 * src[24] -  4 * src[40]
++        ssra            v17.8h, v16.8h, #2      //                         4 * src[24] + 16 * src[40]
++        sub             v16.8h, v21.8h, v22.8h  //        -  4 * src[8]                               + 16 * src[56]
++        trn1            v22.2d, v18.2d, v19.2d
++        trn2            v18.2d, v18.2d, v19.2d
++        trn1            v19.2d, v3.2d, v1.2d
++        ssra            v2.8h, v21.8h, #2       //          16 * src[8]                               +  4 * src[56]
++        mls             v17.8h, v6.8h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
++        shl             v21.8h, v22.8h, #2      //         8/2 * src[0]
++        shl             v18.8h, v18.8h, #2      //                                      8/2 * src[32]
++        mls             v5.8h, v6.8h, v0.h[1]   //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
++        shl             v6.8h, v19.8h, #3       //                      16/2 * src[16]
++        trn2            v1.2d, v3.2d, v1.2d
++        mla             v16.8h, v20.8h, v0.h[1] //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
++        ssra            v21.8h, v21.8h, #1      //        12/2 * src[0]
++        ssra            v18.8h, v18.8h, #1      //                                     12/2 * src[32]
++        mul             v3.8h, v19.8h, v0.h[0]  //                       6/2 * src[16]
++        shl             v19.8h, v1.8h, #3       //                                                    16/2 * src[48]
++        mla             v2.8h, v20.8h, v0.h[2]  //          16 * src[8] + 15 * src[24]                +  4 * src[56]
++        add             v20.8h, v21.8h, v18.8h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
++        mla             v6.8h, v1.8h, v0.h[0]   // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
++        sub             v1.8h, v21.8h, v18.8h   // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
++        sub             v3.8h, v3.8h, v19.8h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
++        mla             v17.8h, v7.8h, v0.h[1]  // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
++        mls             v5.8h, v7.8h, v0.h[2]   // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
++        add             v7.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
++        add             v18.8h, v20.8h, v6.8h   // t5/2 = t1/2 + t3/2
++        mls             v16.8h, v4.8h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
++        sub             v19.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
++        neg             v21.8h, v17.8h          // +t2
++        mla             v2.8h, v4.8h, v0.h[1]   //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
++        sub             v0.8h, v20.8h, v6.8h    // t8/2 = t1/2 - t3/2
++        neg             v4.8h, v5.8h            // +t3
++        sub             v22.8h, v1.8h, v3.8h    // t7/2 = t2/2 - t4/2
++        sub             v23.8h, v20.8h, v6.8h   // t8/2 = t1/2 - t3/2
++        neg             v24.8h, v16.8h          // +t4
++        add             v6.8h, v20.8h, v6.8h    // t5/2 = t1/2 + t3/2
++        add             v1.8h, v1.8h, v3.8h     // t6/2 = t2/2 + t4/2
++        ssra            v7.8h, v21.8h, #1       // (t6 + t2) >> 1
++        neg             v3.8h, v2.8h            // -t1
++        ssra            v18.8h, v2.8h, #1       // (t5 + t1) >> 1
++        ssra            v19.8h, v4.8h, #1       // (t7 + t3) >> 1
++        ssra            v0.8h, v24.8h, #1       // (t8 + t4) >> 1
++        srsra           v23.8h, v16.8h, #1      // (t8 - t4 + 1) >> 1
++        srsra           v22.8h, v5.8h, #1       // (t7 - t3 + 1) >> 1
++        srsra           v1.8h, v17.8h, #1       // (t6 - t2 + 1) >> 1
++        srsra           v6.8h, v3.8h, #1        // (t5 - t1 + 1) >> 1
++        srshr           v2.8h, v18.8h, #6       // (t5 + t1 + 64) >> 7
++        srshr           v3.8h, v7.8h, #6        // (t6 + t2 + 64) >> 7
++        srshr           v4.8h, v19.8h, #6       // (t7 + t3 + 64) >> 7
++        srshr           v5.8h, v0.8h, #6        // (t8 + t4 + 64) >> 7
++        srshr           v16.8h, v23.8h, #6      // (t8 - t4 + 65) >> 7
++        srshr           v17.8h, v22.8h, #6      // (t7 - t3 + 65) >> 7
++        st1             {v2.16b, v3.16b}, [x1], #32
++        srshr           v0.8h, v1.8h, #6        // (t6 - t2 + 65) >> 7
++        srshr           v1.8h, v6.8h, #6        // (t5 - t1 + 65) >> 7
++        st1             {v4.16b, v5.16b}, [x1], #32
++        st1             {v16.16b, v17.16b}, [x1], #32
++        st1             {v0.16b, v1.16b}, [x1]
++        ret
++endfunc
++
++// VC-1 8x4 inverse transform (8 columns x 4 rows): 8-point pass along rows, then 4-point pass along columns
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order (4 rows of 8 samples are read and rewritten)
++//   x1 = row stride for 8-bit sample array
++//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (4 contiguous rows of 8)
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x4_neon, export=1
++        ld1             {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32 // coefficient rows 0 and 1
++        mov             x3, x0                  // x3 = write pointer (x0 is advanced by the sample loads)
++        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x2] // coefficient rows 2 and 3
++        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
++        ld1             {v5.8b}, [x0], x1       // sample row 0
++        trn2            v6.4h, v1.4h, v3.4h     // 01 11 03 13 (digit pairs: row, column)
++        trn2            v7.4h, v2.4h, v4.4h     // 05 15 07 17
++        trn1            v1.4h, v1.4h, v3.4h     // 00 10 02 12
++        trn1            v2.4h, v2.4h, v4.4h     // 04 14 06 16
++        trn2            v3.4h, v16.4h, v18.4h   // 21 31 23 33
++        trn2            v4.4h, v17.4h, v19.4h   // 25 35 27 37
++        trn1            v16.4h, v16.4h, v18.4h  // 20 30 22 32
++        trn1            v17.4h, v17.4h, v19.4h  // 24 34 26 36
++        ld1             {v18.8b}, [x0], x1      // sample row 1
++        trn1            v19.2s, v6.2s, v3.2s    // 01 11 21 31
++        trn2            v3.2s, v6.2s, v3.2s     // 03 13 23 33
++        trn1            v6.2s, v7.2s, v4.2s     // 05 15 25 35
++        trn2            v4.2s, v7.2s, v4.2s     // 07 17 27 37
++        trn1            v7.2s, v1.2s, v16.2s    // 00 10 20 30
++        trn1            v20.2s, v2.2s, v17.2s   // 04 14 24 34
++        shl             v21.4h, v19.4h, #4      //          16 * src[1]
++        trn2            v1.2s, v1.2s, v16.2s    // 02 12 22 32
++        shl             v16.4h, v3.4h, #4       //                        16 * src[3]
++        trn2            v2.2s, v2.2s, v17.2s    // 06 16 26 36
++        shl             v17.4h, v6.4h, #4       //                                      16 * src[5]
++        ld1             {v22.8b}, [x0], x1      // sample row 2
++        shl             v23.4h, v4.4h, #4       //                                                    16 * src[7]
++        mul             v24.4h, v1.4h, v0.h[0]  //                       6/2 * src[2]
++        ld1             {v25.8b}, [x0]          // sample row 3
++        shl             v26.4h, v19.4h, #2      //           4 * src[1]
++        shl             v27.4h, v6.4h, #2       //                                       4 * src[5]
++        ssra            v21.4h, v23.4h, #2      //          16 * src[1]                             +  4 * src[7]
++        ssra            v17.4h, v16.4h, #2      //                         4 * src[3] + 16 * src[5]
++        sub             v23.4h, v23.4h, v26.4h  //        -  4 * src[1]                             + 16 * src[7]
++        sub             v16.4h, v16.4h, v27.4h  //                        16 * src[3] -  4 * src[5]
++        shl             v7.4h, v7.4h, #2        //         8/2 * src[0]
++        shl             v20.4h, v20.4h, #2      //                                     8/2 * src[4]
++        mla             v21.4h, v3.4h, v0.h[2]  //          16 * src[1] + 15 * src[3]               +  4 * src[7]
++        shl             v1.4h, v1.4h, #3        //                      16/2 * src[2]
++        mls             v17.4h, v19.4h, v0.h[2] //        - 15 * src[1] +  4 * src[3] + 16 * src[5]
++        ssra            v7.4h, v7.4h, #1        //        12/2 * src[0]
++        mls             v16.4h, v19.4h, v0.h[1] //        -  9 * src[1] + 16 * src[3] -  4 * src[5]
++        ssra            v20.4h, v20.4h, #1      //                                    12/2 * src[4]
++        mla             v23.4h, v3.4h, v0.h[1]  //        -  4 * src[1] +  9 * src[3]               + 16 * src[7]
++        shl             v3.4h, v2.4h, #3        //                                                  16/2 * src[6]
++        mla             v1.4h, v2.4h, v0.h[0]   // t3/2 =               16/2 * src[2]             +  6/2 * src[6]
++        mla             v21.4h, v6.4h, v0.h[1]  //  t1  =   16 * src[1] + 15 * src[3] +  9 * src[5] +  4 * src[7]
++        mla             v17.4h, v4.4h, v0.h[1]  // -t2  = - 15 * src[1] +  4 * src[3] + 16 * src[5] +  9 * src[7]
++        sub             v2.4h, v24.4h, v3.4h    // t4/2 =                6/2 * src[2]             - 16/2 * src[6]
++        mls             v16.4h, v4.4h, v0.h[2]  // -t3  = -  9 * src[1] + 16 * src[3] -  4 * src[5] - 15 * src[7]
++        add             v3.4h, v7.4h, v20.4h    // t1/2 = 12/2 * src[0]             + 12/2 * src[4]
++        mls             v23.4h, v6.4h, v0.h[2]  // -t4  = -  4 * src[1] +  9 * src[3] - 15 * src[5] + 16 * src[7]
++        sub             v4.4h, v7.4h, v20.4h    // t2/2 = 12/2 * src[0]             - 12/2 * src[4]
++        neg             v6.4h, v21.4h           // -t1
++        add             v7.4h, v3.4h, v1.4h     // t5/2 = t1/2 + t3/2
++        sub             v19.4h, v3.4h, v1.4h    // t8/2 = t1/2 - t3/2
++        add             v20.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
++        sub             v24.4h, v4.4h, v2.4h    // t7/2 = t2/2 - t4/2
++        add             v26.4h, v3.4h, v1.4h    // t5/2 = t1/2 + t3/2
++        add             v27.4h, v4.4h, v2.4h    // t6/2 = t2/2 + t4/2
++        sub             v2.4h, v4.4h, v2.4h     // t7/2 = t2/2 - t4/2
++        sub             v1.4h, v3.4h, v1.4h     // t8/2 = t1/2 - t3/2
++        neg             v3.4h, v17.4h           // +t2
++        neg             v4.4h, v16.4h           // +t3
++        neg             v28.4h, v23.4h          // +t4
++        ssra            v7.4h, v21.4h, #1       // (t5 + t1) >> 1
++        ssra            v1.4h, v23.4h, #1       // (t8 - t4) >> 1
++        ssra            v20.4h, v3.4h, #1       // (t6 + t2) >> 1
++        ssra            v24.4h, v4.4h, #1       // (t7 + t3) >> 1
++        ssra            v19.4h, v28.4h, #1      // (t8 + t4) >> 1
++        ssra            v2.4h, v16.4h, #1       // (t7 - t3) >> 1
++        ssra            v27.4h, v17.4h, #1      // (t6 - t2) >> 1
++        ssra            v26.4h, v6.4h, #1       // (t5 - t1) >> 1
++        trn1            v1.2d, v7.2d, v1.2d     // output columns 0 (low half) and 4 (high half)
++        trn1            v2.2d, v20.2d, v2.2d    // output columns 1 (low half) and 5 (high half)
++        trn1            v3.2d, v24.2d, v27.2d   // output columns 2 (low half) and 6 (high half)
++        trn1            v4.2d, v19.2d, v26.2d   // output columns 3 (low half) and 7 (high half)
++        srshr           v1.8h, v1.8h, #2        // (t5 + t1 + 4) >> 3, (t8 - t4 + 4) >> 3
++        srshr           v2.8h, v2.8h, #2        // (t6 + t2 + 4) >> 3, (t7 - t3 + 4) >> 3
++        srshr           v3.8h, v3.8h, #2        // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
++        srshr           v4.8h, v4.8h, #2        // (t8 + t4 + 4) >> 3, (t5 - t1 + 4) >> 3
++        trn2            v6.8h, v1.8h, v2.8h     // 10 11 30 31 14 15 34 35
++        trn1            v1.8h, v1.8h, v2.8h     // 00 01 20 21 04 05 24 25
++        trn2            v2.8h, v3.8h, v4.8h     // 12 13 32 33 16 17 36 37
++        trn1            v3.8h, v3.8h, v4.8h     // 02 03 22 23 06 07 26 27
++        trn2            v4.4s, v6.4s, v2.4s     // 30 31 32 33 34 35 36 37
++        trn1            v7.4s, v1.4s, v3.4s     // 00 01 02 03 04 05 06 07
++        trn2            v1.4s, v1.4s, v3.4s     // 20 21 22 23 24 25 26 27
++        mul             v3.8h, v4.8h, v0.h[5]   //                                                           22/2 * src[24]
++        trn1            v2.4s, v6.4s, v2.4s     // 10 11 12 13 14 15 16 17
++        mul             v4.8h, v4.8h, v0.h[4]   //                                                           10/2 * src[24]
++        mul             v6.8h, v7.8h, v0.h[6]   //            17 * src[0]
++        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[16]
++        mls             v3.8h, v2.8h, v0.h[4]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
++        mla             v4.8h, v2.8h, v0.h[5]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
++        add             v0.8h, v6.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[16]
++        sub             v1.8h, v6.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[16]
++        neg             v2.8h, v3.8h            // -t4/2
++        neg             v6.8h, v4.8h            // -t3/2
++        ssra            v4.8h, v0.8h, #1        // (t1 + t3) >> 1
++        ssra            v2.8h, v1.8h, #1        // (t2 - t4) >> 1
++        ssra            v3.8h, v1.8h, #1        // (t2 + t4) >> 1
++        ssra            v6.8h, v0.8h, #1        // (t1 - t3) >> 1
++        srshr           v0.8h, v4.8h, #6        // (t1 + t3 + 64) >> 7
++        srshr           v1.8h, v2.8h, #6        // (t2 - t4 + 64) >> 7
++        srshr           v2.8h, v3.8h, #6        // (t2 + t4 + 64) >> 7
++        srshr           v3.8h, v6.8h, #6        // (t1 - t3 + 64) >> 7
++        uaddw           v0.8h, v0.8h, v5.8b     // row 0: add widened samples to the transform result
++        uaddw           v1.8h, v1.8h, v18.8b    // row 1
++        uaddw           v2.8h, v2.8h, v22.8b    // row 2
++        uaddw           v3.8h, v3.8h, v25.8b    // row 3
++        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
++        sqxtun          v1.8b, v1.8h
++        sqxtun          v2.8b, v2.8h
++        sqxtun          v3.8b, v3.8h
++        st1             {v0.8b}, [x3], x1       // store row 0
++        st1             {v1.8b}, [x3], x1       // store row 1
++        st1             {v2.8b}, [x3], x1       // store row 2
++        st1             {v3.8b}, [x3]           // store row 3
++        ret
++endfunc
++
++// VC-1 4x8 inverse transform (4 columns x 8 rows): 4-point pass along rows, then 8-point pass along columns
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order (8 rows of 4 samples are read and rewritten)
++//   x1 = row stride for 8-bit sample array
++//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x8_neon, export=1
++        mov             x3, #16                 // coefficient row stride in bytes (8 x 16-bit)
++        ldr             q0, .Lcoeffs_it8        // includes 4-point coefficients in upper half of vector
++        mov             x4, x0                  // x4 = write pointer (x0 is advanced by the sample loads)
++        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
++        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
++        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
++        ld1             {v4.d}[0], [x2], x3     // 30 31 32 33
++        ld1             {v1.d}[1], [x2], x3     // 40 41 42 43
++        ld1             {v2.d}[1], [x2], x3     // 50 51 52 53
++        ld1             {v3.d}[1], [x2], x3     // 60 61 62 63
++        ld1             {v4.d}[1], [x2]         // 70 71 72 73
++        ld1             {v5.s}[0], [x0], x1     // sample row 0
++        ld1             {v6.s}[0], [x0], x1     // sample row 1
++        ld1             {v7.s}[0], [x0], x1     // sample row 2
++        trn2            v16.8h, v1.8h, v2.8h    // 01 11 03 13 41 51 43 53
++        trn1            v1.8h, v1.8h, v2.8h     // 00 10 02 12 40 50 42 52
++        trn2            v2.8h, v3.8h, v4.8h     // 21 31 23 33 61 71 63 73
++        trn1            v3.8h, v3.8h, v4.8h     // 20 30 22 32 60 70 62 72
++        ld1             {v4.s}[0], [x0], x1     // sample row 3
++        trn2            v17.4s, v16.4s, v2.4s   // 03 13 23 33 43 53 63 73
++        trn1            v18.4s, v1.4s, v3.4s    // 00 10 20 30 40 50 60 70
++        trn1            v2.4s, v16.4s, v2.4s    // 01 11 21 31 41 51 61 71
++        mul             v16.8h, v17.8h, v0.h[4] //                                                          10/2 * src[3]
++        ld1             {v5.s}[1], [x0], x1     // sample row 4
++        mul             v17.8h, v17.8h, v0.h[5] //                                                          22/2 * src[3]
++        ld1             {v6.s}[1], [x0], x1     // sample row 5
++        trn2            v1.4s, v1.4s, v3.4s     // 02 12 22 32 42 52 62 72
++        mul             v3.8h, v18.8h, v0.h[6]  //            17 * src[0]
++        ld1             {v7.s}[1], [x0], x1     // sample row 6
++        mul             v1.8h, v1.8h, v0.h[6]   //                                            17 * src[2]
++        ld1             {v4.s}[1], [x0]         // sample row 7
++        mla             v16.8h, v2.8h, v0.h[5]  //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
++        mls             v17.8h, v2.8h, v0.h[4]  //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
++        add             v2.8h, v3.8h, v1.8h     //   t1  =    17 * src[0]                 +   17 * src[2]
++        sub             v1.8h, v3.8h, v1.8h     //   t2  =    17 * src[0]                 -   17 * src[2]
++        neg             v3.8h, v16.8h           // -t3/2
++        ssra            v16.8h, v2.8h, #1       // (t1 + t3) >> 1
++        neg             v18.8h, v17.8h          // -t4/2
++        ssra            v17.8h, v1.8h, #1       // (t2 + t4) >> 1
++        ssra            v3.8h, v2.8h, #1        // (t1 - t3) >> 1
++        ssra            v18.8h, v1.8h, #1       // (t2 - t4) >> 1
++        srshr           v1.8h, v16.8h, #2       // (t1 + t3 + 4) >> 3
++        srshr           v2.8h, v17.8h, #2       // (t2 + t4 + 4) >> 3
++        srshr           v3.8h, v3.8h, #2        // (t1 - t3 + 4) >> 3
++        srshr           v16.8h, v18.8h, #2      // (t2 - t4 + 4) >> 3
++        trn2            v17.8h, v2.8h, v3.8h    // 12 13 32 33 52 53 72 73
++        trn2            v18.8h, v1.8h, v16.8h   // 10 11 30 31 50 51 70 71
++        trn1            v1.8h, v1.8h, v16.8h    // 00 01 20 21 40 41 60 61
++        trn1            v2.8h, v2.8h, v3.8h     // 02 03 22 23 42 43 62 63
++        trn1            v3.4s, v18.4s, v17.4s   // 10 11 12 13 50 51 52 53
++        trn2            v16.4s, v18.4s, v17.4s  // 30 31 32 33 70 71 72 73
++        trn1            v17.4s, v1.4s, v2.4s    // 00 01 02 03 40 41 42 43
++        mov             d18, v3.d[1]            // 50 51 52 53
++        shl             v19.4h, v3.4h, #4       //          16 * src[8]
++        mov             d20, v16.d[1]           // 70 71 72 73
++        shl             v21.4h, v16.4h, #4      //                        16 * src[24]
++        mov             d22, v17.d[1]           // 40 41 42 43
++        shl             v23.4h, v3.4h, #2       //           4 * src[8]
++        shl             v24.4h, v18.4h, #4      //                                       16 * src[40]
++        shl             v25.4h, v20.4h, #4      //                                                      16 * src[56]
++        shl             v26.4h, v18.4h, #2      //                                        4 * src[40]
++        trn2            v1.4s, v1.4s, v2.4s     // 20 21 22 23 60 61 62 63
++        ssra            v24.4h, v21.4h, #2      //                         4 * src[24] + 16 * src[40]
++        sub             v2.4h, v25.4h, v23.4h   //        -  4 * src[8]                               + 16 * src[56]
++        shl             v17.4h, v17.4h, #2      //         8/2 * src[0]
++        sub             v21.4h, v21.4h, v26.4h  //                        16 * src[24] -  4 * src[40]
++        shl             v22.4h, v22.4h, #2      //                                      8/2 * src[32]
++        mov             d23, v1.d[1]            // 60 61 62 63
++        ssra            v19.4h, v25.4h, #2      //          16 * src[8]                               +  4 * src[56]
++        mul             v25.4h, v1.4h, v0.h[0]  //                       6/2 * src[16]
++        shl             v1.4h, v1.4h, #3        //                      16/2 * src[16]
++        mls             v24.4h, v3.4h, v0.h[2]  //        - 15 * src[8] +  4 * src[24] + 16 * src[40]
++        ssra            v17.4h, v17.4h, #1      //        12/2 * src[0]
++        mls             v21.4h, v3.4h, v0.h[1]  //        -  9 * src[8] + 16 * src[24] -  4 * src[40]
++        ssra            v22.4h, v22.4h, #1      //                                     12/2 * src[32]
++        mla             v2.4h, v16.4h, v0.h[1]  //        -  4 * src[8] +  9 * src[24]                + 16 * src[56]
++        shl             v3.4h, v23.4h, #3       //                                                    16/2 * src[48]
++        mla             v19.4h, v16.4h, v0.h[2] //          16 * src[8] + 15 * src[24]                +  4 * src[56]
++        mla             v1.4h, v23.4h, v0.h[0]  // t3/2 =               16/2 * src[16]              +  6/2 * src[48]
++        mla             v24.4h, v20.4h, v0.h[1] // -t2  = - 15 * src[8] +  4 * src[24] + 16 * src[40] +  9 * src[56]
++        add             v16.4h, v17.4h, v22.4h  // t1/2 = 12/2 * src[0]              + 12/2 * src[32]
++        sub             v3.4h, v25.4h, v3.4h    // t4/2 =                6/2 * src[16]              - 16/2 * src[48]
++        sub             v17.4h, v17.4h, v22.4h  // t2/2 = 12/2 * src[0]              - 12/2 * src[32]
++        mls             v21.4h, v20.4h, v0.h[2] // -t3  = -  9 * src[8] + 16 * src[24] -  4 * src[40] - 15 * src[56]
++        mla             v19.4h, v18.4h, v0.h[1] //  t1  =   16 * src[8] + 15 * src[24] +  9 * src[40] +  4 * src[56]
++        add             v20.4h, v16.4h, v1.4h   // t5/2 = t1/2 + t3/2
++        mls             v2.4h, v18.4h, v0.h[2]  // -t4  = -  4 * src[8] +  9 * src[24] - 15 * src[40] + 16 * src[56]
++        sub             v0.4h, v16.4h, v1.4h    // t8/2 = t1/2 - t3/2
++        add             v18.4h, v17.4h, v3.4h   // t6/2 = t2/2 + t4/2
++        sub             v22.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
++        neg             v23.4h, v24.4h          // +t2
++        sub             v25.4h, v17.4h, v3.4h   // t7/2 = t2/2 - t4/2
++        add             v3.4h, v17.4h, v3.4h    // t6/2 = t2/2 + t4/2
++        neg             v17.4h, v21.4h          // +t3
++        sub             v26.4h, v16.4h, v1.4h   // t8/2 = t1/2 - t3/2
++        add             v1.4h, v16.4h, v1.4h    // t5/2 = t1/2 + t3/2
++        neg             v16.4h, v19.4h          // -t1
++        neg             v27.4h, v2.4h           // +t4
++        ssra            v20.4h, v19.4h, #1      // (t5 + t1) >> 1
++        srsra           v0.4h, v2.4h, #1        // (t8 - t4 + 1) >> 1
++        ssra            v18.4h, v23.4h, #1      // (t6 + t2) >> 1
++        srsra           v22.4h, v21.4h, #1      // (t7 - t3 + 1) >> 1
++        ssra            v25.4h, v17.4h, #1      // (t7 + t3) >> 1
++        srsra           v3.4h, v24.4h, #1       // (t6 - t2 + 1) >> 1
++        ssra            v26.4h, v27.4h, #1      // (t8 + t4) >> 1
++        srsra           v1.4h, v16.4h, #1       // (t5 - t1 + 1) >> 1
++        trn1            v0.2d, v20.2d, v0.2d    // output rows 0 (low half) and 4 (high half)
++        trn1            v2.2d, v18.2d, v22.2d   // output rows 1 (low half) and 5 (high half)
++        trn1            v3.2d, v25.2d, v3.2d    // output rows 2 (low half) and 6 (high half)
++        trn1            v1.2d, v26.2d, v1.2d    // output rows 3 (low half) and 7 (high half)
++        srshr           v0.8h, v0.8h, #6        // (t5 + t1 + 64) >> 7, (t8 - t4 + 65) >> 7
++        srshr           v2.8h, v2.8h, #6        // (t6 + t2 + 64) >> 7, (t7 - t3 + 65) >> 7
++        srshr           v3.8h, v3.8h, #6        // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
++        srshr           v1.8h, v1.8h, #6        // (t8 + t4 + 64) >> 7, (t5 - t1 + 65) >> 7
++        uaddw           v0.8h, v0.8h, v5.8b     // rows 0, 4: add widened samples to the transform result
++        uaddw           v2.8h, v2.8h, v6.8b     // rows 1, 5
++        uaddw           v3.8h, v3.8h, v7.8b     // rows 2, 6
++        uaddw           v1.8h, v1.8h, v4.8b     // rows 3, 7
++        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
++        sqxtun          v2.8b, v2.8h
++        sqxtun          v3.8b, v3.8h
++        sqxtun          v1.8b, v1.8h
++        st1             {v0.s}[0], [x4], x1     // store row 0
++        st1             {v2.s}[0], [x4], x1     // store row 1
++        st1             {v3.s}[0], [x4], x1     // store row 2
++        st1             {v1.s}[0], [x4], x1     // store row 3
++        st1             {v0.s}[1], [x4], x1     // store row 4
++        st1             {v2.s}[1], [x4], x1     // store row 5
++        st1             {v3.s}[1], [x4], x1     // store row 6
++        st1             {v1.s}[1], [x4]         // store row 7
++        ret
++endfunc
++
++// VC-1 4x4 inverse transform: 4-point pass along rows, then 4-point pass along columns
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order (4 rows of 4 samples are read and rewritten)
++//   x1 = row stride for 8-bit sample array
++//   x2 -> array of 16-bit inverse transform coefficients, in row-major order (row stride is 8 coefficients)
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x4_neon, export=1
++        mov             x3, #16                 // coefficient row stride in bytes (8 x 16-bit)
++        ldr             d0, .Lcoeffs_it4        // v0.h[0..2] are used below as 10/2, 22/2 and 17
++        mov             x4, x0                  // x4 = write pointer (x0 is advanced by the sample loads)
++        ld1             {v1.d}[0], [x2], x3     // 00 01 02 03
++        ld1             {v2.d}[0], [x2], x3     // 10 11 12 13
++        ld1             {v3.d}[0], [x2], x3     // 20 21 22 23
++        ld1             {v4.d}[0], [x2]         // 30 31 32 33
++        ld1             {v5.s}[0], [x0], x1     // sample row 0
++        ld1             {v5.s}[1], [x0], x1     // sample row 1
++        ld1             {v6.s}[0], [x0], x1     // sample row 2
++        trn2            v7.4h, v1.4h, v2.4h     // 01 11 03 13
++        trn1            v1.4h, v1.4h, v2.4h     // 00 10 02 12
++        ld1             {v6.s}[1], [x0]         // sample row 3
++        trn2            v2.4h, v3.4h, v4.4h     // 21 31 23 33
++        trn1            v3.4h, v3.4h, v4.4h     // 20 30 22 32
++        trn2            v4.2s, v7.2s, v2.2s     // 03 13 23 33
++        trn1            v16.2s, v1.2s, v3.2s    // 00 10 20 30
++        trn1            v2.2s, v7.2s, v2.2s     // 01 11 21 31
++        trn2            v1.2s, v1.2s, v3.2s     // 02 12 22 32
++        mul             v3.4h, v4.4h, v0.h[0]   //                                                          10/2 * src[3]
++        mul             v4.4h, v4.4h, v0.h[1]   //                                                          22/2 * src[3]
++        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
++        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[2]
++        mla             v3.4h, v2.4h, v0.h[1]   //  t3/2 =                  22/2 * src[1]                 + 10/2 * src[3]
++        mls             v4.4h, v2.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[1]                 + 22/2 * src[3]
++        add             v2.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[2]
++        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[2]
++        neg             v7.4h, v3.4h            // -t3/2
++        neg             v16.4h, v4.4h           // -t4/2
++        ssra            v3.4h, v2.4h, #1        // (t1 + t3) >> 1
++        ssra            v4.4h, v1.4h, #1        // (t2 + t4) >> 1
++        ssra            v16.4h, v1.4h, #1       // (t2 - t4) >> 1
++        ssra            v7.4h, v2.4h, #1        // (t1 - t3) >> 1
++        srshr           v1.4h, v3.4h, #2        // (t1 + t3 + 4) >> 3
++        srshr           v2.4h, v4.4h, #2        // (t2 + t4 + 4) >> 3
++        srshr           v3.4h, v16.4h, #2       // (t2 - t4 + 4) >> 3
++        srshr           v4.4h, v7.4h, #2        // (t1 - t3 + 4) >> 3
++        trn2            v7.4h, v1.4h, v3.4h     // 10 11 30 31
++        trn1            v1.4h, v1.4h, v3.4h     // 00 01 20 21
++        trn2            v3.4h, v2.4h, v4.4h     // 12 13 32 33
++        trn1            v2.4h, v2.4h, v4.4h     // 02 03 22 23
++        trn2            v4.2s, v7.2s, v3.2s     // 30 31 32 33
++        trn1            v16.2s, v1.2s, v2.2s    // 00 01 02 03
++        trn1            v3.2s, v7.2s, v3.2s     // 10 11 12 13
++        trn2            v1.2s, v1.2s, v2.2s     // 20 21 22 23
++        mul             v2.4h, v4.4h, v0.h[1]   //                                                           22/2 * src[24]
++        mul             v4.4h, v4.4h, v0.h[0]   //                                                           10/2 * src[24]
++        mul             v7.4h, v16.4h, v0.h[2]  //            17 * src[0]
++        mul             v1.4h, v1.4h, v0.h[2]   //                                            17 * src[16]
++        mls             v2.4h, v3.4h, v0.h[0]   //  t4/2 =                - 10/2 * src[8]                  + 22/2 * src[24]
++        mla             v4.4h, v3.4h, v0.h[1]   //  t3/2 =                  22/2 * src[8]                  + 10/2 * src[24]
++        add             v0.4h, v7.4h, v1.4h     //   t1  =    17 * src[0]                 +   17 * src[16]
++        sub             v1.4h, v7.4h, v1.4h     //   t2  =    17 * src[0]                 -   17 * src[16]
++        neg             v3.4h, v2.4h            // -t4/2
++        neg             v7.4h, v4.4h            // -t3/2
++        ssra            v4.4h, v0.4h, #1        // (t1 + t3) >> 1
++        ssra            v3.4h, v1.4h, #1        // (t2 - t4) >> 1
++        ssra            v2.4h, v1.4h, #1        // (t2 + t4) >> 1
++        ssra            v7.4h, v0.4h, #1        // (t1 - t3) >> 1
++        trn1            v0.2d, v4.2d, v3.2d     // output rows 0 (low half) and 1 (high half)
++        trn1            v1.2d, v2.2d, v7.2d     // output rows 2 (low half) and 3 (high half)
++        srshr           v0.8h, v0.8h, #6        // (t1 + t3 + 64) >> 7, (t2 - t4 + 64) >> 7
++        srshr           v1.8h, v1.8h, #6        // (t2 + t4 + 64) >> 7, (t1 - t3 + 64) >> 7
++        uaddw           v0.8h, v0.8h, v5.8b     // rows 0, 1: add widened samples to the transform result
++        uaddw           v1.8h, v1.8h, v6.8b     // rows 2, 3
++        sqxtun          v0.8b, v0.8h            // narrow to 8 bits with unsigned saturation
++        sqxtun          v1.8b, v1.8h
++        st1             {v0.s}[0], [x4], x1     // store row 0
++        st1             {v0.s}[1], [x4], x1     // store row 1
++        st1             {v1.s}[0], [x4], x1     // store row 2
++        st1             {v1.s}[1], [x4]         // store row 3
++        ret
++endfunc
++
++// VC-1 8x8 inverse transform, DC case: adds (3 * ((3 * dc + 1) >> 1) + 16) >> 5 to each of 8 rows of 8 samples
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x8_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended from 16 bits
++        mov             x3, x0                  // x3 -> row 0, kept for the write-back pass
++        ld1             {v0.8b}, [x0], x1       // row 0
++        ld1             {v1.8b}, [x0], x1       // row 1
++        ld1             {v2.8b}, [x0], x1       // row 2
++        add             w2, w2, w2, lsl #1      // 3 * dc
++        ld1             {v3.8b}, [x0], x1       // row 3
++        ld1             {v4.8b}, [x0], x1       // row 4
++        add             w2, w2, #1              // 3 * dc + 1
++        ld1             {v5.8b}, [x0], x1       // row 5
++        asr             w2, w2, #1              // dc1 = (3 * dc + 1) >> 1
++        ld1             {v6.8b}, [x0], x1       // row 6
++        add             w2, w2, w2, lsl #1      // 3 * dc1
++        ld1             {v7.8b}, [x0]           // row 7 (last load, x0 is free from here on)
++        add             w0, w2, #16             // 3 * dc1 + 16
++        asr             w0, w0, #5              // dc2 = (3 * dc1 + 16) >> 5
++        dup             v16.8h, w0              // broadcast dc2 to all 8 lanes
++        uaddw           v0.8h, v16.8h, v0.8b    // row 0 + dc2, widened to 16 bits
++        uaddw           v1.8h, v16.8h, v1.8b    // row 1 + dc2
++        uaddw           v2.8h, v16.8h, v2.8b    // row 2 + dc2
++        uaddw           v3.8h, v16.8h, v3.8b    // row 3 + dc2
++        uaddw           v4.8h, v16.8h, v4.8b    // row 4 + dc2
++        uaddw           v5.8h, v16.8h, v5.8b    // row 5 + dc2
++        sqxtun          v0.8b, v0.8h            // row 0 narrowed with unsigned saturation
++        uaddw           v6.8h, v16.8h, v6.8b    // row 6 + dc2
++        sqxtun          v1.8b, v1.8h            // row 1 narrowed
++        uaddw           v7.8h, v16.8h, v7.8b    // row 7 + dc2
++        sqxtun          v2.8b, v2.8h            // row 2 narrowed
++        sqxtun          v3.8b, v3.8h            // row 3 narrowed
++        sqxtun          v4.8b, v4.8h            // row 4 narrowed
++        st1             {v0.8b}, [x3], x1       // store row 0
++        sqxtun          v0.8b, v5.8h            // row 5 narrowed (v0 reused after its store)
++        st1             {v1.8b}, [x3], x1       // store row 1
++        sqxtun          v1.8b, v6.8h            // row 6 narrowed (v1 reused after its store)
++        st1             {v2.8b}, [x3], x1       // store row 2
++        sqxtun          v2.8b, v7.8h            // row 7 narrowed (v2 reused after its store)
++        st1             {v3.8b}, [x3], x1       // store row 3
++        st1             {v4.8b}, [x3], x1       // store row 4
++        st1             {v0.8b}, [x3], x1       // store row 5
++        st1             {v1.8b}, [x3], x1       // store row 6
++        st1             {v2.8b}, [x3]           // store row 7
++        ret
++endfunc
++
++// VC-1 8x4 inverse transform, DC case: adds (17 * ((3 * dc + 1) >> 1) + 64) >> 7 to each of 4 rows of 8 samples
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_8x4_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended from 16 bits
++        mov             x3, x0                  // x3 -> row 0, kept for the write-back pass
++        ld1             {v0.8b}, [x0], x1       // row 0
++        ld1             {v1.8b}, [x0], x1       // row 1
++        ld1             {v2.8b}, [x0], x1       // row 2
++        add             w2, w2, w2, lsl #1      // 3 * dc
++        ld1             {v3.8b}, [x0]           // row 3 (last load, x0 is free from here on)
++        add             w0, w2, #1              // 3 * dc + 1
++        asr             w0, w0, #1              // dc1 = (3 * dc + 1) >> 1
++        add             w0, w0, w0, lsl #4      // 17 * dc1
++        add             w0, w0, #64             // 17 * dc1 + 64
++        asr             w0, w0, #7              // dc2 = (17 * dc1 + 64) >> 7
++        dup             v4.8h, w0               // broadcast dc2 to all 8 lanes
++        uaddw           v0.8h, v4.8h, v0.8b     // row 0 + dc2, widened to 16 bits
++        uaddw           v1.8h, v4.8h, v1.8b     // row 1 + dc2
++        uaddw           v2.8h, v4.8h, v2.8b     // row 2 + dc2
++        uaddw           v3.8h, v4.8h, v3.8b     // row 3 + dc2
++        sqxtun          v0.8b, v0.8h            // narrow each row with unsigned saturation
++        sqxtun          v1.8b, v1.8h
++        sqxtun          v2.8b, v2.8h
++        sqxtun          v3.8b, v3.8h
++        st1             {v0.8b}, [x3], x1       // store row 0
++        st1             {v1.8b}, [x3], x1       // store row 1
++        st1             {v2.8b}, [x3], x1       // store row 2
++        st1             {v3.8b}, [x3]           // store row 3
++        ret
++endfunc
++
++// VC-1 4x8 inverse transform, DC case: adds (3 * ((17 * dc + 4) >> 3) + 16) >> 5 to each of 8 rows of 4 samples
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x8_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended from 16 bits
++        mov             x3, x0                  // x3 -> row 0, kept for the write-back pass
++        ld1             {v0.s}[0], [x0], x1     // row 0 -> low half of v0
++        ld1             {v1.s}[0], [x0], x1     // row 1 -> low half of v1
++        ld1             {v2.s}[0], [x0], x1     // row 2 -> low half of v2
++        add             w2, w2, w2, lsl #4      // 17 * dc
++        ld1             {v3.s}[0], [x0], x1     // row 3 -> low half of v3
++        add             w2, w2, #4              // 17 * dc + 4
++        asr             w2, w2, #3              // dc1 = (17 * dc + 4) >> 3
++        add             w2, w2, w2, lsl #1      // 3 * dc1
++        ld1             {v0.s}[1], [x0], x1     // row 4 -> high half of v0
++        add             w2, w2, #16             // 3 * dc1 + 16
++        asr             w2, w2, #5              // dc2 = (3 * dc1 + 16) >> 5
++        dup             v4.8h, w2               // broadcast dc2 to all 8 lanes
++        ld1             {v1.s}[1], [x0], x1     // row 5 -> high half of v1
++        ld1             {v2.s}[1], [x0], x1     // row 6 -> high half of v2
++        ld1             {v3.s}[1], [x0]         // row 7 -> high half of v3
++        uaddw           v0.8h, v4.8h, v0.8b     // rows 0 and 4 + dc2, widened to 16 bits
++        uaddw           v1.8h, v4.8h, v1.8b     // rows 1 and 5 + dc2
++        uaddw           v2.8h, v4.8h, v2.8b     // rows 2 and 6 + dc2
++        uaddw           v3.8h, v4.8h, v3.8b     // rows 3 and 7 + dc2
++        sqxtun          v0.8b, v0.8h            // narrow each row pair with unsigned saturation
++        sqxtun          v1.8b, v1.8h
++        sqxtun          v2.8b, v2.8h
++        sqxtun          v3.8b, v3.8h
++        st1             {v0.s}[0], [x3], x1     // store row 0
++        st1             {v1.s}[0], [x3], x1     // store row 1
++        st1             {v2.s}[0], [x3], x1     // store row 2
++        st1             {v3.s}[0], [x3], x1     // store row 3
++        st1             {v0.s}[1], [x3], x1     // store row 4
++        st1             {v1.s}[1], [x3], x1     // store row 5
++        st1             {v2.s}[1], [x3], x1     // store row 6
++        st1             {v3.s}[1], [x3]         // store row 7
++        ret
++endfunc
++
++// VC-1 4x4 inverse transform, DC case: adds (17 * ((17 * dc + 4) >> 3) + 64) >> 7 to each of 4 rows of 4 samples
++// On entry:
++//   x0 -> array of 8-bit samples, in row-major order
++//   x1 = row stride for 8-bit sample array
++//   x2 -> 16-bit inverse transform DC coefficient
++// On exit:
++//   array at x0 updated by saturated addition of (narrowed) transformed block
++function ff_vc1_inv_trans_4x4_dc_neon, export=1
++        ldrsh           w2, [x2]                // dc, sign-extended from 16 bits
++        mov             x3, x0                  // x3 -> row 0, kept for the write-back pass
++        ld1             {v0.s}[0], [x0], x1     // row 0 -> low half of v0
++        ld1             {v1.s}[0], [x0], x1     // row 1 -> low half of v1
++        ld1             {v0.s}[1], [x0], x1     // row 2 -> high half of v0
++        add             w2, w2, w2, lsl #4      // 17 * dc
++        ld1             {v1.s}[1], [x0]         // row 3 -> high half of v1 (last load, x0 is free from here on)
++        add             w0, w2, #4              // 17 * dc + 4
++        asr             w0, w0, #3              // dc1 = (17 * dc + 4) >> 3
++        add             w0, w0, w0, lsl #4      // 17 * dc1
++        add             w0, w0, #64             // 17 * dc1 + 64
++        asr             w0, w0, #7              // dc2 = (17 * dc1 + 64) >> 7
++        dup             v2.8h, w0               // broadcast dc2 to all 8 lanes
++        uaddw           v0.8h, v2.8h, v0.8b     // rows 0 and 2 + dc2, widened to 16 bits
++        uaddw           v1.8h, v2.8h, v1.8b     // rows 1 and 3 + dc2
++        sqxtun          v0.8b, v0.8h            // narrow with unsigned saturation
++        sqxtun          v1.8b, v1.8h
++        st1             {v0.s}[0], [x3], x1     // store row 0
++        st1             {v1.s}[0], [x3], x1     // store row 1
++        st1             {v0.s}[1], [x3], x1     // store row 2
++        st1             {v1.s}[1], [x3]         // store row 3
++        ret
++endfunc
++
++.align  5                      // 32-byte boundary (.align takes a power of two on AArch64)
++.Lcoeffs_it8:                  // NOTE(review): users of this table are outside the visible range
++.quad   0x000F00090003         // 16-bit lanes, lowest first: 3, 9, 15, 0
++.Lcoeffs_it4:                  // NOTE(review): presumably the 4-point transform multipliers (10/2, 22/2, 17) - confirm against its users
++.quad   0x0011000B0005         // 16-bit lanes, lowest first: 5, 11, 17, 0
++.Lcoeffs:                      // multipliers loaded into v0 by the loop filters below
++.quad   0x00050002             // 16-bit lanes, lowest first: 2, 5, 0, 0
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of lower block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++        sub             x3, x0, w1, sxtw #2     // x3 -> P1 row, 4 rows above x0. NOTE(review): stride is sign-extended from w1 here but used as full x1 below - confirm callers pass a sign-extended value
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.s}[0], [x0], x1     // P5
++        ld1             {v2.s}[0], [x3], x1     // P1
++        ld1             {v3.s}[0], [x3], x1     // P2
++        ld1             {v4.s}[0], [x0], x1     // P6
++        ld1             {v5.s}[0], [x3], x1     // P3
++        ld1             {v6.s}[0], [x0], x1     // P7
++        ld1             {v7.s}[0], [x3]         // P4
++        ld1             {v16.s}[0], [x0]        // P8
++        ushll           v17.8h, v1.8b, #1       // 2*P5
++        dup             v18.8h, w2              // pq
++        ushll           v2.8h, v2.8b, #1        // 2*P1
++        uxtl            v3.8h, v3.8b            // P2
++        uxtl            v4.8h, v4.8b            // P6
++        uxtl            v19.8h, v5.8b           // P3
++        mls             v2.4h, v3.4h, v0.h[1]   // 2*P1-5*P2
++        uxtl            v3.8h, v6.8b            // P7
++        mls             v17.4h, v4.4h, v0.h[1]  // 2*P5-5*P6
++        ushll           v5.8h, v5.8b, #1        // 2*P3
++        uxtl            v6.8h, v7.8b            // P4
++        mla             v17.4h, v3.4h, v0.h[1]  // 2*P5-5*P6+5*P7
++        uxtl            v3.8h, v16.8b           // P8
++        mla             v2.4h, v19.4h, v0.h[1]  // 2*P1-5*P2+5*P3
++        uxtl            v1.8h, v1.8b            // P5
++        mls             v5.4h, v6.4h, v0.h[1]   // 2*P3-5*P4
++        mls             v17.4h, v3.4h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
++        sub             v3.4h, v6.4h, v1.4h     // P4-P5
++        mls             v2.4h, v6.4h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
++        mla             v5.4h, v1.4h, v0.h[1]   // 2*P3-5*P4+5*P5
++        mls             v5.4h, v4.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        abs             v4.4h, v3.4h            // |P4-P5|
++        srshr           v7.4h, v17.4h, #3       // (2*P5-5*P6+5*P7-2*P8+4)>>3
++        srshr           v2.4h, v2.4h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
++        sshr            v4.4h, v4.4h, #1        // clip
++        srshr           v5.4h, v5.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        abs             v7.4h, v7.4h            // a2
++        sshr            v3.4h, v3.4h, #8        // clip_sign
++        abs             v2.4h, v2.4h            // a1
++        cmeq            v16.4h, v4.4h, #0       // test clip == 0
++        abs             v17.4h, v5.4h           // a0
++        sshr            v5.4h, v5.4h, #8        // a0_sign
++        cmhs            v19.4h, v2.4h, v7.4h    // test a1 >= a2
++        cmhs            v18.4h, v17.4h, v18.4h  // test a0 >= pq
++        sub             v3.4h, v3.4h, v5.4h     // clip_sign - a0_sign
++        bsl             v19.8b, v7.8b, v2.8b    // a3
++        orr             v2.8b, v16.8b, v18.8b   // test clip == 0 || a0 >= pq
++        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
++        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.8b, v2.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
++        mov             w0, v5.s[1]             // move to gp reg (32-bit lane 1: bit 0 is the mask for pixel pair 2)
++        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        cmhs            v5.4h, v0.4h, v4.4h     // test d >= clip
++        tbnz            w0, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
++        bsl             v5.8b, v4.8b, v0.8b     // FFMIN(d, clip)
++        bic             v0.8b, v5.8b, v2.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        mls             v6.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        mla             v1.4h, v0.4h, v3.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        sqxtun          v0.8b, v6.8h            // P4 narrowed with unsigned saturation
++        sqxtun          v1.8b, v1.8h            // P5 narrowed with unsigned saturation
++        st1             {v0.s}[0], [x3], x1     // store P4 (x3 was left at the P4 row by the last load through it)
++        st1             {v1.s}[0], [x3]         // store P5
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of right block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++        sub             x3, x0, #4              // where to start reading
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.8b}, [x3], x1       // row 0: P1[0], P2[0]... P8[0]
++        sub             x0, x0, #1              // where to start writing
++        ld1             {v2.8b}, [x3], x1       // row 1
++        ld1             {v3.8b}, [x3], x1       // row 2
++        ld1             {v4.8b}, [x3]           // row 3
++        dup             v5.8h, w2               // pq
++        trn1            v6.8b, v1.8b, v2.8b     // P1[0], P1[1], P3[0]...
++        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
++        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
++        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
++        trn1            v4.4h, v6.4h, v2.4h     // P1, P5
++        trn1            v7.4h, v1.4h, v3.4h     // P2, P6
++        trn2            v2.4h, v6.4h, v2.4h     // P3, P7
++        trn2            v1.4h, v1.4h, v3.4h     // P4, P8
++        ushll           v3.8h, v4.8b, #1        // 2*P1, 2*P5
++        uxtl            v6.8h, v7.8b            // P2, P6
++        uxtl            v7.8h, v2.8b            // P3, P7
++        uxtl            v1.8h, v1.8b            // P4, P8
++        mls             v3.8h, v6.8h, v0.h[1]   // 2*P1-5*P2, 2*P5-5*P6
++        ushll           v2.8h, v2.8b, #1        // 2*P3, 2*P7
++        uxtl            v4.8h, v4.8b            // P1, P5
++        mla             v3.8h, v7.8h, v0.h[1]   // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++        mov             d6, v6.d[1]             // P6
++        mls             v3.8h, v1.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++        mov             d4, v4.d[1]             // P5
++        mls             v2.4h, v1.4h, v0.h[1]   // 2*P3-5*P4
++        mla             v2.4h, v4.4h, v0.h[1]   // 2*P3-5*P4+5*P5
++        sub             v7.4h, v1.4h, v4.4h     // P4-P5
++        mls             v2.4h, v6.4h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        srshr           v3.8h, v3.8h, #3        // rounding >>3 of both sums (low half -> a1, high half -> a2)
++        abs             v6.4h, v7.4h            // |P4-P5|
++        sshr            v7.4h, v7.4h, #8        // clip_sign
++        srshr           v2.4h, v2.4h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        abs             v3.8h, v3.8h            // a1, a2
++        sshr            v6.4h, v6.4h, #1        // clip
++        mov             d16, v3.d[1]            // a2
++        abs             v17.4h, v2.4h           // a0
++        cmeq            v18.4h, v6.4h, #0       // test clip == 0
++        sshr            v2.4h, v2.4h, #8        // a0_sign
++        cmhs            v19.4h, v3.4h, v16.4h   // test a1 >= a2
++        cmhs            v5.4h, v17.4h, v5.4h    // test a0 >= pq
++        sub             v2.4h, v7.4h, v2.4h     // clip_sign - a0_sign
++        bsl             v19.8b, v16.8b, v3.8b   // a3
++        orr             v3.8b, v18.8b, v5.8b    // test clip == 0 || a0 >= pq
++        uqsub           v5.4h, v17.4h, v19.4h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v7.4h, v19.4h, v17.4h   // test a3 >= a0
++        mul             v0.4h, v5.4h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.8b, v3.8b, v7.8b     // test clip == 0 || a0 >= pq || a3 >= a0
++        mov             w2, v5.s[1]             // move to gp reg (32-bit lane 1: bit 0 is the mask for pixel pair 2)
++        ushr            v0.4h, v0.4h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        cmhs            v5.4h, v0.4h, v6.4h     // test d >= clip
++        tbnz            w2, #0, 1f              // none of the 4 pixel pairs should be updated if this one is not filtered
++        bsl             v5.8b, v6.8b, v0.8b     // FFMIN(d, clip)
++        bic             v0.8b, v5.8b, v3.8b     // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        mla             v4.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        mls             v1.4h, v0.4h, v2.4h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        sqxtun          v3.8b, v4.8h            // P5 narrowed with unsigned saturation
++        sqxtun          v2.8b, v1.8h            // P4 narrowed with unsigned saturation
++        st2             {v2.b, v3.b}[0], [x0], x1 // row 0: store P4, P5 as adjacent bytes
++        st2             {v2.b, v3.b}[1], [x0], x1 // row 1
++        st2             {v2.b, v3.b}[2], [x0], x1 // row 2
++        st2             {v2.b, v3.b}[3], [x0]   // row 3
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of lower block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++        sub             x3, x0, w1, sxtw #2     // x3 -> P1 row, 4 rows above x0. NOTE(review): stride is sign-extended from w1 here but used as full x1 below - confirm callers pass a sign-extended value
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.8b}, [x0], x1       // P5
++        movi            v2.2d, #0x0000ffff00000000 // mask selecting 16-bit lane 2 within each 64-bit half
++        ld1             {v3.8b}, [x3], x1       // P1
++        ld1             {v4.8b}, [x3], x1       // P2
++        ld1             {v5.8b}, [x0], x1       // P6
++        ld1             {v6.8b}, [x3], x1       // P3
++        ld1             {v7.8b}, [x0], x1       // P7
++        ushll           v16.8h, v1.8b, #1       // 2*P5
++        ushll           v3.8h, v3.8b, #1        // 2*P1
++        ld1             {v17.8b}, [x3]          // P4
++        uxtl            v4.8h, v4.8b            // P2
++        ld1             {v18.8b}, [x0]          // P8
++        uxtl            v5.8h, v5.8b            // P6
++        dup             v19.8h, w2              // pq
++        uxtl            v20.8h, v6.8b           // P3
++        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1-5*P2
++        uxtl            v4.8h, v7.8b            // P7
++        ushll           v6.8h, v6.8b, #1        // 2*P3
++        mls             v16.8h, v5.8h, v0.h[1]  // 2*P5-5*P6
++        uxtl            v7.8h, v17.8b           // P4
++        uxtl            v17.8h, v18.8b          // P8
++        mla             v16.8h, v4.8h, v0.h[1]  // 2*P5-5*P6+5*P7
++        uxtl            v1.8h, v1.8b            // P5
++        mla             v3.8h, v20.8h, v0.h[1]  // 2*P1-5*P2+5*P3
++        sub             v4.8h, v7.8h, v1.8h     // P4-P5
++        mls             v6.8h, v7.8h, v0.h[1]   // 2*P3-5*P4
++        mls             v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
++        abs             v17.8h, v4.8h           // |P4-P5|
++        sshr            v4.8h, v4.8h, #8        // clip_sign
++        mls             v3.8h, v7.8h, v0.h[0]   // 2*P1-5*P2+5*P3-2*P4
++        sshr            v17.8h, v17.8h, #1      // clip
++        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3-5*P4+5*P5
++        srshr           v16.8h, v16.8h, #3      // (2*P5-5*P6+5*P7-2*P8+4)>>3
++        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        cmeq            v5.8h, v17.8h, #0       // test clip == 0
++        srshr           v3.8h, v3.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
++        abs             v16.8h, v16.8h          // a2
++        abs             v3.8h, v3.8h            // a1
++        srshr           v6.8h, v6.8h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        cmhs            v18.8h, v3.8h, v16.8h   // test a1 >= a2
++        abs             v20.8h, v6.8h           // a0
++        sshr            v6.8h, v6.8h, #8        // a0_sign
++        bsl             v18.16b, v16.16b, v3.16b // a3
++        cmhs            v3.8h, v20.8h, v19.8h   // test a0 >= pq
++        sub             v4.8h, v4.8h, v6.8h     // clip_sign - a0_sign
++        uqsub           v6.8h, v20.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v16.8h, v18.8h, v20.8h  // test a3 >= a0
++        orr             v3.16b, v5.16b, v3.16b  // test clip == 0 || a0 >= pq
++        mul             v0.8h, v6.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
++        cmtst           v2.2d, v5.2d, v2.2d     // if pixel pair 2 (counting from 0) of a group of 4 is not filtered, then none of the others in the group should be either
++        mov             w0, v5.s[1]             // move to gp reg (bit 0 is the mask for pixel pair 2 of the first group)
++        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        mov             w2, v5.s[3]             // same for the second group (bit 0 is the mask for pixel pair 6)
++        orr             v2.16b, v3.16b, v2.16b  // per-pair skip mask: clip == 0 || a0 >= pq || whole group disabled
++        cmhs            v3.8h, v0.8h, v17.8h    // test d >= clip
++        and             w0, w0, w2              // bit 0 set only if both groups are disabled
++        bsl             v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
++        tbnz            w0, #0, 1f              // none of the 8 pixel pairs should be updated in this case
++        bic             v0.16b, v3.16b, v2.16b  // set each d to zero if it should not be filtered
++        mls             v7.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        mla             v1.8h, v0.8h, v4.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        sqxtun          v0.8b, v7.8h            // P4 narrowed with unsigned saturation
++        sqxtun          v1.8b, v1.8h            // P5 narrowed with unsigned saturation
++        st1             {v0.8b}, [x3], x1       // store P4 (x3 was left at the P4 row by the last load through it)
++        st1             {v1.8b}, [x3]           // store P5
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of right block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++        sub             x3, x0, #4              // where to start reading
++        ldr             d0, .Lcoeffs            // v0.h[0] = 2, v0.h[1] = 5
++        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
++        sub             x0, x0, #1              // where to start writing
++        ld1             {v2.8b}, [x3], x1       // P1[1], P2[1]...
++        add             x4, x0, x1, lsl #2      // where to start writing rows 4..7
++        ld1             {v3.8b}, [x3], x1       // P1[2], P2[2]...
++        ld1             {v4.8b}, [x3], x1       // P1[3], P2[3]...
++        ld1             {v5.8b}, [x3], x1       // P1[4], P2[4]...
++        ld1             {v6.8b}, [x3], x1       // P1[5], P2[5]...
++        ld1             {v7.8b}, [x3], x1       // P1[6], P2[6]...
++        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
++        ld1             {v17.8b}, [x3]          // P1[7], P2[7]...
++        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
++        trn1            v2.8b, v3.8b, v4.8b     // P1[2], P1[3], P3[2]...
++        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
++        dup             v4.8h, w2               // pq
++        trn1            v18.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
++        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
++        trn1            v6.4h, v16.4h, v2.4h    // P1[0], P1[1], P1[2], P1[3], P5[0]...
++        trn1            v19.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
++        trn1            v20.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
++        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
++        trn2            v2.4h, v16.4h, v2.4h    // P3[0], P3[1], P3[2], P3[3], P7[0]...
++        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
++        trn1            v3.4h, v18.4h, v20.4h   // P1[4], P1[5], P1[6], P1[7], P5[4]...
++        trn1            v16.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
++        trn2            v17.4h, v18.4h, v20.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
++        trn2            v5.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
++        trn1            v7.2s, v6.2s, v3.2s     // P1
++        trn1            v18.2s, v19.2s, v16.2s  // P2
++        trn2            v3.2s, v6.2s, v3.2s     // P5
++        trn2            v6.2s, v19.2s, v16.2s   // P6
++        trn1            v16.2s, v2.2s, v17.2s   // P3
++        trn2            v2.2s, v2.2s, v17.2s    // P7
++        ushll           v7.8h, v7.8b, #1        // 2*P1
++        trn1            v17.2s, v1.2s, v5.2s    // P4
++        ushll           v19.8h, v3.8b, #1       // 2*P5
++        trn2            v1.2s, v1.2s, v5.2s     // P8
++        uxtl            v5.8h, v18.8b           // P2
++        uxtl            v6.8h, v6.8b            // P6
++        uxtl            v18.8h, v16.8b          // P3
++        mls             v7.8h, v5.8h, v0.h[1]   // 2*P1-5*P2
++        uxtl            v2.8h, v2.8b            // P7
++        ushll           v5.8h, v16.8b, #1       // 2*P3
++        mls             v19.8h, v6.8h, v0.h[1]  // 2*P5-5*P6
++        uxtl            v16.8h, v17.8b          // P4
++        uxtl            v1.8h, v1.8b            // P8
++        mla             v19.8h, v2.8h, v0.h[1]  // 2*P5-5*P6+5*P7
++        uxtl            v2.8h, v3.8b            // P5
++        mla             v7.8h, v18.8h, v0.h[1]  // 2*P1-5*P2+5*P3
++        sub             v3.8h, v16.8h, v2.8h    // P4-P5
++        mls             v5.8h, v16.8h, v0.h[1]  // 2*P3-5*P4
++        mls             v19.8h, v1.8h, v0.h[0]  // 2*P5-5*P6+5*P7-2*P8
++        abs             v1.8h, v3.8h            // |P4-P5|
++        sshr            v3.8h, v3.8h, #8        // clip_sign
++        mls             v7.8h, v16.8h, v0.h[0]  // 2*P1-5*P2+5*P3-2*P4
++        sshr            v1.8h, v1.8h, #1        // clip
++        mla             v5.8h, v2.8h, v0.h[1]   // 2*P3-5*P4+5*P5
++        srshr           v17.8h, v19.8h, #3      // (2*P5-5*P6+5*P7-2*P8+4)>>3
++        mls             v5.8h, v6.8h, v0.h[0]   // 2*P3-5*P4+5*P5-2*P6
++        cmeq            v6.8h, v1.8h, #0        // test clip == 0
++        srshr           v7.8h, v7.8h, #3        // (2*P1-5*P2+5*P3-2*P4+4)>>3
++        abs             v17.8h, v17.8h          // a2
++        abs             v7.8h, v7.8h            // a1
++        srshr           v5.8h, v5.8h, #3        // (2*P3-5*P4+5*P5-2*P6+4)>>3
++        cmhs            v18.8h, v7.8h, v17.8h   // test a1 >= a2
++        abs             v19.8h, v5.8h           // a0
++        sshr            v5.8h, v5.8h, #8        // a0_sign
++        bsl             v18.16b, v17.16b, v7.16b // a3
++        cmhs            v4.8h, v19.8h, v4.8h    // test a0 >= pq
++        sub             v3.8h, v3.8h, v5.8h     // clip_sign - a0_sign
++        uqsub           v5.8h, v19.8h, v18.8h   // a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v7.8h, v18.8h, v19.8h   // test a3 >= a0
++        orr             v4.16b, v6.16b, v4.16b  // test clip == 0 || a0 >= pq
++        mul             v0.8h, v5.8h, v0.h[1]   // a0 >= a3 ? 5*(a0-a3) : 0
++        orr             v5.16b, v4.16b, v7.16b  // test clip == 0 || a0 >= pq || a3 >= a0
++        mov             w2, v5.s[1]             // move to gp reg (bit 0 is the mask for pixel pair 2 of the first group)
++        ushr            v0.8h, v0.8h, #3        // a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        mov             w3, v5.s[3]             // same for the second group (bit 0 is the mask for pixel pair 6)
++        cmhs            v5.8h, v0.8h, v1.8h     // test d >= clip
++        and             w5, w2, w3              // bit 0 set only if both groups are disabled
++        bsl             v5.16b, v1.16b, v0.16b  // FFMIN(d, clip)
++        tbnz            w5, #0, 2f              // none of the 8 pixel pairs should be updated in this case
++        bic             v0.16b, v5.16b, v4.16b  // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        mla             v2.8h, v0.8h, v3.8h     // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        mls             v16.8h, v0.8h, v3.8h    // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        sqxtun          v1.8b, v2.8h            // P5 narrowed with unsigned saturation
++        sqxtun          v0.8b, v16.8h           // P4 narrowed with unsigned saturation
++        tbnz            w2, #0, 1f              // none of the first 4 pixel pairs should be updated if so
++        st2             {v0.b, v1.b}[0], [x0], x1 // row 0: store P4, P5 as adjacent bytes
++        st2             {v0.b, v1.b}[1], [x0], x1 // row 1
++        st2             {v0.b, v1.b}[2], [x0], x1 // row 2
++        st2             {v0.b, v1.b}[3], [x0]   // row 3
++1:      tbnz            w3, #0, 2f              // none of the second 4 pixel pairs should be updated if so
++        st2             {v0.b, v1.b}[4], [x4], x1 // row 4
++        st2             {v0.b, v1.b}[5], [x4], x1 // row 5
++        st2             {v0.b, v1.b}[6], [x4], x1 // row 6
++        st2             {v0.b, v1.b}[7], [x4]   // row 7
++2:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of lower block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++        sub             x3, x0, w1, sxtw #2
++        ldr             d0, .Lcoeffs
++        ld1             {v1.16b}, [x0], x1      // P5
++        movi            v2.2d, #0x0000ffff00000000
++        ld1             {v3.16b}, [x3], x1      // P1
++        ld1             {v4.16b}, [x3], x1      // P2
++        ld1             {v5.16b}, [x0], x1      // P6
++        ld1             {v6.16b}, [x3], x1      // P3
++        ld1             {v7.16b}, [x0], x1      // P7
++        ushll           v16.8h, v1.8b, #1       // 2*P5[0..7]
++        ushll           v17.8h, v3.8b, #1       // 2*P1[0..7]
++        ld1             {v18.16b}, [x3]         // P4
++        uxtl            v19.8h, v4.8b           // P2[0..7]
++        ld1             {v20.16b}, [x0]         // P8
++        uxtl            v21.8h, v5.8b           // P6[0..7]
++        dup             v22.8h, w2              // pq
++        ushll2          v3.8h, v3.16b, #1       // 2*P1[8..15]
++        mls             v17.8h, v19.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]
++        ushll2          v19.8h, v1.16b, #1      // 2*P5[8..15]
++        uxtl2           v4.8h, v4.16b           // P2[8..15]
++        mls             v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
++        uxtl2           v5.8h, v5.16b           // P6[8..15]
++        uxtl            v23.8h, v6.8b           // P3[0..7]
++        uxtl            v24.8h, v7.8b           // P7[0..7]
++        mls             v3.8h, v4.8h, v0.h[1]   // 2*P1[8..15]-5*P2[8..15]
++        ushll           v4.8h, v6.8b, #1        // 2*P3[0..7]
++        uxtl            v25.8h, v18.8b          // P4[0..7]
++        mls             v19.8h, v5.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]
++        uxtl2           v26.8h, v6.16b          // P3[8..15]
++        mla             v17.8h, v23.8h, v0.h[1] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        uxtl2           v7.8h, v7.16b           // P7[8..15]
++        ushll2          v6.8h, v6.16b, #1       // 2*P3[8..15]
++        mla             v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        uxtl2           v18.8h, v18.16b         // P4[8..15]
++        uxtl            v23.8h, v20.8b          // P8[0..7]
++        mls             v4.8h, v25.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
++        uxtl            v24.8h, v1.8b           // P5[0..7]
++        uxtl2           v20.8h, v20.16b         // P8[8..15]
++        mla             v3.8h, v26.8h, v0.h[1]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        uxtl2           v1.8h, v1.16b           // P5[8..15]
++        sub             v26.8h, v25.8h, v24.8h  // P4[0..7]-P5[0..7]
++        mla             v19.8h, v7.8h, v0.h[1]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        sub             v7.8h, v18.8h, v1.8h    // P4[8..15]-P5[8..15]
++        mls             v6.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]
++        abs             v27.8h, v26.8h
++        sshr            v26.8h, v26.8h, #8      // clip_sign[0..7]
++        mls             v17.8h, v25.8h, v0.h[0] // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        abs             v28.8h, v7.8h
++        sshr            v27.8h, v27.8h, #1      // clip[0..7]
++        mls             v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        sshr            v7.8h, v7.8h, #8        // clip_sign[8..15]
++        sshr            v23.8h, v28.8h, #1      // clip[8..15]
++        mla             v4.8h, v24.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        cmeq            v28.8h, v27.8h, #0      // test clip[0..7] == 0
++        srshr           v17.8h, v17.8h, #3
++        mls             v3.8h, v18.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        cmeq            v29.8h, v23.8h, #0      // test clip[8..15] == 0
++        srshr           v16.8h, v16.8h, #3
++        mls             v19.8h, v20.8h, v0.h[0] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        abs             v17.8h, v17.8h          // a1[0..7]
++        mla             v6.8h, v1.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        srshr           v3.8h, v3.8h, #3
++        mls             v4.8h, v21.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        abs             v16.8h, v16.8h          // a2[0..7]
++        srshr           v19.8h, v19.8h, #3
++        mls             v6.8h, v5.8h, v0.h[0]   // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        cmhs            v5.8h, v17.8h, v16.8h   // test a1[0..7] >= a2[0..7]
++        abs             v3.8h, v3.8h            // a1[8..15]
++        srshr           v4.8h, v4.8h, #3
++        abs             v19.8h, v19.8h          // a2[8..15]
++        bsl             v5.16b, v16.16b, v17.16b // a3[0..7]
++        srshr           v6.8h, v6.8h, #3
++        cmhs            v16.8h, v3.8h, v19.8h   // test a1[8..15] >= a2[8.15]
++        abs             v17.8h, v4.8h           // a0[0..7]
++        sshr            v4.8h, v4.8h, #8        // a0_sign[0..7]
++        bsl             v16.16b, v19.16b, v3.16b // a3[8..15]
++        uqsub           v3.8h, v17.8h, v5.8h    // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        abs             v19.8h, v6.8h           // a0[8..15]
++        cmhs            v20.8h, v17.8h, v22.8h  // test a0[0..7] >= pq
++        cmhs            v5.8h, v5.8h, v17.8h    // test a3[0..7] >= a0[0..7]
++        sub             v4.8h, v26.8h, v4.8h    // clip_sign[0..7] - a0_sign[0..7]
++        sshr            v6.8h, v6.8h, #8        // a0_sign[8..15]
++        mul             v3.8h, v3.8h, v0.h[1]   // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        uqsub           v17.8h, v19.8h, v16.8h  // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        orr             v20.16b, v28.16b, v20.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++        cmhs            v21.8h, v19.8h, v22.8h  // test a0[8..15] >= pq
++        cmhs            v16.8h, v16.8h, v19.8h  // test a3[8..15] >= a0[8..15]
++        mul             v0.8h, v17.8h, v0.h[1]  // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        sub             v6.8h, v7.8h, v6.8h     // clip_sign[8..15] - a0_sign[8..15]
++        orr             v5.16b, v20.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        ushr            v3.8h, v3.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        orr             v7.16b, v29.16b, v21.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++        cmtst           v17.2d, v5.2d, v2.2d    // if 2nd of each group of is not filtered, then none of the others in the group should be either
++        mov             w0, v5.s[1]             // move to gp reg
++        cmhs            v19.8h, v3.8h, v27.8h
++        ushr            v0.8h, v0.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        mov             w2, v5.s[3]
++        orr             v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        orr             v16.16b, v20.16b, v17.16b
++        bsl             v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
++        cmtst           v2.2d, v5.2d, v2.2d
++        cmhs            v3.8h, v0.8h, v23.8h
++        mov             w4, v5.s[1]
++        mov             w5, v5.s[3]
++        and             w0, w0, w2
++        bic             v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        orr             v2.16b, v7.16b, v2.16b
++        bsl             v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
++        mls             v25.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++        and             w2, w4, w5
++        bic             v0.16b, v3.16b, v2.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        mla             v24.8h, v5.8h, v4.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++        and             w0, w0, w2
++        mls             v18.8h, v0.8h, v6.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++        sqxtun          v2.8b, v25.8h
++        tbnz            w0, #0, 1f              // none of the 16 pixel pairs should be updated in this case
++        mla             v1.8h, v0.8h, v6.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++        sqxtun          v0.8b, v24.8h
++        sqxtun2         v2.16b, v18.8h
++        sqxtun2         v0.16b, v1.8h
++        st1             {v2.16b}, [x3], x1
++        st1             {v0.16b}, [x3]
++1:      ret
++endfunc
++
++// VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++// On entry:
++//   x0 -> top-left pel of right block
++//   x1 = row stride, bytes
++//   w2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++        sub             x3, x0, #4              // where to start reading
++        ldr             d0, .Lcoeffs            // multiplier constants; h[0] and h[1] serve as the 2 and 5 in the sums below
++        ld1             {v1.8b}, [x3], x1       // P1[0], P2[0]...
++        sub             x0, x0, #1              // where to start writing
++        ld1             {v2.8b}, [x3], x1       // P1[1], P2[1]...
++        add             x4, x0, x1, lsl #3      // write pointer for row 8
++        ld1             {v3.8b}, [x3], x1       // P1[2], P2[2]...
++        add             x5, x0, x1, lsl #2      // write pointer for row 4
++        ld1             {v4.8b}, [x3], x1       // P1[3], P2[3]...
++        add             x6, x4, x1, lsl #2      // write pointer for row 12
++        ld1             {v5.8b}, [x3], x1       // P1[4], P2[4]...
++        ld1             {v6.8b}, [x3], x1       // P1[5], P2[5]...
++        ld1             {v7.8b}, [x3], x1       // P1[6], P2[6]...
++        trn1            v16.8b, v1.8b, v2.8b    // P1[0], P1[1], P3[0]...
++        ld1             {v17.8b}, [x3], x1      // P1[7], P2[7]...
++        trn2            v1.8b, v1.8b, v2.8b     // P2[0], P2[1], P4[0]...
++        ld1             {v2.8b}, [x3], x1       // P1[8], P2[8]...
++        trn1            v18.8b, v3.8b, v4.8b    // P1[2], P1[3], P3[2]...
++        ld1             {v19.8b}, [x3], x1      // P1[9], P2[9]...
++        trn2            v3.8b, v3.8b, v4.8b     // P2[2], P2[3], P4[2]...
++        ld1             {v4.8b}, [x3], x1       // P1[10], P2[10]...
++        trn1            v20.8b, v5.8b, v6.8b    // P1[4], P1[5], P3[4]...
++        ld1             {v21.8b}, [x3], x1      // P1[11], P2[11]...
++        trn2            v5.8b, v5.8b, v6.8b     // P2[4], P2[5], P4[4]...
++        ld1             {v6.8b}, [x3], x1       // P1[12], P2[12]...
++        trn1            v22.8b, v7.8b, v17.8b   // P1[6], P1[7], P3[6]...
++        ld1             {v23.8b}, [x3], x1      // P1[13], P2[13]...
++        trn2            v7.8b, v7.8b, v17.8b    // P2[6], P2[7], P4[6]...
++        ld1             {v17.8b}, [x3], x1      // P1[14], P2[14]...
++        trn1            v24.8b, v2.8b, v19.8b   // P1[8], P1[9], P3[8]...
++        ld1             {v25.8b}, [x3]          // P1[15], P2[15]...
++        trn2            v2.8b, v2.8b, v19.8b    // P2[8], P2[9], P4[8]...
++        trn1            v19.4h, v16.4h, v18.4h  // P1[0], P1[1], P1[2], P1[3], P5[0]...
++        trn1            v26.8b, v4.8b, v21.8b   // P1[10], P1[11], P3[10]...
++        trn2            v4.8b, v4.8b, v21.8b    // P2[10], P2[11], P4[10]...
++        trn1            v21.4h, v1.4h, v3.4h    // P2[0], P2[1], P2[2], P2[3], P6[0]...
++        trn1            v27.4h, v20.4h, v22.4h  // P1[4], P1[5], P1[6], P1[7], P5[4]...
++        trn1            v28.8b, v6.8b, v23.8b   // P1[12], P1[13], P3[12]...
++        trn2            v6.8b, v6.8b, v23.8b    // P2[12], P2[13], P4[12]...
++        trn1            v23.4h, v5.4h, v7.4h    // P2[4], P2[5], P2[6], P2[7], P6[4]...
++        trn1            v29.4h, v24.4h, v26.4h  // P1[8], P1[9], P1[10], P1[11], P5[8]...
++        trn1            v30.8b, v17.8b, v25.8b  // P1[14], P1[15], P3[14]...
++        trn2            v17.8b, v17.8b, v25.8b  // P2[14], P2[15], P4[14]...
++        trn1            v25.4h, v2.4h, v4.4h    // P2[8], P2[9], P2[10], P2[11], P6[8]...
++        trn1            v31.2s, v19.2s, v27.2s  // P1[0..7]
++        trn2            v19.2s, v19.2s, v27.2s  // P5[0..7]
++        trn1            v27.2s, v21.2s, v23.2s  // P2[0..7]
++        trn2            v21.2s, v21.2s, v23.2s  // P6[0..7]
++        trn1            v23.4h, v28.4h, v30.4h  // P1[12], P1[13], P1[14], P1[15], P5[12]...
++        trn2            v16.4h, v16.4h, v18.4h  // P3[0], P3[1], P3[2], P3[3], P7[0]...
++        trn1            v18.4h, v6.4h, v17.4h   // P2[12], P2[13], P2[14], P2[15], P6[12]...
++        trn2            v20.4h, v20.4h, v22.4h  // P3[4], P3[5], P3[6], P3[7], P7[4]...
++        trn2            v22.4h, v24.4h, v26.4h  // P3[8], P3[9], P3[10], P3[11], P7[8]...
++        trn1            v24.2s, v29.2s, v23.2s  // P1[8..15]
++        trn2            v23.2s, v29.2s, v23.2s  // P5[8..15]
++        trn1            v26.2s, v25.2s, v18.2s  // P2[8..15]
++        trn2            v18.2s, v25.2s, v18.2s  // P6[8..15]
++        trn2            v25.4h, v28.4h, v30.4h  // P3[12], P3[13], P3[14], P3[15], P7[12]...
++        trn2            v1.4h, v1.4h, v3.4h     // P4[0], P4[1], P4[2], P4[3], P8[0]...
++        trn2            v3.4h, v5.4h, v7.4h     // P4[4], P4[5], P4[6], P4[7], P8[4]...
++        trn2            v2.4h, v2.4h, v4.4h     // P4[8], P4[9], P4[10], P4[11], P8[8]...
++        trn2            v4.4h, v6.4h, v17.4h    // P4[12], P4[13], P4[14], P4[15], P8[12]...
++        ushll           v5.8h, v31.8b, #1       // 2*P1[0..7]
++        ushll           v6.8h, v19.8b, #1       // 2*P5[0..7]
++        trn1            v7.2s, v16.2s, v20.2s   // P3[0..7]
++        uxtl            v17.8h, v27.8b          // P2[0..7]
++        trn2            v16.2s, v16.2s, v20.2s  // P7[0..7]
++        uxtl            v20.8h, v21.8b          // P6[0..7]
++        trn1            v21.2s, v22.2s, v25.2s  // P3[8..15]
++        ushll           v24.8h, v24.8b, #1      // 2*P1[8..15]
++        trn2            v22.2s, v22.2s, v25.2s  // P7[8..15]
++        ushll           v25.8h, v23.8b, #1      // 2*P5[8..15]
++        trn1            v27.2s, v1.2s, v3.2s    // P4[0..7]
++        uxtl            v26.8h, v26.8b          // P2[8..15]
++        mls             v5.8h, v17.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]
++        uxtl            v17.8h, v18.8b          // P6[8..15]
++        mls             v6.8h, v20.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]
++        trn1            v18.2s, v2.2s, v4.2s    // P4[8..15]
++        uxtl            v28.8h, v7.8b           // P3[0..7]
++        mls             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
++        uxtl            v16.8h, v16.8b          // P7[0..7]
++        uxtl            v26.8h, v21.8b          // P3[8..15]
++        mls             v25.8h, v17.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]
++        uxtl            v22.8h, v22.8b          // P7[8..15]
++        ushll           v7.8h, v7.8b, #1        // 2*P3[0..7]
++        uxtl            v27.8h, v27.8b          // P4[0..7]
++        trn2            v1.2s, v1.2s, v3.2s     // P8[0..7]
++        ushll           v3.8h, v21.8b, #1       // 2*P3[8..15]
++        trn2            v2.2s, v2.2s, v4.2s     // P8[8..15]
++        uxtl            v4.8h, v18.8b           // P4[8..15]
++        mla             v5.8h, v28.8h, v0.h[1]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        uxtl            v1.8h, v1.8b            // P8[0..7]
++        mla             v6.8h, v16.8h, v0.h[1]  // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        uxtl            v2.8h, v2.8b            // P8[8..15]
++        uxtl            v16.8h, v19.8b          // P5[0..7]
++        mla             v24.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        uxtl            v18.8h, v23.8b          // P5[8..15]
++        dup             v19.8h, w2              // pq
++        mla             v25.8h, v22.8h, v0.h[1] // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        sub             v21.8h, v27.8h, v16.8h  // P4[0..7]-P5[0..7]
++        sub             v22.8h, v4.8h, v18.8h   // P4[8..15]-P5[8..15]
++        mls             v7.8h, v27.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]
++        abs             v23.8h, v21.8h          // abs(P4[0..7]-P5[0..7])
++        mls             v3.8h, v4.8h, v0.h[1]   // 2*P3[8..15]-5*P4[8..15]
++        abs             v26.8h, v22.8h          // abs(P4[8..15]-P5[8..15])
++        sshr            v21.8h, v21.8h, #8      // clip_sign[0..7]
++        mls             v5.8h, v27.8h, v0.h[0]  // 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        sshr            v23.8h, v23.8h, #1      // clip[0..7]
++        sshr            v26.8h, v26.8h, #1      // clip[8..15]
++        mls             v6.8h, v1.8h, v0.h[0]   // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        sshr            v1.8h, v22.8h, #8       // clip_sign[8..15]
++        cmeq            v22.8h, v23.8h, #0      // test clip[0..7] == 0
++        mls             v24.8h, v4.8h, v0.h[0]  // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        cmeq            v28.8h, v26.8h, #0      // test clip[8..15] == 0
++        srshr           v5.8h, v5.8h, #3        // (sum+4)>>3 of the P1..P4 terms [0..7]
++        mls             v25.8h, v2.8h, v0.h[0]  // 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        srshr           v2.8h, v6.8h, #3        // (sum+4)>>3 of the P5..P8 terms [0..7]
++        mla             v7.8h, v16.8h, v0.h[1]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        srshr           v6.8h, v24.8h, #3       // (sum+4)>>3 of the P1..P4 terms [8..15]
++        mla             v3.8h, v18.8h, v0.h[1]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        abs             v5.8h, v5.8h            // a1[0..7]
++        srshr           v24.8h, v25.8h, #3      // (sum+4)>>3 of the P5..P8 terms [8..15]
++        mls             v3.8h, v17.8h, v0.h[0]  // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        abs             v2.8h, v2.8h            // a2[0..7]
++        abs             v6.8h, v6.8h            // a1[8..15]
++        mls             v7.8h, v20.8h, v0.h[0]  // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        abs             v17.8h, v24.8h          // a2[8..15]
++        cmhs            v20.8h, v5.8h, v2.8h    // test a1[0..7] >= a2[0..7]
++        srshr           v3.8h, v3.8h, #3        // (sum+4)>>3 of the P3..P6 terms [8..15]
++        cmhs            v24.8h, v6.8h, v17.8h   // test a1[8..15] >= a2[8..15]
++        srshr           v7.8h, v7.8h, #3        // (sum+4)>>3 of the P3..P6 terms [0..7]
++        bsl             v20.16b, v2.16b, v5.16b // a3[0..7]
++        abs             v2.8h, v3.8h            // a0[8..15]
++        sshr            v3.8h, v3.8h, #8        // a0_sign[8..15]
++        bsl             v24.16b, v17.16b, v6.16b // a3[8..15]
++        abs             v5.8h, v7.8h            // a0[0..7]
++        sshr            v6.8h, v7.8h, #8        // a0_sign[0..7]
++        cmhs            v7.8h, v2.8h, v19.8h    // test a0[8..15] >= pq
++        sub             v1.8h, v1.8h, v3.8h     // clip_sign[8..15] - a0_sign[8..15]
++        uqsub           v3.8h, v2.8h, v24.8h    // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v2.8h, v24.8h, v2.8h    // test a3[8..15] >= a0[8..15]
++        uqsub           v17.8h, v5.8h, v20.8h   // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        cmhs            v19.8h, v5.8h, v19.8h   // test a0[0..7] >= pq
++        orr             v7.16b, v28.16b, v7.16b // test clip[8..15] == 0 || a0[8..15] >= pq
++        sub             v6.8h, v21.8h, v6.8h    // clip_sign[0..7] - a0_sign[0..7]
++        mul             v3.8h, v3.8h, v0.h[1]   // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        cmhs            v5.8h, v20.8h, v5.8h    // test a3[0..7] >= a0[0..7]
++        orr             v19.16b, v22.16b, v19.16b // test clip[0..7] == 0 || a0[0..7] >= pq
++        mul             v0.8h, v17.8h, v0.h[1]  // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        orr             v2.16b, v7.16b, v2.16b  // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        orr             v5.16b, v19.16b, v5.16b // test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        ushr            v3.8h, v3.8h, #3        // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        mov             w7, v2.s[1]             // not-filtered flags of rows 10,11 (bit 0 belongs to row 10)
++        mov             w8, v2.s[3]             // not-filtered flags of rows 14,15 (bit 0 belongs to row 14)
++        ushr            v0.8h, v0.8h, #3        // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        mov             w2, v5.s[1]             // move to gp reg
++        cmhs            v2.8h, v3.8h, v26.8h    // test d[8..15] >= clip[8..15]
++        mov             w3, v5.s[3]             // not-filtered flags of rows 6,7 (bit 0 belongs to row 6)
++        cmhs            v5.8h, v0.8h, v23.8h    // test d[0..7] >= clip[0..7]
++        bsl             v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
++        and             w9, w7, w8              // bit 0 set if rows 10 and 14 are both not filtered
++        bsl             v5.16b, v23.16b, v0.16b // FFMIN(d[0..7], clip[0..7])
++        and             w10, w2, w3             // bit 0 set if rows 2 and 6 are both not filtered
++        bic             v0.16b, v2.16b, v7.16b  // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        and             w9, w10, w9             // bit 0 set if rows 2, 6, 10 and 14 are all not filtered
++        bic             v2.16b, v5.16b, v19.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        mls             v4.8h, v0.8h, v1.8h     // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4
++        tbnz            w9, #0, 4f              // none of the 16 pixel pairs should be updated in this case
++        mls             v27.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4
++        mla             v16.8h, v2.8h, v6.8h    // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
++        sqxtun          v2.8b, v4.8h            // saturate new P4[8..15] to unsigned bytes
++        mla             v18.8h, v0.8h, v1.8h    // invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5
++        sqxtun          v0.8b, v27.8h           // saturate new P4[0..7] to unsigned bytes
++        sqxtun          v1.8b, v16.8h           // saturate new P5[0..7] to unsigned bytes
++        sqxtun          v3.8b, v18.8h           // saturate new P5[8..15] to unsigned bytes
++        tbnz            w2, #0, 1f              // leave rows 0..3 untouched if row 2 is not filtered
++        st2             {v0.b, v1.b}[0], [x0], x1  // write P4[0], P5[0]
++        st2             {v0.b, v1.b}[1], [x0], x1  // write P4[1], P5[1]
++        st2             {v0.b, v1.b}[2], [x0], x1  // write P4[2], P5[2]
++        st2             {v0.b, v1.b}[3], [x0]   // write P4[3], P5[3]
++1:      tbnz            w3, #0, 2f              // leave rows 4..7 untouched if row 6 is not filtered
++        st2             {v0.b, v1.b}[4], [x5], x1  // write P4[4], P5[4]
++        st2             {v0.b, v1.b}[5], [x5], x1  // write P4[5], P5[5]
++        st2             {v0.b, v1.b}[6], [x5], x1  // write P4[6], P5[6]
++        st2             {v0.b, v1.b}[7], [x5]   // write P4[7], P5[7]
++2:      tbnz            w7, #0, 3f              // leave rows 8..11 untouched if row 10 is not filtered
++        st2             {v2.b, v3.b}[0], [x4], x1  // write P4[8], P5[8]
++        st2             {v2.b, v3.b}[1], [x4], x1  // write P4[9], P5[9]
++        st2             {v2.b, v3.b}[2], [x4], x1  // write P4[10], P5[10]
++        st2             {v2.b, v3.b}[3], [x4]   // write P4[11], P5[11]
++3:      tbnz            w8, #0, 4f              // leave rows 12..15 untouched if row 14 is not filtered
++        st2             {v2.b, v3.b}[4], [x6], x1  // write P4[12], P5[12]
++        st2             {v2.b, v3.b}[5], [x6], x1  // write P4[13], P5[13]
++        st2             {v2.b, v3.b}[6], [x6], x1  // write P4[14], P5[14]
++        st2             {v2.b, v3.b}[7], [x6]   // write P4[15], P5[15]
++4:      ret
++endfunc
++
++// Copy at most the specified number of bytes from source to destination buffer,
++// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
++// On entry:
++//   x0 -> source buffer
++//   w1 = max number of bytes to copy
++//   x2 -> destination buffer, optimally 8-byte aligned
++// On exit:
++//   w0 = number of bytes not copied
++function ff_vc1_unescape_buffer_helper_neon, export=1
++        // Offset by 80 to screen out cases that are too short for us to handle,
++        // and also make it easy to test for loop termination, or to determine
++        // whether we need an odd number of half-iterations of the loop.
++        subs            w1, w1, #80
++        b.mi            90f
++
++        // Set up useful constants
++        movi            v20.4s, #3, lsl #24     // bits to ignore: low 2 bits of the 4th byte of each 4-byte window
++        movi            v21.4s, #3, lsl #16     // value 0x03 expected in the 3rd byte of each 4-byte window
++
++        tst             w1, #32                 // pick the entry point from the parity of 32-byte half-iterations
++        b.ne            1f
++
++          ld1             {v0.16b, v1.16b, v2.16b}, [x0], #48  // 32 bytes to test plus 16 bytes of lookahead
++          ext             v25.16b, v0.16b, v1.16b, #1  // 4-byte windows at offsets 1, 5, 9, 13
++          ext             v26.16b, v0.16b, v1.16b, #2  // 4-byte windows at offsets 2, 6, 10, 14
++          ext             v27.16b, v0.16b, v1.16b, #3  // 4-byte windows at offsets 3, 7, 11, 15
++          ext             v29.16b, v1.16b, v2.16b, #1  // 4-byte windows at offsets 17, 21, 25, 29
++          ext             v30.16b, v1.16b, v2.16b, #2  // 4-byte windows at offsets 18, 22, 26, 30
++          ext             v31.16b, v1.16b, v2.16b, #3  // 4-byte windows at offsets 19, 23, 27, 31
++          bic             v24.16b, v0.16b, v20.16b  // windows at offsets 0, 4, 8, 12; drop low 2 bits of 4th byte
++          bic             v25.16b, v25.16b, v20.16b
++          bic             v26.16b, v26.16b, v20.16b
++          bic             v27.16b, v27.16b, v20.16b
++          bic             v28.16b, v1.16b, v20.16b  // windows at offsets 16, 20, 24, 28
++          bic             v29.16b, v29.16b, v20.16b
++          bic             v30.16b, v30.16b, v20.16b
++          bic             v31.16b, v31.16b, v20.16b
++          eor             v24.16b, v24.16b, v21.16b  // 3rd byte becomes 0 only if it was 0x03
++          eor             v25.16b, v25.16b, v21.16b
++          eor             v26.16b, v26.16b, v21.16b
++          eor             v27.16b, v27.16b, v21.16b
++          eor             v28.16b, v28.16b, v21.16b
++          eor             v29.16b, v29.16b, v21.16b
++          eor             v30.16b, v30.16b, v21.16b
++          eor             v31.16b, v31.16b, v21.16b
++          cmeq            v24.4s, v24.4s, #0      // all-ones where the window was 00 00 03 0x with x <= 3
++          cmeq            v25.4s, v25.4s, #0
++          cmeq            v26.4s, v26.4s, #0
++          cmeq            v27.4s, v27.4s, #0
++          add             w1, w1, #32             // bias the count: this pass enters the loop at its second half (3:)
++          b               3f
++
++1:      ld1             {v3.16b, v4.16b, v5.16b}, [x0], #48  // 32 bytes to test plus 16 bytes of lookahead
++        ext             v25.16b, v3.16b, v4.16b, #1
++        ext             v26.16b, v3.16b, v4.16b, #2
++        ext             v27.16b, v3.16b, v4.16b, #3
++        ext             v29.16b, v4.16b, v5.16b, #1
++        ext             v30.16b, v4.16b, v5.16b, #2
++        ext             v31.16b, v4.16b, v5.16b, #3
++        bic             v24.16b, v3.16b, v20.16b
++        bic             v25.16b, v25.16b, v20.16b
++        bic             v26.16b, v26.16b, v20.16b
++        bic             v27.16b, v27.16b, v20.16b
++        bic             v28.16b, v4.16b, v20.16b
++        bic             v29.16b, v29.16b, v20.16b
++        bic             v30.16b, v30.16b, v20.16b
++        bic             v31.16b, v31.16b, v20.16b
++        eor             v24.16b, v24.16b, v21.16b
++        eor             v25.16b, v25.16b, v21.16b
++        eor             v26.16b, v26.16b, v21.16b
++        eor             v27.16b, v27.16b, v21.16b
++        eor             v28.16b, v28.16b, v21.16b
++        eor             v29.16b, v29.16b, v21.16b
++        eor             v30.16b, v30.16b, v21.16b
++        eor             v31.16b, v31.16b, v21.16b
++        cmeq            v24.4s, v24.4s, #0
++        cmeq            v25.4s, v25.4s, #0
++        cmeq            v26.4s, v26.4s, #0
++        cmeq            v27.4s, v27.4s, #0
++        // Drop through... (NOTE(review): indentation depth appears to mark which in-flight 32-byte block an instruction belongs to)
++2:        mov             v0.16b, v5.16b          // previous lookahead becomes the first 16 bytes of the next block
++          ld1             {v1.16b, v2.16b}, [x0], #32  // rest of the next block plus new lookahead
++        cmeq            v28.4s, v28.4s, #0
++        cmeq            v29.4s, v29.4s, #0
++        cmeq            v30.4s, v30.4s, #0
++        cmeq            v31.4s, v31.4s, #0
++        orr             v24.16b, v24.16b, v25.16b
++        orr             v26.16b, v26.16b, v27.16b
++        orr             v28.16b, v28.16b, v29.16b
++        orr             v30.16b, v30.16b, v31.16b
++          ext             v25.16b, v0.16b, v1.16b, #1
++        orr             v22.16b, v24.16b, v26.16b
++          ext             v26.16b, v0.16b, v1.16b, #2
++          ext             v27.16b, v0.16b, v1.16b, #3
++          ext             v29.16b, v1.16b, v2.16b, #1
++        orr             v23.16b, v28.16b, v30.16b
++          ext             v30.16b, v1.16b, v2.16b, #2
++          ext             v31.16b, v1.16b, v2.16b, #3
++          bic             v24.16b, v0.16b, v20.16b
++          bic             v25.16b, v25.16b, v20.16b
++          bic             v26.16b, v26.16b, v20.16b
++        orr             v22.16b, v22.16b, v23.16b  // union of all match masks for the block held in v3, v4
++          bic             v27.16b, v27.16b, v20.16b
++          bic             v28.16b, v1.16b, v20.16b
++          bic             v29.16b, v29.16b, v20.16b
++          bic             v30.16b, v30.16b, v20.16b
++          bic             v31.16b, v31.16b, v20.16b
++        addv            s22, v22.4s             // non-zero if any window in the block matched
++          eor             v24.16b, v24.16b, v21.16b
++          eor             v25.16b, v25.16b, v21.16b
++          eor             v26.16b, v26.16b, v21.16b
++          eor             v27.16b, v27.16b, v21.16b
++          eor             v28.16b, v28.16b, v21.16b
++        mov             w3, v22.s[0]            // move match summary to gp reg
++          eor             v29.16b, v29.16b, v21.16b
++          eor             v30.16b, v30.16b, v21.16b
++          eor             v31.16b, v31.16b, v21.16b
++          cmeq            v24.4s, v24.4s, #0
++          cmeq            v25.4s, v25.4s, #0
++          cmeq            v26.4s, v26.4s, #0
++          cmeq            v27.4s, v27.4s, #0
++        cbnz            w3, 90f                 // escape sequence starts in this block: stop before storing it
++        st1             {v3.16b, v4.16b}, [x2], #32  // block is clean: copy it
++3:          mov             v3.16b, v2.16b          // previous lookahead becomes the first 16 bytes of the next block
++            ld1             {v4.16b, v5.16b}, [x0], #32  // rest of the next block plus new lookahead
++          cmeq            v28.4s, v28.4s, #0
++          cmeq            v29.4s, v29.4s, #0
++          cmeq            v30.4s, v30.4s, #0
++          cmeq            v31.4s, v31.4s, #0
++          orr             v24.16b, v24.16b, v25.16b
++          orr             v26.16b, v26.16b, v27.16b
++          orr             v28.16b, v28.16b, v29.16b
++          orr             v30.16b, v30.16b, v31.16b
++            ext             v25.16b, v3.16b, v4.16b, #1
++          orr             v22.16b, v24.16b, v26.16b
++            ext             v26.16b, v3.16b, v4.16b, #2
++            ext             v27.16b, v3.16b, v4.16b, #3
++            ext             v29.16b, v4.16b, v5.16b, #1
++          orr             v23.16b, v28.16b, v30.16b
++            ext             v30.16b, v4.16b, v5.16b, #2
++            ext             v31.16b, v4.16b, v5.16b, #3
++            bic             v24.16b, v3.16b, v20.16b
++            bic             v25.16b, v25.16b, v20.16b
++            bic             v26.16b, v26.16b, v20.16b
++          orr             v22.16b, v22.16b, v23.16b  // union of all match masks for the block held in v0, v1
++            bic             v27.16b, v27.16b, v20.16b
++            bic             v28.16b, v4.16b, v20.16b
++            bic             v29.16b, v29.16b, v20.16b
++            bic             v30.16b, v30.16b, v20.16b
++            bic             v31.16b, v31.16b, v20.16b
++          addv            s22, v22.4s             // non-zero if any window in the block matched
++            eor             v24.16b, v24.16b, v21.16b
++            eor             v25.16b, v25.16b, v21.16b
++            eor             v26.16b, v26.16b, v21.16b
++            eor             v27.16b, v27.16b, v21.16b
++            eor             v28.16b, v28.16b, v21.16b
++          mov             w3, v22.s[0]            // move match summary to gp reg
++            eor             v29.16b, v29.16b, v21.16b
++            eor             v30.16b, v30.16b, v21.16b
++            eor             v31.16b, v31.16b, v21.16b
++            cmeq            v24.4s, v24.4s, #0
++            cmeq            v25.4s, v25.4s, #0
++            cmeq            v26.4s, v26.4s, #0
++            cmeq            v27.4s, v27.4s, #0
++          cbnz            w3, 91f                 // escape sequence starts in this block: stop before storing it
++          st1             {v0.16b, v1.16b}, [x2], #32  // block is clean: copy it
++        subs            w1, w1, #64             // a full pass handles two 32-byte blocks
++        b.pl            2b
++
++90:     add             w0, w1, #80             // undo the initial offset to get the bytes not copied
++        ret
++
++91:     sub             w1, w1, #32             // stopped in the second half of a pass: discount the first half's 32 bytes
++        b               90b
++endfunc
+diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
+index 2e9a3581de..d9571b437f 100644
+--- a/libavcodec/allcodecs.c
++++ b/libavcodec/allcodecs.c
+@@ -153,6 +153,7 @@ extern AVCodec ff_hap_decoder;
+ extern AVCodec ff_hevc_decoder;
+ extern AVCodec ff_hevc_qsv_decoder;
+ extern AVCodec ff_hevc_rkmpp_decoder;
++extern AVCodec ff_hevc_rpi_decoder;
+ extern AVCodec ff_hevc_v4l2m2m_decoder;
+ extern AVCodec ff_hnm4_video_decoder;
+ extern AVCodec ff_hq_hqa_decoder;
+@@ -917,6 +918,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
+     }
+ }
+ 
++static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
++{
++    const enum AVPixelFormat *candidate = p->pix_fmts;
++
++    // A codec that publishes no format list is treated as accepting anything
++    if (!candidate)
++        return 1;
++    // A list exists: the terminator value itself never counts as a member
++    if (fmt == AV_PIX_FMT_NONE)
++        return 0;
++
++    while (*candidate != AV_PIX_FMT_NONE)
++        if (*candidate++ == fmt)
++            return 1;
++    return 0;
++}
++
++/* Return a decoder for id that passes codec_supports_format(p, fmt), or NULL if none does. */
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
++{
++    const AVCodec *p, *experimental = NULL;
++    void *i = 0;    /* iteration state; av_codec_iterate() advances it, so no manual stepping */
++
++    id= remap_deprecated_codec_id(id);
++    while ((p = av_codec_iterate(&i))) {
++        if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
++            if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
++                experimental = p;   /* first experimental match is only kept as a fallback */
++            } else
++                return (AVCodec *)p; /* any other match (incl. a later experimental one) wins at once */
++        }
++    }
++    return (AVCodec *)experimental;  /* still NULL when nothing matched */
++}
++
+ static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
+ {
+     const AVCodec *p, *experimental = NULL;
+diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
+index c4ab93aeeb..cd926f7b33 100644
+--- a/libavcodec/arm/Makefile
++++ b/libavcodec/arm/Makefile
+@@ -39,6 +39,8 @@ OBJS-$(CONFIG_AAC_DECODER)             += arm/aacpsdsp_init_arm.o       \
+                                           arm/sbrdsp_init_arm.o
+ OBJS-$(CONFIG_DCA_DECODER)             += arm/synth_filter_init_arm.o
+ OBJS-$(CONFIG_HEVC_DECODER)            += arm/hevcdsp_init_arm.o
++OBJS-$(CONFIG_HEVC_RPI_DECODER)        += arm/rpi_hevcdsp_init_arm.o    \
++                                          arm/rpi_hevcpred_init_arm.o
+ OBJS-$(CONFIG_MLP_DECODER)             += arm/mlpdsp_init_arm.o
+ OBJS-$(CONFIG_RV40_DECODER)            += arm/rv40dsp_init_arm.o
+ OBJS-$(CONFIG_SBC_ENCODER)             += arm/sbcdsp_init_arm.o
+@@ -137,10 +139,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER)        += arm/aacpsdsp_neon.o           \
+ NEON-OBJS-$(CONFIG_LLAUDDSP)           += arm/lossless_audiodsp_neon.o
+ NEON-OBJS-$(CONFIG_DCA_DECODER)        += arm/synth_filter_neon.o
+ NEON-OBJS-$(CONFIG_HEVC_DECODER)       += arm/hevcdsp_init_neon.o       \
++                                          arm/hevcdsp_idct_neon.o    \
+                                           arm/hevcdsp_deblock_neon.o    \
+                                           arm/hevcdsp_idct_neon.o       \
+                                           arm/hevcdsp_qpel_neon.o       \
+                                           arm/hevcdsp_sao_neon.o
++NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER)   += arm/rpi_hevcdsp_init_neon.o    \
++                                          arm/rpi_hevc_misc_neon.o       \
++                                          arm/rpi_hevcdsp_deblock_neon.o \
++                                          arm/rpi_hevcdsp_idct_neon.o    \
++                                          arm/rpi_hevcdsp_res8_neon.o    \
++                                          arm/rpi_hevcdsp_res16_neon.o   \
++                                          arm/rpi_hevcdsp_sao_neon.o     \
++                                          arm/rpi_hevcpred_init_neon.o   \
++                                          arm/rpi_hevcpred_intra_angular_neon.o \
++                                          arm/rpi_hevcpred_intra_dc_neon.o \
++                                          arm/rpi_hevcpred_intra_filter_neon.o \
++                                          arm/rpi_hevcpred_intra_hv_neon.o \
++                                          arm/rpi_hevcpred_intra_planar_neon.o
+ NEON-OBJS-$(CONFIG_RV30_DECODER)       += arm/rv34dsp_neon.o
+ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
+                                           arm/rv40dsp_neon.o
+diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
+index fdbf86b45e..4755f20e2e 100644
+--- a/libavcodec/arm/cabac.h
++++ b/libavcodec/arm/cabac.h
+@@ -26,83 +26,209 @@
+ #include "libavutil/internal.h"
+ #include "libavcodec/cabac.h"
+ 
++// Decode one CABAC bin using the context byte at *state: updates *state, c->low and c->range (and c->bytestream on refill); returns the bin, 0 or 1.
+ #define get_cabac_inline get_cabac_inline_arm
+ static av_always_inline int get_cabac_inline_arm(CABACContext *c,
+-                                                 uint8_t *const state)
++                                                 uint8_t *state)  // not const: the asm reuses this register as scratch on the refill path
+ {
+-    int bit;
+-    void *reg_b, *reg_c, *tmp;
++    const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
++    int bit, ptr, low, tmp1, tmp2;
++    __asm__ volatile (
++        "ldr     %[bit], [%[c], %[range_off]]             \n\t"  // bit = c->range
++        "ldrb    %[ptr], [%[state]]                       \n\t"  // ptr = *state
++        "sub     %[tmp1], %[mlps_tables], %[lps_off]      \n\t"  // tmp1 = mlps_tables - lps_off
++        "and     %[tmp2], %[bit], #0xc0                   \n\t"
++        "add     %[tmp1], %[tmp1], %[ptr]                 \n\t"
++        "ldr     %[low], [%[c], %[low_off]]               \n\t"  // low = c->low
++        "ldrb    %[tmp2], [%[tmp1], %[tmp2], lsl #1]      \n\t"  // tmp2 = table byte selected by state and (range & 0xc0) << 1
++        "sub     %[bit], %[bit], %[tmp2]                  \n\t"  // bit = range - tmp2
++        "mov     %[tmp1], %[bit]                          \n\t"
++        "cmp     %[low], %[bit], lsl #17                  \n\t"
++        "itt     ge                                       \n\t"
++        "movge   %[tmp1], %[tmp2]                         \n\t"  // ge: new range is the table value
++        "mvnge   %[ptr], %[ptr]                           \n\t"  // ge: invert the state value
++        "clz     %[tmp2], %[tmp1]                         \n\t"
++        "it      ge                                       \n\t"
++        "subge   %[low], %[low], %[bit], lsl #17          \n\t"
++        "sub     %[tmp2], %[tmp2], #23                    \n\t"  // tmp2 = clz(new range) - 23 = renormalisation shift
++        "and     %[bit], %[ptr], #1                       \n\t"  // bit = return value (bit 0 of the possibly inverted state)
++        "ldrb    %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"  // next state byte, looked up by the possibly inverted state
++        "lsl     %[low], %[low], %[tmp2]                  \n\t"
++        "lsls    %[ptr], %[low], #16                      \n\t"
++        "bne     1f                                       \n\t"  // low 16 bits of low non-zero: skip the refill
++        "ldr     %[ptr], [%[c], %[ptr_off]]               \n\t"  // ptr = c->bytestream
++        "lsl     %[tmp2], %[tmp1], %[tmp2]                \n\t"  // tmp2 = renormalised range
++#if UNCHECKED_BITSTREAM_READER
++        "strb    %[mlps_tables], [%[state]]               \n\t"
++        "rbit    %[state], %[low]                         \n\t"
++        "ldrh    %[tmp1], [%[ptr]], #2                    \n\t"
++#else
++        "ldr     %[tmp1], [%[c], %[end_off]]              \n\t"
++        "strb    %[mlps_tables], [%[state]]               \n\t"
++        "rbit    %[state], %[low]                         \n\t"
++        "cmp     %[tmp1], %[ptr]                          \n\t"  // load below happens only while bytestream_end >= ptr
++#if CONFIG_THUMB
++        "it      cs                                       \n\t"
++        "ldrhcs  %[tmp1], [%[ptr]], #2                    \n\t"
++#else
++        "ldrcsh  %[tmp1], [%[ptr]], #2                    \n\t"
++#endif
++#endif
++        "clz     %[state], %[state]                       \n\t"  // clz(rbit(low)) = index of lowest set bit of low
++        "movw    %[mlps_tables], #0xffff                  \n\t"
++        "sub     %[state], %[state], #16                  \n\t"
++        "str     %[tmp2], [%[c], %[range_off]]            \n\t"
++        "rev     %[tmp1], %[tmp1]                         \n\t"
++        "str     %[ptr], [%[c], %[ptr_off]]               \n\t"
++        "lsr     %[tmp1], %[tmp1], #15                    \n\t"
++        "sub     %[tmp1], %[tmp1], %[mlps_tables]         \n\t"
++#if CONFIG_THUMB
++        "lsl     %[tmp1], %[tmp1], %[state]               \n\t"
++        "add     %[low], %[low], %[tmp1]                  \n\t"
++#else
++        "add     %[low], %[low], %[tmp1], lsl %[state]    \n\t"
++#endif
++        "str     %[low], [%[c], %[low_off]]               \n\t"
++        "b       2f                                       \n\t"
++        "1:                                               \n\t"  // no-refill path: store state, range and low
++        "strb    %[mlps_tables], [%[state]]               \n\t"
++        "lsl     %[tmp1], %[tmp1], %[tmp2]                \n\t"
++        "str     %[low], [%[c], %[low_off]]               \n\t"
++        "str     %[tmp1], [%[c], %[range_off]]            \n\t"
++        "2:                                               \n\t"
++    :  // Outputs
++             [state]"+r"(state),
++       [mlps_tables]"+r"(mlps_tables),
++               [bit]"=&r"(bit),
++               [ptr]"=&r"(ptr),
++               [low]"=&r"(low),
++              [tmp1]"=&r"(tmp1),
++              [tmp2]"=&r"(tmp2)
++    :  // Inputs
++               [c]"r"(c),
++         [low_off]"J"(offsetof(CABACContext, low)),
++       [range_off]"J"(offsetof(CABACContext, range)),
++         [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++         [end_off]"J"(offsetof(CABACContext, bytestream_end)),
++         [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++    :  // Clobbers
++       "cc", "memory"
++    );
++    return bit;
++}
+ 
+-    __asm__ volatile(
+-        "ldrb       %[bit]        , [%[state]]                  \n\t"
+-        "add        %[r_b]        , %[tables]   , %[lps_off]    \n\t"
+-        "mov        %[tmp]        , %[range]                    \n\t"
+-        "and        %[range]      , %[range]    , #0xC0         \n\t"
+-        "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
+-        "ldrb       %[range]      , [%[r_b], %[range], lsl #1]  \n\t"
+-        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
+-        "sub        %[r_c]        , %[tmp]      , %[range]      \n\t"
+-        "lsl        %[tmp]        , %[r_c]      , #17           \n\t"
+-        "cmp        %[tmp]        , %[low]                      \n\t"
+-        "it         gt                                          \n\t"
+-        "movgt      %[range]      , %[r_c]                      \n\t"
+-        "itt        cc                                          \n\t"
+-        "mvncc      %[bit]        , %[bit]                      \n\t"
+-        "subcc      %[low]        , %[low]      , %[tmp]        \n\t"
+-        "add        %[r_c]        , %[tables]   , %[mlps_off]   \n\t"
+-        "ldrb       %[tmp]        , [%[r_b], %[range]]          \n\t"
+-        "ldrb       %[r_b]        , [%[r_c], %[bit]]            \n\t"
+-        "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
+-        "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
+-        "uxth       %[r_c]        , %[low]                      \n\t"
+-        "strb       %[r_b]        , [%[state]]                  \n\t"
+-        "tst        %[r_c]        , %[r_c]                      \n\t"
+-        "bne        2f                                          \n\t"
+-        "ldr        %[r_c]        , [%[c], %[byte]]             \n\t"
++#define get_cabac_bypass get_cabac_bypass_arm
++static inline int get_cabac_bypass_arm(CABACContext * const c)
++{   // Decode one bypass bin: returns 1 if (low << 1) >= (range << 17), else 0; stores the updated low (and bytestream pointer on refill) into c.
++    uint32_t low = c->low, range, ptr, tmp;
++    int rv;
++    __asm volatile (
++        "ldr        %[range] , [%[c], %[range_off]] \n\t"
++        "mov        %[rv]    , #0                   \n\t"
++        "ldr        %[ptr]   , [%[c], %[ptr_off]]   \n\t"
++        "lsl        %[low]   , #1                   \n\t"  // low <<= 1
++#if !UNCHECKED_BITSTREAM_READER
++        "ldr        %[tmp]   , [%[c], %[end_off]]   \n\t"  // tmp = c->bytestream_end
++#endif
++        "cmp        %[low]   , %[range], lsl #17    \n\t"
++        "itt         cs                              \n\t"
++        "subcs      %[low]   , %[low], %[range], lsl #17 \n\t"  // cs: low -= range << 17
++        "movcs      %[rv]    , #1                   \n\t"  // cs: decoded bin is 1
+ #if UNCHECKED_BITSTREAM_READER
+-        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
+-        "add        %[r_c]        , %[r_c]      , #2            \n\t"
+-        "str        %[r_c]        , [%[c], %[byte]]             \n\t"
++        "ldrh       %[tmp]   , [%[ptr]], #2         \n\t"
++#else
++        "cmp        %[tmp]   , %[ptr]               \n\t"  // load below happens only while bytestream_end >= ptr
++#if CONFIG_THUMB
++        "it         cs                              \n\t"
++        "ldrhcs     %[tmp]   , [%[ptr]], #2         \n\t"
+ #else
+-        "ldr        %[r_b]        , [%[c], %[end]]              \n\t"
+-        "ldrh       %[tmp]        , [%[r_c]]                    \n\t"
+-        "cmp        %[r_c]        , %[r_b]                      \n\t"
+-        "itt        lt                                          \n\t"
+-        "addlt      %[r_c]        , %[r_c]      , #2            \n\t"
+-        "strlt      %[r_c]        , [%[c], %[byte]]             \n\t"
++        "ldrcsh     %[tmp]   , [%[ptr]], #2         \n\t"
++#endif
+ #endif
+-        "sub        %[r_c]        , %[low]      , #1            \n\t"
+-        "add        %[r_b]        , %[tables]   , %[norm_off]   \n\t"
+-        "eor        %[r_c]        , %[low]      , %[r_c]        \n\t"
+-        "rev        %[tmp]        , %[tmp]                      \n\t"
+-        "lsr        %[r_c]        , %[r_c]      , #15           \n\t"
+-        "lsr        %[tmp]        , %[tmp]      , #15           \n\t"
+-        "ldrb       %[r_c]        , [%[r_b], %[r_c]]            \n\t"
+-        "movw       %[r_b]        , #0xFFFF                     \n\t"
+-        "sub        %[tmp]        , %[tmp]      , %[r_b]        \n\t"
+-        "rsb        %[r_c]        , %[r_c]      , #7            \n\t"
+-        "lsl        %[tmp]        , %[tmp]      , %[r_c]        \n\t"
+-        "add        %[low]        , %[low]      , %[tmp]        \n\t"
+-        "2:                                                     \n\t"
+-        :    [bit]"=&r"(bit),
+-             [low]"+&r"(c->low),
+-           [range]"+&r"(c->range),
+-             [r_b]"=&r"(reg_b),
+-             [r_c]"=&r"(reg_c),
+-             [tmp]"=&r"(tmp)
+-        :        [c]"r"(c),
+-             [state]"r"(state),
+-            [tables]"r"(ff_h264_cabac_tables),
+-              [byte]"M"(offsetof(CABACContext, bytestream)),
+-               [end]"M"(offsetof(CABACContext, bytestream_end)),
+-          [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
+-           [lps_off]"I"(H264_LPS_RANGE_OFFSET),
+-          [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
+-        : "memory", "cc"
+-        );
++        "lsls       %[range] , %[low], #16          \n\t"  // range register is dead here; only the Z flag is wanted
++        "bne        1f                              \n\t"  // low 16 bits of low non-zero: skip the refill
+ 
+-    return bit & 1;
++        "str        %[ptr]   , [%[c], %[ptr_off]]   \n\t"  // refill: write back the bytestream pointer
++        "rev        %[tmp]   , %[tmp]               \n\t"
++        "add        %[low]   , %[low], %[tmp], lsr #15 \n\t"
++        "movw       %[tmp]   , 0xFFFF               \n\t"
++        "sub        %[low]   , %[tmp]               \n\t"  // low += (rev(tmp) >> 15) - 0xFFFF
++        "1:                                         \n\t"
++        "str        %[low]   , [%[c], %[low_off]]   \n\t"
++        : // Outputs
++               [rv]"=&r"(rv),
++              [low]"+r"(low),
++            [range]"=&r"(range),
++              [ptr]"=&r"(ptr),
++              [tmp]"=&r"(tmp)
++        : // Inputs
++                    [c]"r"(c),
++              [low_off]"J"(offsetof(CABACContext, low)),
++            [range_off]"J"(offsetof(CABACContext, range)),
++              [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++              [end_off]"J"(offsetof(CABACContext, bytestream_end))
++        : // Clobbers
++            "memory", "cc"
++    );
++    return rv;
+ }
++
++// Decode one bypass bin and use it as a sign: returns rv unchanged when (low << 1) >= (range << 17), otherwise returns -rv; updates c->low (and c->bytestream on refill).
++#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
++static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
++{
++    uint32_t low = c->low, range, ptr, tmp;
++    __asm volatile (
++        "ldr        %[range] , [%[c], %[range_off]] \n\t"
++        "ldr        %[ptr]   , [%[c], %[ptr_off]]   \n\t"
++        "lsl        %[low]   , #1                   \n\t"  // low <<= 1
++#if !UNCHECKED_BITSTREAM_READER
++        "ldr        %[tmp]   , [%[c], %[end_off]]   \n\t"  // tmp = c->bytestream_end
++#endif
++        "cmp        %[low]   , %[range], lsl #17    \n\t"
++        "it         cs                              \n\t"
++        "subcs      %[low]   , %[low], %[range], lsl #17 \n\t"  // cs: low -= range << 17 (flags are left untouched)
++        "it         cc                              \n\t"
++        "rsbcc      %[rv]    , %[rv], #0            \n\t"  // cc: rv = -rv
++#if UNCHECKED_BITSTREAM_READER
++        "ldrh       %[tmp]   , [%[ptr]], #2         \n\t"
++#else
++        "cmp        %[tmp]   , %[ptr]               \n\t"  // load below happens only while bytestream_end >= ptr
++#if CONFIG_THUMB
++        "it         cs                              \n\t"
++        "ldrhcs     %[tmp]   , [%[ptr]], #2         \n\t"
++#else
++        "ldrcsh     %[tmp]   , [%[ptr]], #2         \n\t"
++#endif
++#endif
++        "lsls       %[range] , %[low], #16          \n\t"  // range register is dead here; only the Z flag is wanted
++        "bne        1f                              \n\t"  // low 16 bits of low non-zero: skip the refill
++
++        "str        %[ptr]   , [%[c], %[ptr_off]]   \n\t"  // refill: write back the bytestream pointer
++        "rev        %[tmp]   , %[tmp]               \n\t"
++        "add        %[low]   , %[low], %[tmp], lsr #15 \n\t"
++        "movw       %[tmp]   , 0xFFFF               \n\t"
++        "sub        %[low]   , %[tmp]               \n\t"  // low += (rev(tmp) >> 15) - 0xFFFF
++        "1:                                         \n\t"
++        "str        %[low]   , [%[c], %[low_off]]   \n\t"
++        : // Outputs
++               [rv]"+r"(rv),
++              [low]"+r"(low),
++            [range]"=&r"(range),
++              [ptr]"=&r"(ptr),
++              [tmp]"=&r"(tmp)
++        : // Inputs
++                    [c]"r"(c),
++              [low_off]"J"(offsetof(CABACContext, low)),
++            [range_off]"J"(offsetof(CABACContext, range)),
++              [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++              [end_off]"J"(offsetof(CABACContext, bytestream_end))
++        : // Clobbers
++            "memory", "cc"
++    );
++    return rv;
++}
++
+ #endif /* HAVE_ARMV6T2_INLINE */
+ 
+ #endif /* AVCODEC_ARM_CABAC_H */
+diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h
+new file mode 100644
+index 0000000000..c88dec6eff
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_cabac.h
+@@ -0,0 +1,607 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVC_CABAC_H
++#define AVCODEC_ARM_HEVC_CABAC_H
++
++#include "config.h"
++#if HAVE_ARMV6T2_INLINE
++// Load the 32-bit word at byte offset (bits >> 3) from p, byte-reverse it, and shift it left by (bits & 7).
++#define hevc_mem_bits32 hevc_mem_bits32_arm
++static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
++{
++    unsigned int n;
++    __asm__ (
++        "rev        %[n], %[x]                     \n\t"  // n = byte-swapped x (a big-endian read on a little-endian core)
++        : [n]"=r"(n)
++        : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))  // NOTE(review): 32-bit load at an arbitrary byte offset - assumes unaligned loads are acceptable here; confirm
++        :
++        );
++    return n << (bits & 7);  // drop the (bits & 7) leading bits; the vacated low bits are zero
++}
++
++
++// ---------------------------------------------------------------------------
++//
++// Helper fns - little bits of code where ARM has an instruction that the
++// compiler doesn't know about / use
++// Returns ((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1) saturated to the signed 16-bit range.
++#define trans_scale_sat trans_scale_sat_arm
++static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++    int rv;
++    int t = ((level * (int)(scale * scale_m)) >> shift) + 1;  // +1 is the rounding term for the final halving
++
++    __asm__ (
++    "ssat %[rv], #16, %[t], ASR #1 \n\t"  // rv = signed-saturate-to-16-bits(t >> 1) in one instruction
++    : [rv]"=r"(rv)
++    : [t]"r"(t)
++    :
++    );
++    return rv;
++}
++// Adjust *stat_coeff: with t = (last_coeff_abs_level_remaining << 1) >> c_rice_param, subtract 1 if t == 0, add 1 if t >= 6, then clamp to 0..255.
++#define update_rice update_rice_arm
++static inline void update_rice_arm(uint8_t * const stat_coeff,
++    const unsigned int last_coeff_abs_level_remaining,
++    const unsigned int c_rice_param)
++{
++    int t = last_coeff_abs_level_remaining << 1;
++    __asm__ (
++    "lsrs  %[t], %[t], %[shift]             \n\t"  // t >>= c_rice_param, setting Z when the result is zero
++
++    "it    eq                               \n\t"
++    "subeq %[stat], %[stat], #1             \n\t"  // t == 0: stat -= 1 (may go to -1, clamped below)
++    "cmp   %[t], #6                         \n\t"  // carry set when t >= 6 (unsigned)
++    "adc   %[stat], %[stat], #0             \n\t"  // stat += carry
++    "usat  %[stat], #8, %[stat]             \n\t"  // clamp stat to 0..255
++    : [stat]"+r"(*stat_coeff),
++         [t]"+r"(t)
++    :  [shift]"r"(c_rice_param)
++    : "cc"
++    );
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC get loops
++//
++// Where the loop is simple enough we can normally do 10-30% better than the
++// compiler
++
++// Get the residual greater than 1 bits
++// Decodes n bins (n >= 1) into rv, first bin most significant; bin i (1-based) uses state0[min(i, 3)] while rv is still 0, state0[0] afterwards.
++#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
++static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
++    uint8_t * const state0)
++{
++    unsigned int i, reg_b, st, tmp, bit, rv;
++     __asm__ (
++         "mov        %[i]          , #0                          \n\t"
++         "mov        %[rv]         , #0                          \n\t"
++         "1:                                                     \n\t"
++         "add        %[i]          , %[i]        , #1            \n\t"  // i is 1 before the first compare with n, so n == 0 would not terminate normally
++         "cmp        %[rv]         , #0                          \n\t"
++         "ite        eq                                          \n\t"
++         "usateq     %[st]         , #2          , %[i]          \n\t"  // no 1 seen yet: st = min(i, 3)
++         "movne      %[st]         , #0                          \n\t"  // a 1 has been seen: st = 0
++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
++
++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"  // bit = current state byte
++         "add        %[r_b]        , %[r_b]      , %[bit]        \n\t"
++         "ldrb       %[tmp]        , [%[r_b], %[tmp], lsl #1]    \n\t"  // tmp = table byte selected by state and (range & 0xC0) << 1
++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
++
++         "cmp        %[low]        , %[range], lsl #17           \n\t"
++         "ittt       ge                                          \n\t"
++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
++         "movge      %[range]      , %[tmp]                      \n\t"
++         "mvnge      %[bit]        , %[bit]                      \n\t"
++
++         "clz        %[tmp]        , %[range]                    \n\t"
++         "sub        %[tmp]        , #23                         \n\t"  // tmp = renormalisation shift
++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"  // next state byte
++         "and        %[bit]        , %[bit]      , #1            \n\t"  // decoded bin
++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
++         "orr        %[rv]         , %[bit]      , %[rv], lsl #1 \n\t"  // rv = (rv << 1) | bin
++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
++
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
++         "it         ne                                          \n\t"
++         "cmpne      %[n]          , %[i]                        \n\t"
++         "bne        1b                                          \n\t"
++
++// If reload is not required then we must have run out of flags to decode
++         "tst        %[tmp]        , %[tmp]                      \n\t"
++         "bne        2f                                          \n\t"
++
++// Do reload
++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"  // NOTE(review): no bytestream_end check is visible on this reload
++         "rbit       %[bit]        , %[low]                      \n\t"
++         "movw       %[r_b]        , #0xFFFF                     \n\t"
++         "clz        %[bit]        , %[bit]                      \n\t"
++         "rev        %[tmp]        , %[tmp]                      \n\t"
++         "sub        %[bit]        , %[bit]      , #16           \n\t"
++         "cmp        %[n]          , %[i]                        \n\t"  // flags survive the non-flag-setting instructions up to the bne below
++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
++
++#if CONFIG_THUMB
++         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
++#else
++         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t"
++#endif
++
++         "bne        1b                                          \n\t"
++         "2:                                                     \n\t"
++         :    [bit]"=&r"(bit),
++              [low]"+r"(c->low),
++            [range]"+r"(c->range),
++              [r_b]"=&r"(reg_b),
++             [bptr]"+r"(c->bytestream),
++                [i]"=&r"(i),
++              [tmp]"=&r"(tmp),
++               [st]"=&r"(st),
++               [rv]"=&r"(rv)
++          :  [state0]"r"(state0),
++                  [n]"r"(n),
++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++         : "memory", "cc"
++    );
++    return rv;
++}
++
++// For i = n down to 1: decode one bin with context state0[ctx_map[i]] and, when the bin is 1, store the byte i at *p++; returns the advanced p.
++// n must be > 0 on entry
++#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map,
++    uint8_t * p)
++{
++    unsigned int reg_b, tmp, st, bit;
++     __asm__ (
++// Get bin from map
++#if CONFIG_THUMB
++         "add        %[ctx_map]    , %[n]                        \n\t"
++         "ldrb       %[st]         , [%[ctx_map]]                \n\t"
++#else
++         "ldrb       %[st]         , [%[ctx_map], %[n]]!         \n\t"  // ctx_map += n; st = *ctx_map
++#endif
++         "1:                                                     \n\t"
++
++// Load state & ranges
++         "ldrb       %[bit]        , [%[state0], %[st]]          \n\t"
++         "and        %[tmp]        , %[range]    , #0xC0         \n\t"
++         "sub        %[r_b]        , %[mlps_tables], %[lps_off]  \n\t"
++         "add        %[r_b]        , %[r_b]      , %[tmp], lsl #1 \n\t"
++         "ldrb       %[tmp]        , [%[r_b], %[bit]]            \n\t"
++         "sub        %[range]      , %[range]    , %[tmp]        \n\t"
++
++         "cmp        %[low]        , %[range], lsl #17           \n\t"
++         "ittt       ge                                          \n\t"
++         "mvnge      %[bit]        , %[bit]                      \n\t"
++         "subge      %[low]        , %[low]      , %[range], lsl #17 \n\t"
++         "movge      %[range]      , %[tmp]                      \n\t"
++
++// Renorm
++         "clz        %[tmp]        , %[range]                    \n\t"
++         "ldrb       %[r_b]        , [%[mlps_tables], %[bit]]    \n\t"  // next state byte
++         "sub        %[tmp]        , #23                         \n\t"
++         "strb       %[r_b]        , [%[state0], %[st]]          \n\t"
++         "tst        %[bit]        , #1                          \n\t"  // Z clear when the decoded bin is 1
++         "ldrb       %[st]         , [%[ctx_map], #-1]!          \n\t"  // pre-load the context index for the next iteration
++         "lsl        %[low]        , %[low]      , %[tmp]        \n\t"
++// GCC asm seems to need strbne written differently for thumb and arm
++#if CONFIG_THUMB
++         "it         ne                                          \n\t"
++         "strbne     %[n]          , [%[idx]]    , #1            \n\t"
++#else
++         "strneb     %[n]          , [%[idx]]    , #1            \n\t"
++#endif
++
++// There is a small speed gain from combining both conditions, using a single
++// branch and then working out what that meant later
++         "subs       %[n]          , %[n]        , #1            \n\t"
++         "lsl        %[range]      , %[range]    , %[tmp]        \n\t"
++#if CONFIG_THUMB
++         "itt        ne                                          \n\t"
++         "lslsne     %[tmp]        , %[low]      , #16           \n\t"
++#else
++         "lslnes     %[tmp]        , %[low]      , #16           \n\t"
++#endif
++         "bne        1b                                          \n\t"
++
++// If we have bits left then n must be 0 so give up now
++         "lsls       %[tmp]        , %[low]      , #16           \n\t"
++         "bne        2f                                          \n\t"
++
++// Do reload
++         "ldrh       %[tmp]        , [%[bptr]]   , #2            \n\t"  // NOTE(review): no bytestream_end check is visible on this reload
++         "rbit       %[bit]        , %[low]                      \n\t"
++         "movw       %[r_b]        , #0xFFFF                     \n\t"
++         "clz        %[bit]        , %[bit]                      \n\t"
++         "cmp        %[n]          , #0                          \n\t"  // flags survive the non-flag-setting instructions up to the final bne
++         "rev        %[tmp]        , %[tmp]                      \n\t"
++         "sub        %[bit]        , %[bit]      , #16           \n\t"
++         "rsb        %[tmp]        , %[r_b]      , %[tmp], lsr #15 \n\t"
++
++#if CONFIG_THUMB
++         "lsl        %[tmp]        , %[tmp]      , %[bit]        \n\t"
++         "add        %[low]        , %[low]      , %[tmp]        \n\t"
++#else
++         "add        %[low]        , %[low]      , %[tmp], lsl %[bit] \n\t"
++#endif
++
++// Check to see if we still have more to do
++         "bne        1b                                          \n\t"
++         "2:                                                     \n\t"
++         :    [bit]"=&r"(bit),
++              [low]"+r"(c->low),
++            [range]"+r"(c->range),
++              [r_b]"=&r"(reg_b),
++             [bptr]"+r"(c->bytestream),
++              [idx]"+r"(p),
++                [n]"+r"(n),
++              [tmp]"=&r"(tmp),
++               [st]"=&r"(st),
++          [ctx_map]"+r"(ctx_map)
++          :  [state0]"r"(state0),
++        [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
++            [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
++         : "memory", "cc"
++    );
++
++    return p;
++}
++
++// ---------------------------------------------------------------------------
++//
++// CABAC_BY22 functions
++
++// Set up c for the by22 routines: rewrites c->low, c->bytestream and by22.bits (plus by22.range and c->range when !USE_BY22_DIV).
++#define get_cabac_by22_start get_cabac_by22_start_arm
++static inline void get_cabac_by22_start_arm(CABACContext * const c)
++{
++    const uint8_t *ptr = c->bytestream;
++    register uint32_t low __asm__("r1"), range __asm__("r2");  // presumably pinned so the ldmia register list below is in ascending order
++    uint32_t m, range8, bits;
++#if !USE_BY22_DIV
++    uintptr_t inv;
++#endif
++
++    av_assert2(offsetof (CABACContext, low) == 0);  // the ldmia below reads low then range from the start of *c
++    av_assert2(offsetof (CABACContext, range) == 4);
++    av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2);  // lets one 32-bit str write both by22 fields
++    __asm__ volatile (
++        "ldmia   %[c], {%[low], %[range]}                         \n\t"  // low = c->low, range = c->range in one load-multiple
++        : // Outputs
++               [low]"=r"(low),
++             [range]"=r"(range)
++        : // Inputs
++                 [c]"r"(c)
++        : // Clobbers
++    );
++#if !USE_BY22_DIV
++    inv = (uintptr_t)cabac_by22_inv_range;  // table base; replaced inside the asm by the entry for (range & 0xff)
++#endif
++    __asm__ volatile (
++        "ldr     %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t"  // m = 32-bit word at ptr; then ptr -= CABAC_BITS/8
++#if !USE_BY22_DIV
++        "uxtb    %[range8], %[range]                              \n\t"  // range8 = range & 0xff
++#endif
++        "rbit    %[bits], %[low]                                  \n\t"
++        "lsl     %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t"
++        "clz     %[bits], %[bits]                                 \n\t"  // bits = index of the lowest set bit of the original low
++        "str     %[ptr], [%[c], %[ptr_off]]                       \n\t"  // c->bytestream = adjusted ptr
++        "rev     %[m], %[m]                                       \n\t"
++        "rsb     %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t"  // ptr register reused as a shift count
++        "eor     %[m], %[m], #0x80000000                          \n\t"
++#if !USE_BY22_DIV
++        "ldr     %[inv], [%[inv], %[range8], lsl #2]              \n\t"  // inv = 32-bit table entry indexed by range8
++        "pkhbt   %[range], %[bits], %[range], lsl #16             \n\t"  // pack bits (low half) and range (high half)
++        "str     %[range], [%[c], %[bits_off]]                    \n\t"  // writes by22.bits and by22.range together
++#else
++        "strh    %[bits], [%[c], %[bits_off]]                     \n\t"
++#endif
++#if CONFIG_THUMB
++        "lsr     %[m], %[ptr]                                     \n\t"
++        "eor     %[range], %[low], %[m]                           \n\t"
++#else
++        "eor     %[range], %[low], %[m], lsr %[ptr]               \n\t"  // range register now holds the new low value
++#endif
++        : // Outputs
++               [ptr]"+&r"(ptr),
++               [low]"+&r"(low),
++             [range]"+&r"(range),
++#if !USE_BY22_DIV
++               [inv]"+&r"(inv),
++#endif
++                 [m]"=&r"(m),
++            [range8]"=&r"(range8),
++              [bits]"=&r"(bits)
++        : // Inputs
++                   [c]"r"(c),
++            [bits_off]"J"(offsetof (CABACContext, by22.bits)),
++             [ptr_off]"J"(offsetof (CABACContext, bytestream))
++        : // Clobbers
++            "memory"
++    );
++    c->low = range;  // the asm leaves the new low in the range register
++#if !USE_BY22_DIV
++    c->range = inv;  // c->range now carries the looked-up table entry, not the CABAC range
++#endif
++}
++// Returns v << 1, where v is (c->low & ~1), replaced by the high 32 bits of the 64-bit product v * c->range when c->range is non-zero.
++#define get_cabac_by22_peek get_cabac_by22_peek_arm
++static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
++{
++    uint32_t rv = c->low &~ 1, tmp;
++    __asm__ (
++        "cmp      %[inv] , #0                    \n\t"
++        "it       ne                             \n\t"
++        "umullne  %[tmp] , %[rv] , %[inv], %[rv] \n\t"  // inv != 0: rv = high word of inv * rv; the low word lands in tmp and is discarded
++        :  // Outputs
++             [rv]"+r"(rv),
++             [tmp]"=r"(tmp)
++        :  // Inputs
++             [inv]"r"(c->range)
++        :  // Clobbers
++                "cc"
++    );
++    return rv << 1;
++}
++// Advances by22.bits by n and recomputes c->low from the old low, the top n bits of val, by22.range and a 32-bit word fetched from the bytestream.
++#define get_cabac_by22_flush get_cabac_by22_flush_arm
++static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val)
++{
++    uint32_t bits, ptr, tmp1, tmp2;
++    __asm__ volatile (
++        "ldrh    %[bits], [%[cc], %[bits_off]]     \n\t"
++        "ldr     %[ptr], [%[cc], %[ptr_off]]       \n\t"
++        "rsb     %[tmp1], %[n], #32                \n\t"
++        "add     %[bits], %[bits], %[n]            \n\t"  // bits += n
++        "ldrh    %[tmp2], [%[cc], %[range_off]]    \n\t"  // tmp2 = by22.range
++        "lsr     %[tmp1], %[val], %[tmp1]          \n\t"  // tmp1 = val >> (32 - n), i.e. the top n bits of val
++        "ldr     %[val], [%[cc], %[low_off]]       \n\t"  // val register reused for c->low from here on
++#if CONFIG_THUMB
++        "add     %[ptr], %[ptr], %[bits], lsr #3   \n\t"
++        "ldr     %[ptr], [%[ptr]]                  \n\t"
++#else
++        "ldr     %[ptr], [%[ptr], %[bits], lsr #3] \n\t"  // 32-bit word at bytestream + (bits >> 3)
++#endif
++        "mul     %[tmp1], %[tmp2], %[tmp1]         \n\t"  // tmp1 = by22.range * (top n bits of val)
++        "and     %[tmp2], %[bits], #7              \n\t"
++        "strh    %[bits], [%[cc], %[bits_off]]     \n\t"
++        "rev     %[ptr], %[ptr]                    \n\t"
++        "lsl     %[tmp1], %[tmp1], #23             \n\t"
++#if CONFIG_THUMB
++        "lsl     %[val], %[n]                      \n\t"
++        "sub     %[val], %[tmp1]                   \n\t"
++#else
++        "rsb     %[val], %[tmp1], %[val], lsl %[n] \n\t"  // low = (low << n) - tmp1
++#endif
++        "lsl     %[ptr], %[ptr], %[tmp2]           \n\t"  // drop the (bits & 7) bits already used in the first byte
++        "orr     %[val], %[val], %[ptr], lsr #9    \n\t"
++        "str     %[val], [%[cc], %[low_off]]       \n\t"
++        :  // Outputs
++            [val]"+r"(val),
++           [bits]"=&r"(bits),
++            [ptr]"=&r"(ptr),
++           [tmp1]"=&r"(tmp1),
++           [tmp2]"=&r"(tmp2)
++        :  // Inputs
++                  [cc]"r"(c),
++                   [n]"r"(n),
++            [bits_off]"J"(offsetof(CABACContext, by22.bits)),
++             [ptr_off]"J"(offsetof(CABACContext, bytestream)),
++           [range_off]"J"(offsetof(CABACContext, by22.range)),
++             [low_off]"J"(offsetof(CABACContext, low))
++        :  // Clobbers
++           "memory"
++    );
++}
++
++#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm // route the generic name to the ARM inline-asm version below
++static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param) // decodes one prefix+suffix value from the bypass bits; updates c->low and c->by22.bits
++{
++    uint32_t last_coeff_abs_level_remaining; // decoded result (asm operand remain)
++    uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2; // scratch registers; their roles change between the three paths below
++    __asm__ volatile (
++        "ldr     %[remain], [%[cc], %[low_off]]               \n\t" // remain = c->low
++        "ldr     %[prefix], [%[cc], %[range_off]]             \n\t" // prefix = c->range, used as the multiplier below
++        "bic     %[remain], %[remain], #1                     \n\t" // clear bit 0 of low
++        "ldrh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t" // tmp2 = c->by22.bits
++        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t" // ptr = c->bytestream
++        "cmp     %[prefix], #0                                \n\t" // multiplier == 0: skip the scaling
++        "it      ne                                           \n\t"
++        "umullne %[prefix], %[remain], %[prefix], %[remain]   \n\t" // remain = high 32 bits of remain * multiplier
++        "ldrh    %[range], [%[cc], %[by22_range_off]]         \n\t" // range = c->by22.range
++        "lsl     %[remain], %[remain], #1                     \n\t" // remain = peeked bits, same sequence as get_cabac_by22_peek_arm
++        "mvn     %[prefix], %[remain]                         \n\t"
++        "clz     %[prefix], %[prefix]                         \n\t" // prefix = number of leading 1 bits in the peeked value
++        "rsbs    %[n1], %[prefix], #2                         \n\t" // flags from 2 - prefix; carry stays set when prefix <= 2
++        "bcc     1f                                           \n\t" // prefix > 2: go to the longer-code handling at label 1
++        "adc     %[n1], %[rice], %[prefix]                    \n\t" // carry is set here: n1 = rice + prefix + 1 bits consumed
++        "add     %[tmp2], %[tmp2], %[n1]                      \n\t" // advance the bit position
++        "rsb     %[n2], %[n1], #32                            \n\t"
++        "and     %[tmp1], %[tmp2], #7                         \n\t" // tmp1 = bit offset inside the byte, used at label 3
++        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t" // store new c->by22.bits
++        "lsr     %[tmp2], %[tmp2], #3                         \n\t" // byte offset into the stream
++        "lsr     %[n2], %[remain], %[n2]                      \n\t" // n2 = top n1 bits of the peeked value
++        "mul     %[n2], %[range], %[n2]                       \n\t" // n2 = by22.range * consumed bits
++        "ldr     %[range], [%[cc], %[low_off]]                \n\t" // range register now holds c->low
++        "ldr     %[ptr], [%[ptr], %[tmp2]]                    \n\t" // fetch 32 bits at the new stream position
++        "rsb     %[tmp2], %[rice], #31                        \n\t" // tmp2 = 31 - rice
++        "lsl     %[remain], %[remain], %[prefix]              \n\t" // shift out the leading 1 bits
++        "lsl     %[n2], %[n2], #23                            \n\t"
++#if CONFIG_THUMB
++        "lsl     %[range], %[n1]                              \n\t"
++        "sub     %[range], %[n2]                              \n\t" // range = (low << n1) - n2
++#else
++        "rsb     %[range], %[n2], %[range], lsl %[n1]         \n\t" // range = (low << n1) - n2
++#endif
++        "rev     %[ptr], %[ptr]                               \n\t" // byte-reverse the fetched word
++        "lsl     %[n2], %[prefix], %[rice]                    \n\t" // n2 = prefix << rice
++#if CONFIG_THUMB
++        "lsr     %[remain], %[tmp2]                           \n\t"
++        "add     %[remain], %[n2]                             \n\t" // result = (prefix << rice) + (remain >> (31 - rice))
++#else
++        "add     %[remain], %[n2], %[remain], lsr %[tmp2]     \n\t" // result = (prefix << rice) + (remain >> (31 - rice))
++#endif
++        "b       3f                                           \n\t" // finish with the shared refill and store
++        "1:                                                   \n\t" // prefix > 2
++        "add     %[n2], %[rice], %[prefix], lsl #1            \n\t" // n2 = rice + 2 * prefix
++        "cmp     %[n2], %[peek_bits_plus_2]                   \n\t"
++        "bhi     2f                                           \n\t" // more than CABAC_BY22_PEEK_BITS + 2: use the two-step path at label 2
++        "sub     %[n1], %[n2], #2                             \n\t" // n1 = rice + 2 * prefix - 2 bits consumed
++        "add     %[tmp2], %[tmp2], %[n1]                      \n\t" // advance the bit position
++        "rsb     %[n2], %[n1], #32                            \n\t"
++        "strh    %[tmp2], [%[cc], %[by22_bits_off]]           \n\t" // store new c->by22.bits
++        "lsr     %[tmp1], %[tmp2], #3                         \n\t" // byte offset into the stream
++        "lsr     %[n2], %[remain], %[n2]                      \n\t" // n2 = top n1 bits of the peeked value
++        "mul     %[n2], %[range], %[n2]                       \n\t" // n2 = by22.range * consumed bits
++        "rsb     %[range], %[rice], #34                       \n\t" // range register reused as 34 - rice
++        "ldr     %[ptr], [%[ptr], %[tmp1]]                    \n\t" // fetch 32 bits at the new stream position
++        "and     %[tmp1], %[tmp2], #7                         \n\t" // bit offset inside the byte
++        "lsl     %[remain], %[remain], %[prefix]              \n\t" // shift out the leading 1 bits
++        "ldr     %[tmp2], [%[cc], %[low_off]]                 \n\t" // tmp2 = c->low
++        "rsb     %[prefix], %[prefix], %[range]               \n\t" // prefix register = 34 - rice - prefix (final right shift)
++        "orr     %[remain], %[remain], #0x80000000            \n\t" // force the top bit to 1
++        "rev     %[ptr], %[ptr]                               \n\t"
++        "lsl     %[n2], %[n2], #23                            \n\t"
++        "mov     %[range], #2                                 \n\t"
++#if CONFIG_THUMB
++        "lsl     %[tmp2], %[n1]                               \n\t"
++        "sub     %[tmp2], %[n2]                               \n\t" // tmp2 = (low << n1) - n2
++#else
++        "rsb     %[tmp2], %[n2], %[tmp2], lsl %[n1]           \n\t" // tmp2 = (low << n1) - n2
++#endif
++        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t" // refill done inline on this path
++        "lsl     %[rice], %[range], %[rice]                   \n\t" // rice register overwritten with 2 << rice
++        "orr     %[range], %[tmp2], %[ptr], lsr #9            \n\t" // range = new low value with fresh stream bits merged in
++#if CONFIG_THUMB
++        "lsr     %[remain], %[prefix]                         \n\t"
++        "add     %[remain], %[rice]                           \n\t" // result = (2 << rice) + (remain >> prefix)
++#else
++        "add     %[remain], %[rice], %[remain], lsr %[prefix] \n\t" // result = (2 << rice) + (remain >> prefix)
++#endif
++        "b       4f                                           \n\t" // only the store of low remains
++        "2:                                                   \n\t" // two-step path: flush the prefix bits, peek again, then take the suffix
++        "add     %[n1], %[tmp2], %[prefix]                    \n\t" // n1 = bit position after the prefix bits
++#if CONFIG_THUMB
++        "add     %[tmp2], %[ptr], %[n1], lsr #3               \n\t"
++        "ldr     %[tmp2], [%[tmp2]]                           \n\t" // tmp2 = 32 bits at bytestream + (n1 >> 3)
++#else
++        "ldr     %[tmp2], [%[ptr], %[n1], lsr #3]             \n\t" // tmp2 = 32 bits at bytestream + (n1 >> 3)
++#endif
++        "rsb     %[tmp1], %[prefix], #32                      \n\t"
++        "push    {%[rice]}                                    \n\t" // save rice on the stack so its register can serve as scratch; restored by the pop below
++        "and     %[rice], %[n1], #7                           \n\t" // bit offset inside the byte
++        "lsr     %[tmp1], %[remain], %[tmp1]                  \n\t" // tmp1 = top prefix bits of the peeked value
++        "ldr     %[ptr], [%[cc], %[low_off]]                  \n\t" // ptr register reused for c->low
++        "mul     %[remain], %[range], %[tmp1]                 \n\t" // by22.range * consumed bits
++        "rev     %[tmp2], %[tmp2]                             \n\t"
++        "rsb     %[n2], %[prefix], %[n2]                      \n\t" // n2 = rice + prefix
++        "ldr     %[tmp1], [%[cc], %[range_off]]               \n\t" // tmp1 = c->range, multiplier for the second peek
++        "lsl     %[rice], %[tmp2], %[rice]                    \n\t" // align the fetched stream bits
++        "sub     %[tmp2], %[n2], #2                           \n\t" // tmp2 = rice + prefix - 2 = bits consumed in the second step
++        "lsl     %[remain], %[remain], #23                    \n\t"
++#if CONFIG_THUMB
++        "lsl     %[ptr], %[prefix]                            \n\t"
++        "rsb     %[remain], %[ptr]                            \n\t" // remain = (low << prefix) - remain
++#else
++        "rsb     %[remain], %[remain], %[ptr], lsl %[prefix]  \n\t" // remain = (low << prefix) - remain
++#endif
++        "orr     %[remain], %[remain], %[rice], lsr #9        \n\t" // remain = intermediate low value after the first flush
++        "add     %[prefix], %[n1], %[tmp2]                    \n\t" // prefix register = final bit position
++        "bic     %[n1], %[remain], #1                         \n\t" // start of second peek: clear bit 0
++        "ldr     %[ptr], [%[cc], %[ptr_off]]                  \n\t" // reload c->bytestream
++        "cmp     %[tmp1], #0                                  \n\t"
++        "rsb     %[rice], %[tmp2], #32                        \n\t"
++        "it      ne                                           \n\t"
++        "umullne %[tmp1], %[n1], %[tmp1], %[n1]               \n\t" // n1 = high 32 bits of n1 * multiplier
++        "and     %[tmp1], %[prefix], #7                       \n\t" // bit offset for the refill at label 3
++#if CONFIG_THUMB
++        "add     %[ptr], %[ptr], %[prefix], lsr #3            \n\t"
++        "ldr     %[ptr], [%[ptr]]                             \n\t" // fetch 32 bits at the final stream position
++#else
++        "ldr     %[ptr], [%[ptr], %[prefix], lsr #3]          \n\t" // fetch 32 bits at the final stream position
++#endif
++        "lsl     %[n1], %[n1], #1                             \n\t" // n1 = second peeked value
++        "lsr     %[rice], %[n1], %[rice]                      \n\t" // top tmp2 bits of the second peek
++        "rsb     %[n2], %[n2], #34                            \n\t" // n2 = 34 - rice - prefix (final right shift)
++        "mul     %[range], %[range], %[rice]                  \n\t" // by22.range * bits consumed in the second step
++        "pop     {%[rice]}                                    \n\t" // restore the original rice value
++        "rev     %[ptr], %[ptr]                               \n\t"
++        "orr     %[n1], %[n1], #0x80000000                    \n\t" // force the top bit to 1
++        "strh    %[prefix], [%[cc], %[by22_bits_off]]         \n\t" // store new c->by22.bits
++        "mov     %[prefix], #2                                \n\t"
++        "lsl     %[range], %[range], #23                      \n\t"
++#if CONFIG_THUMB
++        "lsl     %[remain], %[tmp2]                           \n\t"
++        "rsb     %[range], %[remain]                          \n\t" // range = (remain << tmp2) - range
++#else
++        "rsb     %[range], %[range], %[remain], lsl %[tmp2]   \n\t" // range = (remain << tmp2) - range
++#endif
++        "lsl     %[remain], %[prefix], %[rice]                \n\t" // remain = 2 << rice
++#if CONFIG_THUMB
++        "lsr     %[n1], %[n2]                                 \n\t"
++        "add     %[remain], %[n1]                             \n\t" // result = (2 << rice) + (n1 >> n2)
++#else
++        "add     %[remain], %[remain], %[n1], lsr %[n2]       \n\t" // result = (2 << rice) + (n1 >> n2)
++#endif
++        "3:                                                   \n\t" // shared refill for the short path and the two-step path
++        "lsl     %[ptr], %[ptr], %[tmp1]                      \n\t" // drop already-consumed bits of the first byte
++        "orr     %[range], %[range], %[ptr], lsr #9           \n\t" // merge fresh stream bits into the new low value
++        "4:                                                   \n\t" // all paths join here
++        "str     %[range], [%[cc], %[low_off]]                \n\t" // c->low = new low value
++        :  // Outputs
++            [remain]"=&r"(last_coeff_abs_level_remaining),
++              [rice]"+r"(rice_param), // read-write: overwritten on the label 1 path, saved and restored on the label 2 path
++            [prefix]"=&r"(prefix),
++                [n1]"=&r"(n1),
++             [range]"=&r"(range),
++                [n2]"=&r"(n2),
++               [ptr]"=&r"(ptr),
++              [tmp1]"=&r"(tmp1),
++              [tmp2]"=&r"(tmp2)
++        :  // Inputs
++                          [cc]"r"(c),
++            [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
++                     [low_off]"J"(offsetof(CABACContext, low)),
++                   [range_off]"J"(offsetof(CABACContext, range)),
++               [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
++              [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
++                     [ptr_off]"J"(offsetof(CABACContext, bytestream))
++        :  // Clobbers
++           "cc", "memory" // flags are set by cmp/rsbs; fields of *c are read and written and the stack is used by push/pop
++    );
++    return last_coeff_abs_level_remaining; // uint32_t result converted to the int return type
++}
++
++#endif /* HAVE_ARMV6T2_INLINE */
++
++#endif /* AVCODEC_ARM_HEVC_CABAC_H */
+diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
+new file mode 100644
+index 0000000000..978b7b6947
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
+@@ -0,0 +1,183 @@
++/*
++ * ARM NEON optimised IDCT functions for HEVC decoding
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++@ Included multiple times from hevc_idct_neon.S
++@ Macros defined there
++
++#define DC_SHIFT  (15 - BIT_DEPTH) /* right shift applied to the DC coefficient by the *_dc functions below */
++#define DC_ADD    (1 | (1 << (14 - BIT_DEPTH))) /* constant added to the DC coefficient before that shift */
++#define TRN_SHIFT (20 - BIT_DEPTH) /* shift passed to the second transform pass (the first pass uses 7) */
++
++function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1
++        ldrsh       r1, [r0]                @ r1 = first 16-bit coefficient of the block at r0 (sign-extended)
++        add         r1, #DC_ADD             @ add the constant defined at the top of this file
++        asr         r1, #DC_SHIFT           @ arithmetic shift down to the final DC value
++        vdup.16     q0, r1                  @ replicate the DC value into 8 lanes
++        vdup.16     q1, r1                  @ and into 8 more
++        vst1.16     {q0, q1}, [r0]          @ overwrite 32 bytes = 16 coefficients in place
++        bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1
++        ldrsh       r1, [r0]                @ r1 = first 16-bit coefficient of the block at r0
++        add         r2, r0, #32             @ second store pointer, 32 bytes further on
++        mov         r3, #64                 @ each pointer advances 64 bytes per store
++        add         r1, #DC_ADD
++        asr         r1, #DC_SHIFT           @ r1 = final DC value
++        vdup.16     q8, r1
++        vdup.16     q9, r1                  @ q8:q9 = 16 copies of the DC value
++        vst1.16     {q8, q9}, [r0], r3      @ bytes 0-31
++        vst1.16     {q8, q9}, [r2], r3      @ bytes 32-63
++        vst1.16     {q8, q9}, [r0]          @ bytes 64-95
++        vst1.16     {q8, q9}, [r2]          @ bytes 96-127, 64 coefficients written in total
++        bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1
++        ldrsh       r1, [r0]                @ r1 = first 16-bit coefficient of the block at r0
++        add         r2, r0, #32             @ second store pointer, 32 bytes further on
++        mov         r3, #64                 @ each pointer advances 64 bytes per store
++        add         r1, #DC_ADD
++        mov         ip, #16*16              @ ip = number of coefficients still to write
++        asr         r1, #DC_SHIFT           @ r1 = final DC value
++        vdup.16     q8, r1
++        vdup.16     q9, r1                  @ q8:q9 = 16 copies of the DC value
++1:      vst1.16     {q8, q9}, [r0], r3
++        subs        ip, ip, #32             @ the two stores of one pass cover 32 coefficients
++        vst1.16     {q8, q9}, [r2], r3
++        bhi         1b                      @ loop while the remaining count is above zero
++        bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1
++        ldrsh       r1, [r0]                @ r1 = first 16-bit coefficient of the block at r0
++        add         r2, r0, #32             @ second store pointer, 32 bytes further on
++        mov         r3, #64                 @ each pointer advances 64 bytes per store
++        add         r1, #DC_ADD
++        mov         ip, #32*32              @ ip = number of coefficients still to write
++        asr         r1, #DC_SHIFT           @ r1 = final DC value
++        vdup.16     q8, r1
++        vdup.16     q9, r1                  @ q8:q9 = 16 copies of the DC value
++1:      vst1.16     {q8, q9}, [r0], r3
++        subs        ip, ip, #32             @ the two stores of one pass cover 32 coefficients
++        vst1.16     {q8, q9}, [r2], r3
++        bhi         1b                      @ loop while the remaining count is above zero
++        bx lr
++endfunc
++
++
++function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1
++        vldr.i32    s0, =0x00240053 // 36 and 83
++        vld1.16     {q14, q15}, [r0 :256]  // coeffs
++
++        tr4_shift   #7                     @ first pass with shift 7 (macro defined by the including file)
++
++        vzip.16     d28, d29               @ the three zips rearrange q14/q15 between the passes
++        vzip.16     d30, d31               @ NOTE(review): presumably a 4x4 transpose - confirm against tr4_shift
++        vzip.32     q14, q15
++
++        tr4_shift   #TRN_SHIFT             @ second pass with the bit-depth dependent shift
++
++        vst4.16     {q14, q15}, [r0 :256]  @ interleaving store of the result back over the input block
++        bx lr
++
++        .ltorg                             @ literal pool for the vldr = constant above
++endfunc
++
++
++
++function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1
++        vmov.i32    d0, #0x4a  // 74
++        vld1.16     {q14, q15}, [r0 :256]  // coeffs
++        vmov.i32    d1, #0x1d  // 29
++        vmov.i32    d2, #0x37  // 55
++
++        tr4_luma_shift #7                  @ first pass with shift 7 (macro defined by the including file)
++
++        vzip.16     d28, d29               @ same rearrangement between passes as in the 4x4 transform above
++        vzip.16     d30, d31
++        vzip.32     q14, q15
++
++        tr4_luma_shift #TRN_SHIFT          @ second pass with the bit-depth dependent shift
++
++        vst4.16     {q14, q15}, [r0 :256]  @ interleaving store of the result back over the input block
++        bx lr
++endfunc
++
++function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1
++        add      r2, r0, #16               @ r2 = second pointer, 16 bytes into the block at r0
++        adr      r3, tr4f                  @ address of a constant table defined outside this file
++        vpush    {d8-d15}                  @ save d8-d15 for the duration of the function
++        vld1.16  {d0, d1}, [r3]            @ d0:d1 = eight 16-bit constants from that table
++        mov      r3, #32
++
++        tr8_vert  d16, d17, d18, d19, d24, d25, d26, d27, q8,  q9,  \
++            "sub      r0, r0, #128-8",                              \
++            "sub      r2, r2, #128-8",                              \
++            "cmp      r1, #4"
++        ble      2f                        @ r1 <= 4: skip the second vertical pass, label 2 zeroes its outputs instead
++
++        tr8_vert  d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \
++            "sub      r0, r0, #128+8",                              \
++            "sub      r2, r2, #128+8+16-32",                        \
++            "mov      r3, #64"
++
++        vzip.16  d16, d17
++        vzip.16  d18, d19
++
++        vzip.16  d20, d21
++        vzip.16  d22, d23
++        vzip.16  d28, d29
++        vzip.16  d30, d31
++        vzip.32  q10, q11
++        vzip.32  q14, q15
++1:                                         @ common tail, also reached from the short path at label 2
++        vzip.16  d24, d25
++        vzip.16  d26, d27
++        vzip.32  q8, q9
++        vzip.32  q12, q13
++
++        tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8,  q9,  TRN_SHIFT
++        tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT
++
++        vpop     {d8-d15}                  @ restore the registers saved on entry
++        bx       lr
++
++2:      vmov.i64 q10, #0                   @ short path: clear the registers the skipped pass would have produced
++        sub      r0, r0, #8                @ pointer and stride fix-ups interleaved with the clears
++        vmov.i64 q11, #0
++        sub      r2, r2, #8+16-32
++        vmov.i64 q14, #0
++        mov      r3, #64
++        vmov.i64 q15, #0
++
++        vzip.16  d16, d17                  @ the zips of the first half that the long path performs before label 1
++        vzip.16  d18, d19
++
++        b        1b
++
++endfunc
++
++#undef DC_SHIFT
++#undef DC_ADD
++#undef TRN_SHIFT
++
+diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S
+new file mode 100644
+index 0000000000..161bb0d7c9
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_misc_neon.S
+@@ -0,0 +1,267 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Written by John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ rpi_zap_coeff_vals_neon(
++@   uint16_t * buf,          [r0]
++@   unsigned int log_n_m2)   [r1]
++
++function rpi_zap_coeff_vals_neon, export=1
++        mov      ip, #1
++        vmov.i64 q0, #0           @ q0:q1 = 32 bytes of zero
++        teq      r1, #0
++        vmov.i64 q1, #0
++        beq      2f               @ log_n_m2 == 0: a single 32-byte store is enough
++
++        lsl      ip, r1    @ 2, 4 or 8
++        add      r2, r0, #32      @ second store pointer, 32 bytes further on
++        lsl      ip, r1    @ 4, 16 or 64 = number of 32-byte blocks to zero
++        mov      r3, #64          @ each pointer advances 64 bytes per store
++1:      vst1.8   {q0,q1}, [r0:256], r3
++        subs     ip, #2           @ two 32-byte blocks are written per pass
++        vst1.8   {q0,q1}, [r2:256], r3
++        bne      1b
++        bx       lr
++
++2:      vst1.8   {q0,q1}, [r0:256]   @ single block case
++        bx       lr
++endfunc
++
++@ PIC jump tables are more expensive than absolute for A32 code
++.set jent_pic, CONFIG_PIC || CONFIG_THUMB
++
++@ Jump table entry - if in Thumb mode the bottom bit must be set
++@ ? There is probably a real asm instruction to do this but I haven't found it
++.macro jent lab
++.if jent_pic
++T       .short ((0 + \lab) - (0 + 98b)) / 2   @ Thumb: halfword offset from the table label 98, as consumed by tbh
++A       .short (0 + \lab) - (4 + 98b)         @ ARM PIC: byte offset relative to 98b + 4, added to pc by the dispatch code
++.else
++T       .word   1 + \lab
++A       .word   \lab                          @ non-PIC ARM: absolute address loaded straight into pc
++.endif
++.endm
++
++.set expected_next, 0                @ label the next cpy_compound must define after a drop-through, 0 = none
++
++.macro cpy_compound val, p1, p2, drop_thru=0
++.if \p1 + \p2 != \val                @ assemble-time check that the two parts add up to the width
++.error "Bad addition!  \p1 + \p2 != \val"
++.endif
++.if expected_next != 0 && expected_next != \val   @ the previous macro fell through, so this one must define that label
++.error "Drop thru failure"
++.endif
++\val\():
++        push       {r0-r3}           @ keep the pointers and strides across the call
++        bl          100\p1\()b       @ copy the first p1 bytes of every row via the callable entry for that width
++        pop        {r0-r3}
++        add         r0, #\p1         @ step dst past the part just copied
++        add         r2, #\p1         @ step src likewise
++.if \drop_thru == 0
++        b           \p2\()b          @ copy the remaining p2 bytes with the handler for that width
++.set expected_next, 0
++.else
++.set expected_next, \p2              @ no branch emitted: the code for width p2 has to follow immediately
++.endif
++.endm
++
++@ ff_hevc_rpi_cpy_blks8x4_neon(
++@   dst         [r0]
++@   dst_stride  [r1]
++@   src         [r2]
++@   src_stride  [r3]
++@   width       [sp, #0] (bytes)
++@   height)     [sp, #4]
++@
++@ Power of 2 widths are directly coded, all others are done in stripes
++@ We expect the vast majority of calls to be power of 2
++@
++@ Currently has min width of 8, but we could make that 4 without issue
++@ Min height is 4
++
++function ff_hevc_rpi_cpy_blks8x4_neon, export=1
++        ldr         r12, [sp, #0]            @ r12 = width in bytes
++        push       {r11, lr}
++.if jent_pic
++A       adr         lr,  98f - 2             @ table base biased by one 2-byte entry because the index starts at 1
++.else
++A       adr         lr,  98f - 4             @ table base biased by one 4-byte entry because the index starts at 1
++.endif
++        lsr         r12, #3                  @ r12 = width / 8 = table index 1..16
++        ldr         r11, [sp, #(8 + 4)]      @ r11 = height; the extra 8 skips the two registers just pushed
++.if jent_pic
++A       lsl         r12, #1
++A       ldrsh       lr,  [lr,  r12]          @ signed 16-bit offset of the handler
++A       add         pc,  lr
++T       tbh         [pc, r12, lsl #1]
++.else
++        @ A32 only, Thumb is always PIC
++        ldr         pc,  [lr,  r12, lsl #2]
++.endif
++
++98:
++T       .short      0 @ unused
++        jent        8f
++        jent        16f
++        jent        24f
++        jent        32f
++        jent        40f
++        jent        48f
++        jent        56f
++        jent        64f
++        jent        72f
++        jent        80f
++        jent        88f
++        jent        96f
++        jent        104f
++        jent        112f
++        jent        120f
++        jent        128f
++
++1008:                                        @ callable entry: pushes its own frame so the shared pop returns to the bl site
++        push       {r11, lr}
++8:                                           @ width 8: four rows per pass
++        add         lr,  r2,  r3             @ lr  = src + src_stride (odd rows)
++        lsl         r3,  #1                  @ both source pointers step two rows at a time
++        add         r12, r0,  r1             @ r12 = dst + dst_stride (odd rows)
++        lsl         r1,  #1                  @ both destination pointers step two rows at a time
++1:
++        vld1.32    {d0 }, [r2],  r3
++        vld1.32    {d1 }, [lr],  r3
++        vld1.32    {d2 }, [r2],  r3
++        vld1.32    {d3 }, [lr],  r3
++        subs        r11,  #4                 @ four rows done
++        vst1.32    {d0 }, [r0],  r1
++        vst1.32    {d1 }, [r12], r1
++        vst1.32    {d2 }, [r0],  r1
++        vst1.32    {d3 }, [r12], r1
++        bgt         1b
++        pop        {r11, pc}                 @ restores r11 and returns
++
++10016:                                       @ callable entry for width 16
++        push       {r11, lr}
++16:                                          @ width 16: four rows per pass, same pointer scheme as width 8
++        add         lr,  r2,  r3
++        lsl         r3,  #1
++        add         r12, r0,  r1
++        lsl         r1,  #1
++1:
++        vld1.32    {q0 }, [r2],  r3
++        vld1.32    {q1 }, [lr],  r3
++        vld1.32    {q2 }, [r2],  r3
++        vld1.32    {q3 }, [lr],  r3
++        subs        r11, #4
++        vst1.32    {q0 }, [r0],  r1
++        vst1.32    {q1 }, [r12], r1
++        vst1.32    {q2 }, [r0],  r1
++        vst1.32    {q3 }, [r12], r1
++        bgt         1b
++        pop        {r11, pc}
++
++10032:                                       @ callable entry for width 32
++        push       {r11, lr}
++32:                                          @ width 32: four rows per pass, same pointer scheme as width 8
++        add         lr,  r2,  r3
++        lsl         r3,  #1
++        add         r12, r0,  r1
++        lsl         r1,  #1
++1:
++        vld1.32    {q8,  q9 }, [r2],  r3
++        vld1.32    {q10, q11}, [lr],  r3
++        vld1.32    {q12, q13}, [r2],  r3
++        vld1.32    {q14, q15}, [lr],  r3
++        subs        r11, #4
++        vst1.32    {q8,  q9 }, [r0],  r1
++        vst1.32    {q10, q11}, [r12], r1
++        vst1.32    {q12, q13}, [r0],  r1
++        vst1.32    {q14, q15}, [r12], r1
++        bgt         1b
++        pop        {r11, pc}
++
++10064:                                       @ callable entry for width 64
++        push       {r11, lr}
++64:                                          @ width 64: two rows per pass, each row split into two 32-byte halves
++        add         lr,  r2,  #32            @ lr  = second half of the source row
++        add         r12, r0,  #32            @ r12 = second half of the destination row
++1:
++        vld1.32    {q8,  q9 }, [r2],  r3
++        vld1.32    {q10, q11}, [lr],  r3
++        vld1.32    {q12, q13}, [r2],  r3
++        vld1.32    {q14, q15}, [lr],  r3
++        subs        r11, #2                  @ two rows done
++        vst1.32    {q8,  q9 }, [r0],  r1
++        vst1.32    {q10, q11}, [r12], r1
++        vst1.32    {q12, q13}, [r0],  r1
++        vst1.32    {q14, q15}, [r12], r1
++        bgt         1b
++        pop        {r11, pc}
++
++128:                                         @ width 128: one row per pass, four 32-byte pieces
++        push       {r4, r5}
++        @ We could do this with fewer registers if we jump around but I
++        @ have a primitive urge to load sequentially
++        mov         r4,  #64
++        add         lr,  r2,  #32
++        add         r12, r0,  #32
++        sub         r3,  r4                  @ strides reduced by 64 because each pointer first steps 64 within the row
++        sub         r1,  r4
++1:
++        vld1.32    {q8,  q9 }, [r2],  r4
++        vld1.32    {q10, q11}, [lr],  r4
++        vld1.32    {q12, q13}, [r2],  r3
++        vld1.32    {q14, q15}, [lr],  r3
++        subs        r11, #1                  @ one row done
++        vst1.32    {q8,  q9 }, [r0],  r4
++        vst1.32    {q10, q11}, [r12], r4
++        vst1.32    {q12, q13}, [r0],  r1
++        vst1.32    {q14, q15}, [r12], r1
++        bgt         1b
++        pop        {r4, r5, r11, pc}         @ also pops the r11 and lr pushed at function entry
++
++@ Use drop_thru where we can
++cpy_compound 104, 64, 40, 1
++cpy_compound 40, 32, 8
++
++cpy_compound 112, 64, 48, 1
++cpy_compound 48, 32, 16
++
++cpy_compound 120, 64, 56, 1
++cpy_compound 56, 32, 24, 1
++cpy_compound 24, 16, 8
++
++cpy_compound 72, 64, 8
++cpy_compound 80, 64, 16
++cpy_compound 88, 64, 24
++cpy_compound 96, 64, 32
++
++
++endfunc
++
+diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h
+new file mode 100644
+index 0000000000..9d21f6a882
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_misc_neon.h
+@@ -0,0 +1,438 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
++#define AVCODEC_ARM_RPI_HEVC_MISC_H
++
++#include "config.h"
++#if HAVE_NEON_INLINE && !CONFIG_THUMB
++
++static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
++                                                       int pixel_shift, int height,
++                                                       ptrdiff_t stride_src)
++{
++    const uint8_t *src2 = src + stride_src;
++    stride_src <<= 1;
++    switch (pixel_shift)
++    {
++        case 2:
++            __asm__ volatile (
++                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.32     {d2[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d2[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.32     {d3[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d3[1]}, [%[src2]], %[stride_src] \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.32     {q0}, [%[dst]]!                   \n\t"
++                "beq         3f                                \n\t"
++                "vld1.32     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d0[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.32     {d1[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.32     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.32     {q1}, [%[dst]]!                   \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vst1.32     {q0}, [%[dst]]                    \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vst1.32     {q1}, [%[dst]]                    \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [src]"+r"(src),
++                          [src2]"+r"(src2),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        case 1:
++            __asm__ volatile (
++                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.16     {d2[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d3[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.16     {d2[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d3[1]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.16     d0, d1                            \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.16     {d0}, [%[dst]]!                   \n\t"
++                "beq         3f                                \n\t"
++                "vld1.16     {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.16     {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.16     {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.16     d2, d3                            \n\t"
++                "subs        %[height], #4                     \n\t"
++                "vst1.16     {d2}, [%[dst]]!                   \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vzip.16     d0, d1                            \n\t"
++                "vst1.16     {d0}, [%[dst]]                    \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vzip.16     d2, d3                            \n\t"
++                "vst1.16     {d2}, [%[dst]]                    \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [src]"+r"(src),
++                          [src2]"+r"(src2),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        default:
++            __asm__ volatile (
++                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
++                "subs        %[height], #8                     \n\t"
++                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
++                "beq         2f                                \n\t"
++                "1:                                            \n\t"
++                "vld1.8      {d2[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d2[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d2[2]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[2]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d2[3]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d3[3]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.8      d0, d1                            \n\t"
++                "subs        %[height], #8                     \n\t"
++                "vst1.8      {d0}, [%[dst]]!                   \n\t"
++                "beq         3f                                \n\t"
++                "vld1.8      {d0[0]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[0]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[1]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[1]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[2]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[2]}, [%[src2]], %[stride_src] \n\t"
++                "vld1.8      {d0[3]}, [%[src]], %[stride_src]  \n\t"
++                "vld1.8      {d1[3]}, [%[src2]], %[stride_src] \n\t"
++                "vzip.8      d2, d3                            \n\t"
++                "subs        %[height], #8                     \n\t"
++                "vst1.8      {d2}, [%[dst]]!                   \n\t"
++                "bne         1b                                \n\t"
++                "2:                                            \n\t"
++                "vzip.8      d0, d1                            \n\t"
++                "vst1.8      {d0}, [%[dst]]                    \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"
++                "vzip.8      d2, d3                            \n\t"
++                "vst1.8      {d2}, [%[dst]]                    \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [src]"+r"(src),
++                          [src2]"+r"(src2),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++    }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src,
++                                                       int pixel_shift, int height,
++                                                      ptrdiff_t stride_dst)
++{   // Scatter `height` consecutive samples read from src into dst, one sample per row (rows stride_dst bytes apart).
++    uint8_t *dst2 = dst + stride_dst;  // second write pointer, one row below dst
++    stride_dst <<= 1;                  // dst takes even rows and dst2 odd rows, so each pointer steps two rows
++    switch (pixel_shift)               // selects the sample width: 2 -> 32-bit, 1 -> 16-bit, anything else -> 8-bit
++    {
++        case 2:  // 4 samples per group; loop ends only on height == 0, so height is presumably a positive multiple of 4
++            __asm__ volatile (
++                "subs        %[height], #4                     \n\t"  // count off the first group; Z set if it is the only one
++                "vld1.32     {q0}, [%[src]]!                   \n\t"  // q0 = first 4 samples, src advances
++                "beq         2f                                \n\t"  // nothing more to load: flush q0
++                "1:                                            \n\t"  // loop: load the next group while storing the previous one
++                "vld1.32     {q1}, [%[src]]!                   \n\t"  // next group into q1
++                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"  // q0 lanes go alternately to dst and dst2
++                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d1[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"  // count off the group now held in q1
++                "vst1.32     {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "beq         3f                                \n\t"  // q1 is the last group: flush it
++                "vld1.32     {q0}, [%[src]]!                   \n\t"  // next group into q0
++                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"  // q1 lanes go alternately to dst and dst2
++                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d3[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"  // count off the group now held in q0
++                "vst1.32     {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "bne         1b                                \n\t"  // more groups follow; otherwise fall through to flush q0
++                "2:                                            \n\t"  // tail: store the final group from q0
++                "vst1.32     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.32     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d1[0]}, [%[dst]]                 \n\t"  // last two stores do not advance the pointers
++                "vst1.32     {d1[1]}, [%[dst2]]                \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"  // tail: store the final group from q1
++                "vst1.32     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.32     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.32     {d3[0]}, [%[dst]]                 \n\t"
++                "vst1.32     {d3[1]}, [%[dst2]]                \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [dst]"+r"(dst),
++                          [dst2]"+r"(dst2),
++                           [src]"+r"(src),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory", "q0", "q1"  // q0/q1 are overwritten by the loads above
++            );
++            break;
++        case 1:  // 4 samples per group; loop ends only on height == 0, so height is presumably a positive multiple of 4
++            __asm__ volatile (
++                "subs        %[height], #4                     \n\t"  // count off the first group; Z set if it is the only one
++                "vld1.16     {d0}, [%[src]]!                   \n\t"  // d0 = first 4 samples, src advances
++                "beq         2f                                \n\t"  // nothing more to load: flush d0
++                "1:                                            \n\t"  // loop: load the next group while storing the previous one
++                "vld1.16     {d2}, [%[src]]!                   \n\t"  // next group into d2
++                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"  // d0 lanes go alternately to dst and dst2
++                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"  // count off the group now held in d2
++                "vst1.16     {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "beq         3f                                \n\t"  // d2 is the last group: flush it
++                "vld1.16     {d0}, [%[src]]!                   \n\t"  // next group into d0
++                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"  // d2 lanes go alternately to dst and dst2
++                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #4                     \n\t"  // count off the group now held in d0
++                "vst1.16     {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "bne         1b                                \n\t"  // more groups follow; otherwise fall through to flush d0
++                "2:                                            \n\t"  // tail: store the final group from d0
++                "vst1.16     {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.16     {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d0[2]}, [%[dst]]                 \n\t"  // last two stores do not advance the pointers
++                "vst1.16     {d0[3]}, [%[dst2]]                \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"  // tail: store the final group from d2
++                "vst1.16     {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.16     {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.16     {d2[2]}, [%[dst]]                 \n\t"
++                "vst1.16     {d2[3]}, [%[dst2]]                \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [dst]"+r"(dst),
++                          [dst2]"+r"(dst2),
++                           [src]"+r"(src),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory", "d0", "d2"  // d0/d2 are overwritten by the loads above
++            );
++            break;
++        default:  // 8 samples per group; loop ends only on height == 0, so height is presumably a positive multiple of 8
++            __asm__ volatile (
++                "subs        %[height], #8                     \n\t"  // count off the first group; Z set if it is the only one
++                "vld1.8      {d0}, [%[src]]!                   \n\t"  // d0 = first 8 samples, src advances
++                "beq         2f                                \n\t"  // nothing more to load: flush d0
++                "1:                                            \n\t"  // loop: load the next group while storing the previous one
++                "vld1.8      {d2}, [%[src]]!                   \n\t"  // next group into d2
++                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"  // d0 lanes go alternately to dst and dst2
++                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[6]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #8                     \n\t"  // count off the group now held in d2
++                "vst1.8      {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
++                "beq         3f                                \n\t"  // d2 is the last group: flush it
++                "vld1.8      {d0}, [%[src]]!                   \n\t"  // next group into d0
++                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"  // d2 lanes go alternately to dst and dst2
++                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[6]}, [%[dst]], %[stride_dst]  \n\t"
++                "subs        %[height], #8                     \n\t"  // count off the group now held in d0
++                "vst1.8      {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
++                "bne         1b                                \n\t"  // more groups follow; otherwise fall through to flush d0
++                "2:                                            \n\t"  // tail: store the final group from d0
++                "vst1.8      {d0[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d0[6]}, [%[dst]]                 \n\t"  // last two stores do not advance the pointers
++                "vst1.8      {d0[7]}, [%[dst2]]                \n\t"
++                "b           4f                                \n\t"
++                "3:                                            \n\t"  // tail: store the final group from d2
++                "vst1.8      {d2[0]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[2]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[4]}, [%[dst]], %[stride_dst]  \n\t"
++                "vst1.8      {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
++                "vst1.8      {d2[6]}, [%[dst]]                 \n\t"
++                "vst1.8      {d2[7]}, [%[dst2]]                \n\t"
++                "4:                                            \n\t"
++                :  // Outputs
++                           [dst]"+r"(dst),
++                          [dst2]"+r"(dst2),
++                           [src]"+r"(src),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory", "d0", "d2"  // d0/d2 are overwritten by the loads above
++            );
++            break;
++    }
++}
++
++static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src,
++                                                       int pixel_shift, int height,
++                                                       ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{   // Copy one sample per row: src steps by stride_src bytes and dst by stride_dst bytes after every sample.
++    int x, y;  // scratch registers for the asm; two samples are kept in flight
++    switch (pixel_shift)  // selects the sample width: 2 -> 32-bit, 1 -> 16-bit, anything else -> 8-bit
++    {
++        case 2:  // 32-bit samples via ldr/str
++            __asm__ volatile (
++                "ldr         %[x], [%[src]], %[stride_src] \n\t"  // prologue: read the first two samples
++                "ldr         %[y], [%[src]], %[stride_src] \n\t"
++                "str         %[x], [%[dst]], %[stride_dst] \n\t"  // write the first sample
++                "sub         %[height], #2                 \n\t"  // the two prologue samples are accounted for
++                "1:                                        \n\t"  // loop: two samples per pass, loads running ahead of stores
++                "ldr         %[x], [%[src]], %[stride_src] \n\t"
++                "str         %[y], [%[dst]], %[stride_dst] \n\t"
++                "ldr         %[y], [%[src]], %[stride_src] \n\t"
++                "subs        %[height], #2                 \n\t"  // exits only at exactly 0 - presumably height is even and >= 4, TODO confirm
++                "str         %[x], [%[dst]], %[stride_dst] \n\t"
++                "bne         1b                            \n\t"
++                "str         %[y], [%[dst]]                \n\t"  // epilogue: write the last sample, dst not advanced
++                :  // Outputs
++                             [x]"=&r"(x),
++                             [y]"=&r"(y),
++                           [src]"+r"(src),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src),
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        case 1:  // 16-bit samples via ldrh/strh, same schedule as case 2
++            __asm__ volatile (
++                "ldrh        %[x], [%[src]], %[stride_src] \n\t"  // prologue: read the first two samples
++                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
++                "strh        %[x], [%[dst]], %[stride_dst] \n\t"  // write the first sample
++                "sub         %[height], #2                 \n\t"  // the two prologue samples are accounted for
++                "1:                                        \n\t"  // loop: two samples per pass, loads running ahead of stores
++                "ldrh        %[x], [%[src]], %[stride_src] \n\t"
++                "strh        %[y], [%[dst]], %[stride_dst] \n\t"
++                "ldrh        %[y], [%[src]], %[stride_src] \n\t"
++                "subs        %[height], #2                 \n\t"  // exits only at exactly 0 - presumably height is even and >= 4, TODO confirm
++                "strh        %[x], [%[dst]], %[stride_dst] \n\t"
++                "bne         1b                            \n\t"
++                "strh        %[y], [%[dst]]                \n\t"  // epilogue: write the last sample, dst not advanced
++                :  // Outputs
++                             [x]"=&r"(x),
++                             [y]"=&r"(y),
++                           [src]"+r"(src),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src),
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++        default:  // 8-bit samples via ldrb/strb, same schedule as case 2
++            __asm__ volatile (
++                "ldrb        %[x], [%[src]], %[stride_src] \n\t"  // prologue: read the first two samples
++                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
++                "strb        %[x], [%[dst]], %[stride_dst] \n\t"  // write the first sample
++                "sub         %[height], #2                 \n\t"  // the two prologue samples are accounted for
++                "1:                                        \n\t"  // loop: two samples per pass, loads running ahead of stores
++                "ldrb        %[x], [%[src]], %[stride_src] \n\t"
++                "strb        %[y], [%[dst]], %[stride_dst] \n\t"
++                "ldrb        %[y], [%[src]], %[stride_src] \n\t"
++                "subs        %[height], #2                 \n\t"  // exits only at exactly 0 - presumably height is even and >= 4, TODO confirm
++                "strb        %[x], [%[dst]], %[stride_dst] \n\t"
++                "bne         1b                            \n\t"
++                "strb        %[y], [%[dst]]                \n\t"  // epilogue: write the last sample, dst not advanced
++                :  // Outputs
++                             [x]"=&r"(x),
++                             [y]"=&r"(y),
++                           [src]"+r"(src),
++                           [dst]"+r"(dst),
++                        [height]"+r"(height)
++                :  // Inputs
++                    [stride_src]"r"(stride_src),
++                    [stride_dst]"r"(stride_dst)
++                :  // Clobbers
++                    "cc", "memory"
++            );
++            break;
++    }
++}
++
++#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon
++static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src, int pixel_shift,
++                                              int height, ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{   /* Pick the copy routine that matches which side, if any, has a stride of exactly one sample. */
++    const ptrdiff_t sample_bytes = 1 << pixel_shift;  /* stride value that means "one sample apart" */
++    if (stride_dst != sample_bytes && stride_src != sample_bytes)  /* neither side is one sample apart */
++        ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
++    else if (stride_dst == sample_bytes)  /* dst wins even when src also matches */
++        ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
++    else  /* only src is one sample apart */
++        ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst);
++}
++
++#endif /* HAVE_NEON_INLINE */
++
++#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
+diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h
+new file mode 100644
+index 0000000000..325c26a49b
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevc_mv_arm.h
+@@ -0,0 +1,93 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Written by John Cox, Ben Avison
++*/
++
++#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
++#define AVCODEC_ARM_RPI_HEVC_MV_H
++
++#if HAVE_ARMV6T2_INLINE
++static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b)
++{   // Returns a + b computed by a single SADD16, i.e. as two independent signed 16-bit halfword additions.
++    MvXY r;  // result register written by the asm
++    __asm__ (
++        "sadd16    %[r], %[a], %[b]        \n\t"  // low halves added together, high halves added together, no carry between them
++        : [r]"=r"(r)
++        : [a]"r"(a),
++          [b]"r"(b)
++        :
++        );
++    return r;
++}
++#define mvxy_add mvxy_add_arm
++#endif
++
++#if HAVE_ARMV6T2_INLINE
++#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
++static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb)
++{   // Scales the two 16-bit halves of xy by a factor derived from tb and td, and returns them repacked in one word.
++    int t;  // scratch register for the asm
++    __asm__ (
++    "ssat   %[td], #8,    %[td]          \n\t"  // td = clamp(td, -128, 127)
++    "ssat   %[tb], #8,    %[tb]          \n\t"  // tb = clamp(tb, -128, 127)
++    "eor    %[t],  %[td], %[td], asr #31 \n\t"  // t = td ^ (td < 0 ? -1 : 0)
++    "adds   %[t],  %[t],  %[td], lsr #31 \n\t"  // t = abs(td); Z flag set when td == 0
++    "asr    %[t],  #1                    \n\t"  // t = abs(td) >> 1
++    "add    %[t],  #0x4000               \n\t"  // t = 16384 + (abs(td) >> 1)
++    "it ne                               \n\t"  // condition still comes from the adds above
++    "sdivne %[t],  %[t],  %[td]          \n\t"  // t = t / td, skipped when td == 0
++    "mov    %[td], #32                   \n\t"  // rounding term for the >> 6 below
++    "smlabb %[td], %[t],  %[tb], %[td]   \n\t"  // td = t * tb + 32 (low halfwords multiplied)
++    "ssat   %[td], #13,   %[td], asr #6  \n\t"  // td = clamp(td >> 6, -4096, 4095): the scale factor
++    "mov    %[tb], #127                  \n\t"  // rounding term for the >> 8 below
++    "smlatb %[t],  %[xy], %[td], %[tb]   \n\t"  // t  = (high half of xy) * scale + 127
++    "smlabb %[tb], %[xy], %[td], %[tb]   \n\t"  // tb = (low half of xy) * scale + 127
++// This takes the sign of x & y for rounding at the "wrong" point
++// (i.e. after adding 127) but for the range of values (-1,-127)
++// where it does the wrong thing you get the right answer (0) anyway
++    "add    %[t],  %[t],  %[t],  lsr #31 \n\t"  // add 1 when the high-half product is negative
++    "add    %[xy], %[tb], %[tb], lsr #31 \n\t"  // same for the low-half product, result moved into xy
++    "ssat   %[t],  #16,   %[t],  asr #8  \n\t"  // t  = clamp(t >> 8, -32768, 32767)
++    "ssat   %[xy], #16,   %[xy], asr #8  \n\t"  // xy = clamp(xy >> 8, -32768, 32767)
++    "pkhbt  %[xy], %[xy], %[t],  lsl #16 \n\t"  // repack: low half from xy, high half from t
++    :
++         [t]"=&r"(t),
++        [xy]"+r"(xy),
++        [td]"+r"(td),
++        [tb]"+r"(tb)
++    :
++    :
++        "cc"
++    );
++    return xy;
++}
++#define mv_scale_xy mv_scale_xy_arm
++#endif
++#endif
++
++#endif // AVCODEC_ARM_RPI_HEVC_MV_H
++
+diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h
+new file mode 100644
+index 0000000000..62b9326532
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_arm.h
+@@ -0,0 +1,26 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVCDSP_ARM_H  /* NOTE(review): guard name has no RPI component - confirm it cannot clash with another header using the same guard */
++#define AVCODEC_ARM_HEVCDSP_ARM_H
++
++#include "libavcodec/rpi_hevcdsp.h"
++
++void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth);  /* declaration only; presumably fills c with NEON routines for bit_depth - confirm against the definition */
++
++#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
+diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
+new file mode 100644
+index 0000000000..88a3b4e5e7
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
+@@ -0,0 +1,1634 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
++        vsubl.u8  q0, \Q0a, \P0a  @ uv filter body for 8-bit samples, tc taken from r2: starts with Q0 - P0 widened to 16-bit
++        vsubl.u8  q1, \P1a, \Q1a  @ P1 - Q1
++        vdup.16   d4, r2          @ low 16 bits of r2 (two tc bytes) copied to every lane
++        \I1
++        vshl.i16  q0, #2          @ (Q0 - P0) * 4
++        \I2
++        vadd.i16  q0, q1          @ ((Q0 - P0) * 4) + P1 - Q1
++        \I3
++        vmovl.u8  q2, d4          @ tc bytes widened to 16-bit
++        \I4
++        vneg.s16  q1, q2          @ -tc
++        \I5
++        vrshr.s16 q0, #3          @ (((Q0 - P0) * 4) + P1 - Q1 + 4) >> 3
++        \I6
++        \I7
++        \I8
++        vmin.s16  q0, q2          @ upper clamp to tc
++        vmovl.u8  q2, \Q0a        @ Q0 samples widened to 16-bit
++        vmax.s16  q0, q1          @ lower clamp to -tc, giving delta
++        vaddw.u8  q1, q0, \P0a    @ P0 + delta
++        vsub.i16  q0, q2, q0      @ Q0 - delta
++        vqmovun.s16 \P0a, q1      @ results saturated back to unsigned 8-bit
++        vqmovun.s16 \Q0a, q0
++.endm
++
++
++.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
++        vsubl.u8  q0, \Q0a, \P0a  @ q0a - p0a
++        lsr       r12, r2, #16    @ r12 = upper 16 bits of r2, the tc bytes used for the b set
++        vsubl.u8  q1, \Q0b, \P0b  @ q0b - p0b
++        vsubl.u8  q2, \P1a, \Q1a  @ p1a - q1a
++        vsubl.u8  q3, \P1b, \Q1b  @ p1b - q1b
++        vshl.i16  q0, #2          @ (q0a - p0a) * 4
++        vshl.i16  q1, #2          @ (q0b - p0b) * 4
++        vadd.i16  q0, q2          @ ((q0a - p0a) * 4) + p1a - q1a
++        vadd.i16  q1, q3          @ ((q0b - p0b) * 4) + p1b - q1b
++        vdup.16   d4, r2          @ tc0a, tc0b
++        vdup.16   d6, r12         @ tc1a, tc1b
++        vrshr.s16 q0, #3          @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++        \I1
++        vrshr.s16 q1, #3          @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++        \I2
++        vmovl.u8  q2, d4          @ tc0a, tc0b
++        \I3
++        vmovl.u8  q3, d6          @ tc1a, tc1b
++        \I4
++        vmin.s16  q0, q2          @ upper clamp of the a delta to tc
++        \I5
++        vneg.s16  q2, q2          @ -tc0a, -tc0b
++        \I6
++        vmin.s16  q1, q3          @ upper clamp of the b delta to tc
++        \I7
++        vneg.s16  q3, q3          @ -tc1a, -tc1b
++        vmax.s16  q0, q2          @ delta0a
++        vmovl.u8  q2, \Q0a        @ q0a widened to 16-bit
++        vmax.s16  q1, q3          @ delta0b
++        vaddw.u8  q3, q0, \P0a    @ p0a + delta0a
++        vsub.i16  q0, q2, q0      @ q0a - delta0a
++        vmovl.u8  q2, \Q0b        @ q0b widened to 16-bit
++        vsub.i16  q2, q1          @ q0b - delta0b
++        vaddw.u8  q1, \P0b        @ p0b + delta0b
++        vqmovun.s16 \Q0a, q0      @ results saturated back to unsigned 8-bit
++        vqmovun.s16 \P0a, q3
++        vqmovun.s16 \Q0b, q2
++        vqmovun.s16 \P0b, q1
++.endm
++
++
++@ Preserves r12
++@ Clobbers r2
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@   [0..7]   tc U a
++@   [8..15]  tc V a
++
++.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
++        vsub.i16  q0, \Q0a, \P0a  @ uv filter body for 16-bit sample lanes, tc taken from r2: starts with Q0 - P0
++        vsub.i16  q1, \P1a, \Q1a  @ P1 - Q1
++        vdup.16   d4, r2          @ low 16 bits of r2 (two tc bytes) copied to every lane
++        \I1
++        vshl.i16  q0, #2          @ (Q0 - P0) * 4
++        \I2
++        vadd.i16  q0, q1          @ ((Q0 - P0) * 4) + P1 - Q1
++        \I3
++        vshll.u8  q2, d4, #\bit_depth - 8   @ tc bytes widened to 16-bit and shifted left by (bit_depth - 8)
++        \I4
++        vneg.s16  q1, q2          @ -tc
++        \I5
++        vrshr.s16 q0, #3          @ (((Q0 - P0) * 4) + P1 - Q1 + 4) >> 3
++        \I6
++        \I7
++        \I8
++        vmin.s16  q0, q2          @ upper clamp to tc
++        vmov.i16  q2, #0          @ lowest legal sample value
++        vmax.s16  q0, q1          @ lower clamp to -tc, giving delta
++        vadd.i16  \P0a, q0        @ P0 + delta
++        vsub.i16  \Q0a, q0        @ Q0 - delta
++        vmov.i16  q1, #(1 << \bit_depth) - 1   @ highest legal sample value
++        vmax.s16  \P0a, q2        @ clamp both results to the legal sample range
++        vmax.s16  \Q0a, q2
++        vmin.s16  \P0a, q1
++        vmin.s16  \Q0a, q1
++.endm
++
++@ Clobbers r2, r12
++@ P0a et al all contain UVUVUVUV
++@ r2 (tc4) contains
++@   [0..7]   tc U a
++@   [8..15]  tc V a
++@  [16..23]  tc U b
++@  [24..31]  tc V b
++
++.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
++        vsub.i16  q0, \Q0a, \P0a  @ q0a - p0a
++        lsr       r12, r2, #16    @ r12 = upper 16 bits of r2, the tc bytes used for the b set
++        vsub.i16  q1, \Q0b, \P0b  @ q0b - p0b
++        vsub.i16  q2, \P1a, \Q1a  @ p1a - q1a
++        vsub.i16  q3, \P1b, \Q1b  @ p1b - q1b
++        vshl.i16  q0, #2          @ (q0a - p0a) * 4
++        vshl.i16  q1, #2          @ (q0b - p0b) * 4
++        vadd.i16  q0, q2          @ ((q0a - p0a) * 4) + p1a - q1a
++        vadd.i16  q1, q3          @ ((q0b - p0b) * 4) + p1b - q1b
++        vdup.16   d4, r2          @ tc0a, tc0b
++        vdup.16   d6, r12         @ tc1a, tc1b
++        vrshr.s16 q0, #3          @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
++        \I1
++        vrshr.s16 q1, #3          @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
++        \I2
++        vshll.u8  q2, d4, #\bit_depth - 8 @ tc0a, tc0b
++        \I3
++        vshll.u8  q3, d6, #\bit_depth - 8 @ tc1a, tc1b
++        \I4
++        vmin.s16  q0, q2          @ upper clamp of the a delta to tc
++        \I5
++        vneg.s16  q2, q2          @ -tc0a, -tc0b
++        \I6
++        vmin.s16  q1, q3          @ upper clamp of the b delta to tc
++        \I7
++        vneg.s16  q3, q3          @ -tc1a, -tc1b
++        vmax.s16  q0, q2          @ delta0a
++        vadd.i16  \P0a, q0        @ p0a + delta0a
++        vsub.i16  \Q0a, q0        @ q0a - delta0a
++        vmax.s16  q1, q3          @ delta0b
++        vadd.i16  \P0b, q1        @ p0b + delta0b
++        vsub.i16  \Q0b, q1        @ q0b - delta0b
++        vmov.i16  q2, #0          @ lowest legal sample value
++        vmov.i16  q3, #(1 << \bit_depth) - 1   @ highest legal sample value
++        vmax.s16  \P0a, q2        @ clamp all four results to the legal sample range
++        vmax.s16  \Q0a, q2
++        vmax.s16  \P0b, q2
++        vmax.s16  \Q0b, q2
++        vmin.s16  \P0a, q3
++        vmin.s16  \Q0a, q3
++        vmin.s16  \P0b, q3
++        vmin.s16  \Q0b, q3
++.endm
++
++
++
++@   uint8_t *_no_p,     [sp+0]
++@   uint8_t *_no_q)     [sp+4]
++
++.macro hevc_loop_filter_luma_start
++        ldr     r12, [r3]               @ luma filter prologue: first 32-bit word at r3 (presumably tc[0])
++        ldr      r3, [r3, #4]           @ second 32-bit word at r3 (presumably tc[1])
++        orrs     r3, r12, r3, lsl #16   @ r3 = first | (second << 16), Z set when the result is zero
++        it       eq
++        bxeq     lr                     @ result zero: return to the caller before anything is pushed
++        push     {r4-r10,lr}            @ 32 bytes
++        ldrd     r4, r5, [sp, #32]      @ &_no_p
++        ldrb     r4, [r4]               @ r4 = byte at _no_p
++        ldrb     r5, [r5]               @ r5 = byte at _no_q
++        movs     r10, r4                @ flags reflect the _no_p byte
++        it ne
++        movne    r10, #1                @ r10 bit 0 set when the _no_p byte is non-zero
++        cmp      r5, #0
++        it ne
++        orrne    r10, #2                @ r10 bit 1 set when the _no_q byte is non-zero
++.endm
++
++@ Input:
++@  r2          beta    (raw: needs shift for bitdepth > 8)
++@  r3[ 0:15]   tc[0]   (raw: needs shift for bitdepth > 8)
++@  r3[16:31]   tc[1]   (raw: needs shift for bitdepth > 8)
++@
++@ Input & output
++@  8-bit: d16-d23      (Q3,Q2,Q1,Q0,P0,P1,P2,P3)
++@ 16-bit:  q8-q15
++@
++@  r1         -r1
++@  r10        b1->C, b0->N  (r10 junk)
++@
++@ Junks:
++@  r5, r6, r7, r8, r9
++
++.macro m_filter_luma bit_depth, Q11, Q15
++.if \bit_depth == 8
++        vmovl.u8    q14, d22      @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
++        vmovl.u8    q13, d21      @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
++        vmovl.u8    q12, d20      @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
++        vmovl.u8    \Q11, d19     @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
++        vmovl.u8    q10, d18      @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
++        vmovl.u8    q9, d17       @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
++.endif
++        vadd.i16    q0, q9, \Q11  @ P2 + P0
++.if \bit_depth > 8
++        lsl         r3, r3, #(\bit_depth - 8)
++.endif
++        vadd.i16    q1, q14, q12  @ Q2 + Q0
++.if \bit_depth > 8
++        lsl         r2, r2, #(\bit_depth - 8)
++.endif
++        vsub.i16    q0, q10       @ P2 - P1 + P0
++        lsr         r5, r3, #16
++        vsub.i16    q1, q13       @ Q2 - Q1 + Q0
++.if \bit_depth == 8
++        vmovl.u8    q8, d16       @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
++        vmovl.u8    \Q15, d23     @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
++.endif
++        vabd.s16    q0, q10       @ dp0 = abs(P2 - 2 * P1 + P0)
++        vabd.s16    q1, q13       @ dq0 = abs(Q2 - 2 * Q1 + Q0)
++        vmov.i64    q2, #0xffffffff0000
++        vbic        q0, q2        @ only dp0(') and dp3(')
++        vbic        q1, q2        @ only dq0(') and dq3(')
++        vsra.u64    q0, #16
++        vsra.u64    q1, #16
++        vdup.16     q3, r2        @ beta
++        vdup.16     d14, r3       @ tC[0]
++        vdup.16     d15, r5       @ tC[1]
++        vabd.s16    q4, q8, \Q11  @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
++        vmovn.i32   d0, q0        @ dp3' dp0' dp3 dp0
++        vmovn.i32   d1, q1        @ dq3' dq0' dq3 dq0
++        vadd.i16    d5, d0, d1    @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
++        vabd.s16    q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
++        vaba.s16    q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
++        vpadd.i16   d2, d5, d5    @ dontcare dontcare d0'+d3' d0+d3
++        vshl.s16    q6, q7, #2    @ tC[] * 4
++        vrhadd.s16  q6, q7        @ tc25 = (tc[] * 5 + 1) >> 1
++        vcgt.s16    d2, d6, d2    @ if (d0 + d3 < beta)
++        vmov        r7, s4        @ (d2) r7 = mask of blocks to apply filtering (16b/block)
++        vshr.s16    q1, q3, #3    @ beta_3 = beta >> 3
++        cmp         r7, #0
++        beq         .Lbypasswrite @ neither block passes the beta test
++
++        vcgt.s16    q5, q6, q5    @ if < tc25
++        vcgt.s16    q4, q1, q4    @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
++        vand        q4, q5
++        vbic        d8, d4
++        vbic        d9, d4
++        vshr.s16    q3, #2        @ beta_2 = beta >> 2
++        vsra.u64    q4, #16
++        vshl.s16    d5, #1        @ d3'<<1 d0'<<1 d3<<1 d0<<1
++        vshl.i16    q7, #1        @ tc2 = tC[] << 1
++        vcgt.s16    d6, d5        @ if (d3'<<1 < beta_2) etc
++        vmovn.i32   d8, q4        @ beta_3 && tc25 tests, prime block in ms half
++        vand        d6, d8        @ && beta_2 tests, prime in ms half
++        vpadd.i16   d0, d1        @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
++        vneg.s16    q6, q7        @ -tc2
++        vmovn.i32   d8, q3
++        vshrn.i32   d6, q3, #16
++        vand        d6, d8
++        vmov        r5, r6, d0    @ r5 = dp0'+dp3' dp0+dp3  r6 = dq0'+dq3' dq0+dq3
++        vmov        r8, s12       @ (d6) r8 = mask of strong filtering blocks (16b/block)
++        vadd.i16    q0, \Q11, q12 @ p0 + q0
++        ands        r9, r7, r8    @ r9 = blocks that get the strong filter
++        beq         1f            @ none: skip the strong filter
++
++        vadd.i16    q2, q0, q10   @ p1 + p0 + q0
++        vadd.i16    q3, q0, q13   @ p0 + q0 + q1
++        lsr         r3, r9, #16
++        vadd.i16    q1, q2, q9    @ p2 + p1 + p0 + q0 (new P1 before clipping)
++        vadd.i16    q4, q3, q14   @ p0 + q0 + q1 + q2 (new Q1 before clipping)
++        vadd.i16    q0, q8, q9    @ p3 + p2
++        vadd.i16    q5, \Q15, q14 @ q2 + q3
++        vadd.i16    q2, q1        @ p2 + 2 * p1 + 2 * p0 + 2 * q0
++        vadd.i16    q3, q4        @ 2 * p0 + 2 * q0 + 2 * q1 + q2
++        vshl.i16    q0, #1        @ 2 * p3 + 2 * p2
++        vshl.i16    q5, #1        @ 2 * q2 + 2 * q3
++        vadd.i16    q0, q1        @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
++        vadd.i16    q5, q4        @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
++        vadd.i16    q2, q13       @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
++        vadd.i16    q3, q10       @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
++        vrshr.s16   q0, #3        @ scale, with rounding
++        vrshr.s16   q5, #3
++        vrshr.s16   q1, #2
++        vrshr.s16   q4, #2
++        vrshr.s16   q2, #3
++        vrshr.s16   q3, #3
++        vsub.i16    q0, q9        @ find difference
++        vsub.i16    q5, q14
++        vsub.i16    q1, q10
++        vsub.i16    q4, q13
++        vsub.i16    q2, \Q11
++        vsub.i16    q3, q12
++        vmax.s16    q0, q6        @ clip difference to -tc2 .. tc2
++        vmax.s16    q5, q6
++        vmax.s16    q1, q6
++        vmax.s16    q4, q6
++        vmax.s16    q2, q6
++        vmax.s16    q3, q6
++        vdup.16     d12, r9       @ expand mask, reuse q6 due to register pressure
++        vdup.16     d13, r3
++        vmin.s16    q0, q7
++        vmin.s16    q5, q7
++        vmin.s16    q1, q7
++        vmin.s16    q4, q7
++        vmin.s16    q2, q7
++        vmin.s16    q3, q7
++        vadd.i16    q0, q9        @ apply difference
++        vadd.i16    q5, q14
++        vadd.i16    q1, q10
++        vadd.i16    q4, q13
++        vadd.i16    q2, \Q11
++        vadd.i16    q3, q12
++        vbit        q9, q0, q6    @ apply filtered values according to mask
++        vbit        q14, q5, q6
++        vbit        q10, q1, q6
++        vbit        q13, q4, q6
++        vbit        \Q11, q2, q6
++        vbit        q12, q3, q6
++        vneg.s16    q6, q7        @ restore -tc2
++
++1:
++        bics        r9, r7, r8    @ r9 = filtered blocks that are not strong
++        beq         2f            @ none: skip the normal filter
++
++        vsub.i16    q0, q12, \Q11 @ q0 - p0
++        vsub.i16    q1, q13, q10  @ q1 - p1
++        lsr         r3, r9, #16
++        vshl.i16    q2, q0, #3
++        lsr         r7, r5, #16
++        vadd.i16    q3, q0, q2    @ 9 * (q0 - p0)
++        lsr         r8, r6, #16
++        vshl.i16    q2, q1, #1
++        vadd.i16    q4, q1, q2    @ 3 * (q1 - p1)
++        vshr.s16    q6, #1        @ -tc = -tc2 >> 1
++        vsub.i16    q5, q3, q4
++        vrhadd.s16  q1, q9, \Q11  @ (p2 + p0 + 1) >> 1
++        vrhadd.s16  q3, q14, q12  @ (q2 + q0 + 1) >> 1
++        vrshr.s16   q5, #4        @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
++        vsub.i16    q1, q10       @ ((p2 + p0 + 1) >> 1) - p1
++        vsub.i16    q3, q13       @ ((q2 + q0 + 1) >> 1) - q1
++        vmax.s16    q6, q5        @ max(-tc, delta0)
++        vshr.s16    q4, q7, #1    @ tc = tc2 >> 1
++        vdup.16     q0, r2        @ beta
++        vmin.s16    q6, q4        @ delta0 clamped to [-tc, tc]
++        vshr.s16    q4, #1        @ tc_2 = tc >> 1
++        vhadd.s16   q1, q6        @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
++        vhsub.s16   q3, q6        @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
++        vshr.s16    q2, q0, #1    @ beta >> 1
++        vadd.i16    q2, q0        @ beta + (beta >> 1)
++        vneg.s16    q0, q4        @ -tc_2
++        vabs.s16    q5, q5        @ abs(original delta0)
++        vshr.s16    q2, #3        @ (beta + (beta >> 1)) >> 3
++        vmax.s16    q1, q0
++        vmax.s16    q3, q0
++        vshl.s16    q0, q7, #2    @ 8 * tc
++        vadd.i16    q7, q0        @ 10 * tc
++        vdup.16     d0, r9
++        vdup.16     d1, r3        @ q0 = mask of blocks to apply filtering
++        vmin.s16    q1, q4        @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
++        vmin.s16    q3, q4        @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2)
++        vdup.16     d8, r5        @ dp0 + dp3
++        vdup.16     d9, r7        @ dp0' + dp3'
++        vcgt.s16    q7, q5        @ if ((10 * tc) > abs(delta0))
++        vdup.16     d10, r6       @ dq0 + dq3
++        vdup.16     d11, r8       @ dq0' + dq3'
++        vand        q7, q0        @ AND block and line masks
++        vcgt.s16    q4, q2, q4    @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
++        vadd.i16    q0, q1, q10   @ p1 + deltap1
++        vcgt.s16    q5, q2, q5    @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
++        vadd.i16    q3, q3, q13   @ q1 + deltaq1
++        vadd.i16    q1, \Q11, q6  @ p0 + delta0
++        vsub.i16    q2, q12, q6   @ q0 - delta0
++        vand        q4, q7        @ AND nd_p test with block/line masks
++        vand        q5, q7        @ AND nd_q test with block/line masks
++        vbit        q10, q0, q4
++        vbit        \Q11, q1, q7
++        vbit        q12, q2, q7
++        vbit        q13, q3, q5
++
++2:
++.if \bit_depth == 8
++        vmovn.i16 d16, q8
++        vmovn.i16 d23, \Q15
++        neg       r1, r1          @ r1 = -r1 for the caller
++        vqmovun.s16 d17, q9
++        vqmovun.s16 d18, q10
++        vqmovun.s16 d19, \Q11
++        lsls      r10, #31        @ b0 -> N, b1 -> C
++        vqmovun.s16 d20, q12
++        vqmovun.s16 d21, q13
++        vqmovun.s16 d22, q14
++.else
++        vmov.i16  q0, #0
++        vmov.i16  q1, #(1 << \bit_depth - 1)  @ max pixel value: gas binds << tighter than -
++        @ q8 & q15 should be unaltered and so don't require clipping
++        neg       r1, r1          @ r1 = -r1 for the caller
++        vmax.s16  q9,  q0
++        vmax.s16  q10, q0
++        vmax.s16  q11, q0
++        vmax.s16  q12, q0
++        vmax.s16  q13, q0
++        vmax.s16  q14, q0
++        lsls      r10, #31        @ b0 -> N, b1 -> C
++        vmin.s16  q9,  q1
++        vmin.s16  q10, q1
++        vmin.s16  q11, q1
++        vmin.s16  q12, q1
++        vmin.s16  q13, q1
++        vmin.s16  q14, q1
++.endif
++        bx        lr              @ N/C flags are part of the result
++.endm
++
++function hevc_loop_filter_luma_body
++        m_filter_luma 8, q15, q11     @ Q11=q15 and Q15=q11 on purpose: q11 (d22/d23) still holds packed input when P0 is widened
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
++@   uint8_t *_pix,      [r0]
++@   ptrdiff_t _stride,  [r1]
++@   int _beta,          [r2]
++@   int *_tc,           [r3]
++@   uint8_t *_no_p,     [sp+0]
++@   uint8_t *_no_q)     [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
++        hevc_loop_filter_luma_start
++
++        sub      r4, r0, #4             @ r4 = 4 bytes left of pix (P side)
++        b        .Lv_loop_luma_common   @ shared with the luma2 entry point
++endfunc
++
++@ void ff_hevc_rpi_v_loop_filter_luma2_neon_8(
++@   uint8_t * pix_r,    [r0]
++@   ptrdiff_t _stride,  [r1]
++@   int _beta,          [r2]
++@   int tc2,            [r3]
++@   int no_f,           [sp+0]
++@   uint8_t * pix_l)    [sp+4]
++
++function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
++        cmp      r3, #0                 @ tc2 == 0: nothing to filter
++        it       eq
++        bxeq     lr
++        push     {r4-r10,lr}            @ 32 bytes
++        ldr      r4, [sp, #36]          @ pix_l (second stack arg, past the 32 pushed bytes)
++        ldr      r10, [sp, #32]         @ no_f (first stack arg)
++
++.Lv_loop_luma_common:
++        vpush    {d8-d15}
++
++        @ It's slightly faster to do unlaned loads and transpose in the
++        @ 8-bit case, even though it needs more instructions, because
++        @ VLD4.8 is a really slow way to read from memory.
++        vld1.32 {d16[0]}, [r4:32], r1
++        vld1.32 {d20[0]}, [r0:32], r1
++        vld1.32 {d16[1]}, [r4:32], r1
++        vld1.32 {d20[1]}, [r0:32], r1
++        vld1.32 {d17[0]}, [r4:32], r1
++        vld1.32 {d21[0]}, [r0:32], r1
++        vld1.32 {d17[1]}, [r4:32], r1
++        vld1.32 {d21[1]}, [r0:32], r1
++        vld1.32 {d18[0]}, [r4:32], r1
++        vld1.32 {d22[0]}, [r0:32], r1
++        vld1.32 {d18[1]}, [r4:32], r1
++        vld1.32 {d22[1]}, [r0:32], r1
++        vld1.32 {d19[0]}, [r4:32], r1
++        vld1.32 {d23[0]}, [r0:32], r1
++        vld1.32 {d19[1]}, [r4:32]
++        vld1.32 {d23[1]}, [r0:32]
++        vuzp.16 q8, q9
++        vuzp.16 q10, q11
++        vuzp.8  q8, q9
++        vuzp.8  q10, q11
++        vswp    d17, d18
++        vswp    d21, d22
++
++        bl hevc_loop_filter_luma_body
++
++        add     r6, r4, r1              @ r1 was negated by the body: walk back up the rows
++        add     r2, r0, r1
++        lsl     r1, #1
++
++        vpop     {d8-d15}
++
++        @ P side (d16-d19): skipped when N is set (b0 of r10, set by the body)
++        bmi     1f
++        vst4.8  {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
++        vst4.8  {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
++        vst4.8  {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
++        vst4.8  {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
++
++        vst4.8  {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
++        vst4.8  {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
++        vst4.8  {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
++        vst4.8  {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
++1:
++        @ Q side (d20-d23): skipped when C is set (b1 of r10, set by the body)
++        bcs     1f
++        vst4.8  {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
++        vst4.8  {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
++        vst4.8  {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
++        vst4.8  {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
++
++        vst4.8  {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
++        vst4.8  {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
++        vst4.8  {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
++        vst4.8  {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
++1:
++        pop      {r4-r10,pc}
++
++.Lbypasswrite:                          @ reached from the filter body when no block needs filtering
++        vpop     {d8-d15}
++        pop      {r4-r10,pc}
++endfunc
++
++.macro m_filter_v_luma_16 bit_depth
++        vpush    {d8-d15}
++
++        @ Uses slightly fewer instructions to do laned loads than unlaned
++        @ and transpose.  This also means that we can use the same code for
++        @ both split & unsplit deblock
++        vld4.16  {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
++        vld4.16  {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
++
++        vld4.16  {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++        vld4.16  {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++
++        vld4.16  {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
++        vld4.16  {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
++
++        vld4.16  {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++        vld4.16  {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++
++        vld4.16  {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
++        vld4.16  {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
++
++        vld4.16  {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++        vld4.16  {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++
++        vld4.16  {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
++        vld4.16  {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
++
++        vld4.16  {d17[3], d19[3], d21[3], d23[3]}, [r4]
++        vld4.16  {d25[3], d27[3], d29[3], d31[3]}, [r0]
++
++        bl hevc_loop_filter_luma_body_\bit_depth
++
++        add      r6, r4, r1             @ presumably r1 comes back negated, as in m_filter_luma
++        add      r2, r0, r1
++        lsl      r1, #1
++
++        vpop     {d8-d15}
++
++        @ P side (q8-q11, loaded via r4): skipped when N is set
++        bmi      1f
++        vst4.16  {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
++        vst4.16  {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
++        vst4.16  {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
++        vst4.16  {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
++        vst4.16  {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
++        vst4.16  {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
++        vst4.16  {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
++        vst4.16  {d16[0], d18[0], d20[0], d22[0]}, [r6]
++1:
++        @ Q side (q12-q15, loaded via r0): skipped when C is set
++        bcs      1f
++        vst4.16  {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
++        vst4.16  {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
++        vst4.16  {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
++        vst4.16  {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
++        vst4.16  {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
++        vst4.16  {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
++        vst4.16  {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
++        vst4.16  {d24[0], d26[0], d28[0], d30[0]}, [r2]
++1:
++        pop      {r4-r10,pc}            @ matching push is expected in the code that invokes this macro
++.endm
++
++
++
++
++@ void (*hevc_h_loop_filter_luma)(uint8_t *pix,     [r0]
++@                                 ptrdiff_t stride, [r1]
++@                                 int beta,         [r2]
++@                                 int32_t *tc,      [r3]
++@                                 uint8_t *no_p,    sp[0]
++@                                 uint8_t *no_q);   sp[4]
++@
++@ Src should always be on 8 byte boundary & all in the same slice
++
++function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
++        hevc_loop_filter_luma_start
++        b        .Lh_loop_filter_luma_common_8  @ shared with the luma2 entry point
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
++        cmp      r3, #0                 @ r3 == 0: nothing to filter
++        it       eq
++        bxeq     lr
++        push     {r4-r10,lr}            @ 32 bytes
++        ldr      r10, [sp, #32]         @ first stack arg: flag bits tested after the body
++
++.Lh_loop_filter_luma_common_8:
++        sub      r4, r0, r1, lsl #2     @ r4 = pix - 4 * stride
++        add      r0, r4, r1             @ r0 = pix - 3 * stride
++        lsl      r1, #1                 @ each pointer steps two rows at a time
++        vpush    {d8-d15}
++
++        vld1.8  {d16}, [r4], r1
++        vld1.8  {d17}, [r0], r1
++        vld1.8  {d18}, [r4], r1
++        vld1.8  {d19}, [r0], r1
++        vld1.8  {d20}, [r4], r1
++        vld1.8  {d21}, [r0], r1
++        vld1.8  {d22}, [r4]
++        vld1.8  {d23}, [r0]
++
++        bl hevc_loop_filter_luma_body
++
++        add      r0, r0, r1, lsl #1     @ r1 is now -2 * stride: r0 = pix - stride
++        add      r2, r4, r1, lsl #1     @ r2 = pix - 2 * stride
++        add      r6, r4, r1, asr #1     @ r6 = pix + stride
++        vpop     {d8-d15}
++
++        @ Q2-Q0 (rows pix + 2 * stride .. pix): skipped when C is set (b1 of r10)
++        bcs      1f
++        vst1.8   {d22}, [r4], r1
++        vst1.8   {d21}, [r6]
++        vst1.8   {d20}, [r4]
++1:
++        @ P0-P2 (rows pix - stride .. pix - 3 * stride): skipped when N is set (b0 of r10)
++        bmi      1f
++        vst1.8   {d19}, [r0], r1
++        vst1.8   {d18}, [r2]
++        vst1.8   {d17}, [r0]
++1:
++        pop      {r4-r10,pc}
++endfunc
++
++
++.macro m_filter_h_luma_16 bit_depth
++        sub      r4, r0, r1, lsl #2     @ r4 = pix - 4 * stride
++        add      r0, r4, r1             @ r0 = pix - 3 * stride
++        lsl      r1, #1                 @ each pointer steps two rows at a time
++        vpush    {d8-d15}
++
++        vld1.16 { q8}, [r4], r1
++        vld1.16 { q9}, [r0], r1
++        vld1.16 {q10}, [r4], r1
++        vld1.16 {q11}, [r0], r1
++        vld1.16 {q12}, [r4], r1
++        vld1.16 {q13}, [r0], r1
++        vld1.16 {q14}, [r4]
++        vld1.16 {q15}, [r0]
++
++        bl hevc_loop_filter_luma_body_\bit_depth
++
++        add      r0, r0, r1, lsl #1     @ presumably r1 comes back negated, as in m_filter_luma
++        add      r2, r4, r1, lsl #1
++        add      r6, r4, r1, asr #1
++        vpop     {d8-d15}
++
++        @ Q2-Q0 (q14 goes back to the row it was loaded from): skipped when C is set
++        bcs      1f
++        vst1.16  {q14}, [r4], r1
++        vst1.16  {q13}, [r6]
++        vst1.16  {q12}, [r4]
++1:
++        bmi      1f                     @ P0-P2 (q11, q10, q9): skipped when N is set
++        vst1.16  {q11}, [r0], r1
++        vst1.16  {q10}, [r2]
++        vst1.16  { q9}, [r0]
++1:
++        pop      {r4-r10,pc}            @ matching push is expected in the code that invokes this macro
++.endm
++
++
++@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r,        // r0
++@                                     unsigned int stride,   // r1
++@                                     uint32_t tc4,          // r2
++@                                     unsigned int no_f);    // r3
++@
++@ no_f
++@ 0  tl P0
++@ 1  tr P1
++@ 2  bl Q0
++@ 3  br Q1
++@
++@ Probably not worth having the P/Qa only special case in this direction
++@ Given layout we won't save any memory reads or avoid any cache dirtying
++@ We would save a bit of computation but I expect the partials to be less
++@ common in the H direction than V due to how we arrange deblock.
++
++function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
++        sub      r12, r0, r1            @ r12 = src_r - stride
++        cmp      r2, #0                 @ tc4 == 0: nothing to filter
++        it eq
++        bxeq     lr
++        vld1.8   {d26,d27}, [r0]        @ row at src_r
++        lsl      r1, #1
++        sub      r0, r1                 @ r0 = src_r - 2 * stride
++        vld1.8   {d18,d19}, [r12], r1   @ row at src_r - stride
++        vld1.8   {d16,d17}, [r0], r1    @ row at src_r - 2 * stride, r0 back at src_r
++        vld1.8   {d28,d29}, [r12]       @ row at src_r + stride
++
++        hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
++        "sub      r12, r0, r1, asr #1"
++
++        lsls     r3, #29                @ b2 -> N, b3 -> C
++        it pl
++        vstrpl   d26, [r0, #0]
++        it cc
++        vstrcc   d27, [r0, #8]
++        lsls     r3, #2                 @ b0 -> N, b1 -> C
++        it pl
++        vstrpl   d18, [r12, #0]
++        it cc
++        vstrcc   d19, [r12, #8]
++        bx       lr
++
++endfunc
++
++
++@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r,     // r0
++@                                     unsigned int stride,   // r1
++@                                     uint32_t tc4,          // r2
++@                                     unsigned int no_f);    // r3
++@
++@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
++@
++@ Macro here actual function near bottom
++
++.macro m_filter_h_uv_16 bit_depth
++        sub      r12, r0, r1            @ r12 = src_r - stride
++        cmp      r2, #0                 @ tc4 == 0: nothing to filter
++        it eq
++        bxeq     lr
++        vld1.16  {q12, q13}, [r0]       @ row at src_r
++        lsl      r1, #1
++        sub      r0, r1                 @ r0 = src_r - 2 * stride
++        vld1.16  {q10, q11}, [r12], r1  @ row at src_r - stride
++        vld1.16  {q8,  q9 }, [r0], r1   @ row at src_r - 2 * stride, r0 back at src_r
++        vld1.16  {q14, q15}, [r12]      @ row at src_r + stride
++
++        hevc_loop_filter_uv_body2_16  q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
++        "sub      r12, r0, r1, asr #1", \
++        "cmp      r3, #0"
++
++        bne      1f                     @ some no_f bit set: take the piecewise store path
++        vst1.16  {q10, q11}, [r12]
++        vst1.16  {q12, q13}, [r0]
++        bx       lr
++
++        @ At least one no_f bit is set
++        @ Which means we need to break this apart in an ugly fashion
++1:
++        lsls     r3, #29                @ b2 -> N, b3 -> C
++        itt pl
++        vstrpl   d24, [r0, #0]
++        vstrpl   d25, [r0, #8]
++        itt cc
++        vstrcc   d26, [r0, #16]
++        vstrcc   d27, [r0, #24]
++        lsls     r3, #2                 @ b0 -> N, b1 -> C
++        itt pl
++        vstrpl   d20, [r12, #0]
++        vstrpl   d21, [r12, #8]
++        itt cc
++        vstrcc   d22, [r12, #16]
++        vstrcc   d23, [r12, #24]
++        bx       lr
++.endm
++
++
++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r,       // r0
++@                                     unsigned int stride,   // r1
++@                                     uint32_t tc4,          // r2
++@                                     uint8_t * src_l,       // r3
++@                                     unsigned int no_f);   // sp[0]
++@
++@ no_f:
++@ 0  tl P0a
++@ 1  tr Q0a
++@ 2  bl P0b
++@ 3  br Q0b
++
++function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
++        cmp      r2, #0                 @ tc4 == 0: nothing to filter
++        it eq
++        bxeq     lr
++        push     {lr}                   @ no_f is now at [sp, #4]
++        vld2.16  {d16[0], d18[0]}, [r3], r1
++        vld2.16  {d20[0], d22[0]}, [r0], r1
++
++        cmp      r2, #0x10000           @ flags are consumed by the blo below
++        vld2.16  {d16[1], d18[1]}, [r3], r1
++        vld2.16  {d20[1], d22[1]}, [r0], r1
++
++        vld2.16  {d16[2], d18[2]}, [r3], r1
++        vld2.16  {d20[2], d22[2]}, [r0], r1
++
++        vld2.16  {d16[3], d18[3]}, [r3], r1
++        vld2.16  {d20[3], d22[3]}, [r0], r1
++        blo      10f                    @ tc4 < 0x10000: single lump only
++
++        vld2.16  {d17[0], d19[0]}, [r3], r1
++        vld2.16  {d21[0], d23[0]}, [r0], r1
++
++        sub      ip, r0, r3             @ ip = src_r - src_l (both advanced equally)
++        vld2.16  {d17[1], d19[1]}, [r3], r1
++        vld2.16  {d21[1], d23[1]}, [r0], r1
++
++        cmp      ip, #4                 @ eq when src_r == src_l + 4 (unsplit)
++        vld2.16  {d17[2], d19[2]}, [r3], r1
++        vld2.16  {d21[2], d23[2]}, [r0], r1
++
++        vld2.16  {d17[3], d19[3]}, [r3]
++        vld2.16  {d21[3], d23[3]}, [r0]
++
++        hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
++        "ldr      lr, [sp, #4]", \
++        "neg      r1, r1",       \
++        "it eq; cmpeq lr, #0",   \
++        "add      r3, #2",       \
++        "add      ip, r3, r1",   \
++        "add      r2, r0, r1",   \
++        "lsl      r1, #1"
++
++        bne      1f                     @ split pointers or some no_f bit set
++
++@ Much/most of the time r0 == r3 + 4 and no_f == 0
++@ so it is worth having this special case
++        vst2.16   {d19[3], d21[3]}, [r3], r1    @ P0b, Q0b
++        vst2.16   {d19[2], d21[2]}, [ip], r1
++        vst2.16   {d19[1], d21[1]}, [r3], r1
++        vst2.16   {d19[0], d21[0]}, [ip], r1
++        vst2.16   {d18[3], d20[3]}, [r3], r1    @ P0a, Q0a
++        vst2.16   {d18[2], d20[2]}, [ip], r1
++        vst2.16   {d18[1], d20[1]}, [r3]
++        vst2.16   {d18[0], d20[0]}, [ip]
++        pop       {pc}
++
++@ Either split or partial
++1:
++        lsls     lr, #29               @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++        ittt cs
++        addcs    r0, r0, r1, lsl #1
++        addcs    r2, r2, r1, lsl #1
++        bcs      1f
++        @ Q0b
++        vst1.16  {d21[3]}, [r0], r1
++        vst1.16  {d21[2]}, [r2], r1
++        vst1.16  {d21[1]}, [r0], r1
++        vst1.16  {d21[0]}, [r2], r1
++1:
++        ittt mi
++        addmi    r3, r3, r1, lsl #1
++        addmi    ip, ip, r1, lsl #1
++        bmi      1f
++        @ P0b
++        vst1.16  {d19[3]}, [r3], r1
++        vst1.16  {d19[2]}, [ip], r1
++        vst1.16  {d19[1]}, [r3], r1
++        vst1.16  {d19[0]}, [ip], r1
++1:
++        lsls     lr, #2                @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++        bcs      1f
++        @ Q0a
++        vst1.16  {d20[3]}, [r0], r1
++        vst1.16  {d20[2]}, [r2], r1
++        vst1.16  {d20[1]}, [r0]
++        vst1.16  {d20[0]}, [r2]
++1:
++        it       mi
++        popmi    {pc}
++        @ P0a
++        vst1.16  {d18[3]}, [r3], r1
++        vst1.16  {d18[2]}, [ip], r1
++        vst1.16  {d18[1]}, [r3]
++        vst1.16  {d18[0]}, [ip]
++        pop      {pc}
++
++@ Single lump (rather than double)
++10:
++        @ As we have post inced r0/r3 in the load the easiest thing to do is
++        @ to subtract and write forwards, rather than backwards (as above)
++        @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++        hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
++        "ldr      lr, [sp, #4]",       \
++        "add      r3, #2",             \
++        "sub      r0, r0, r1, lsl #2", \
++        "sub      r3, r3, r1, lsl #2", \
++        "lsls     lr, #31",            \
++        "add      r2, r0, r1",         \
++        "add      ip, r3, r1",         \
++        "lsl      r1, #1"
++
++        bcs      3f
++        @ Q0a
++        vst1.16  {d20[0]}, [r0], r1
++        vst1.16  {d20[1]}, [r2], r1
++        vst1.16  {d20[2]}, [r0]
++        vst1.16  {d20[3]}, [r2]
++3:
++        it       mi
++        popmi    {pc}
++        @ P0a
++        vst1.16  {d18[0]}, [r3], r1
++        vst1.16  {d18[1]}, [ip], r1
++        vst1.16  {d18[2]}, [r3]
++        vst1.16  {d18[3]}, [ip]
++        pop      {pc}
++
++endfunc
++
++
++@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r,       // r0
++@                                     unsigned int stride,   // r1
++@                                     uint32_t tc4,          // r2
++@                                     uint8_t * src_l,       // r3
++@                                     unsigned int no_f);   // sp[0]
++@
++
++@ no_f
++@ 0  tl P0a
++@ 1  tr Q0a
++@ 2  bl P0b
++@ 3  br Q0b
++
++@ P1: q8,  q12
++@ P0: q9,  q13
++@ Q0: q10, q14
++@ Q1: q11, q15
++
++.macro m_filter_v_uv2_16 bit_depth
++        cmp      r2, #0                 @ tc4 == 0: nothing to filter
++        it eq
++        bxeq     lr
++        push     {lr}                   @ no_f is now at [sp, #4]
++        vld2.32  {d16[0], d18[0]}, [r3], r1
++        vld2.32  {d20[0], d22[0]}, [r0], r1
++
++        cmp      r2, #0x10000           @ flags are consumed by the blo below
++        vld2.32  {d16[1], d18[1]}, [r3], r1
++        vld2.32  {d20[1], d22[1]}, [r0], r1
++
++        vld2.32  {d17[0], d19[0]}, [r3], r1
++        vld2.32  {d21[0], d23[0]}, [r0], r1
++
++        vld2.32  {d17[1], d19[1]}, [r3], r1
++        vld2.32  {d21[1], d23[1]}, [r0], r1
++        blo      10f                    @ tc4 < 0x10000: single lump only
++
++        vld2.32  {d24[0], d26[0]}, [r3], r1
++        vld2.32  {d28[0], d30[0]}, [r0], r1
++
++        sub      ip, r0, r3             @ ip = src_r - src_l (both advanced equally)
++        vld2.32  {d24[1], d26[1]}, [r3], r1
++        vld2.32  {d28[1], d30[1]}, [r0], r1
++
++        cmp      ip, #8                 @ eq when src_r == src_l + 8 (unsplit)
++        vld2.32  {d25[0], d27[0]}, [r3], r1
++        vld2.32  {d29[0], d31[0]}, [r0], r1
++
++        vld2.32  {d25[1], d27[1]}, [r3]
++        vld2.32  {d29[1], d31[1]}, [r0]
++
++        hevc_loop_filter_uv_body2_16  q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
++        "ldr      lr, [sp, #4]", \
++        "neg      r1, r1",       \
++        "it eq; cmpeq lr, #0",   \
++        "add      r3, #4",       \
++        "add      ip, r3, r1",   \
++        "add      r2, r0, r1",   \
++        "lsl      r1, #1"
++
++        bne      1f                     @ split pointers or some no_f bit set
++
++@ Much/most of the time r0 == r3 + 8 and no_f == 0
++@ so it is worth having this special case
++        vst2.32   {d27[1], d29[1]}, [r3], r1    @ P0b, Q0b
++        vst2.32   {d27[0], d29[0]}, [ip], r1
++        vst2.32   {d26[1], d28[1]}, [r3], r1
++        vst2.32   {d26[0], d28[0]}, [ip], r1
++        vst2.32   {d19[1], d21[1]}, [r3], r1    @ P0a, Q0a
++        vst2.32   {d19[0], d21[0]}, [ip], r1
++        vst2.32   {d18[1], d20[1]}, [r3]
++        vst2.32   {d18[0], d20[0]}, [ip]
++        pop       {pc}
++
++@ Either split or partial
++1:
++        lsls     lr, #29               @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
++        ittt cs
++        addcs    r0, r0, r1, lsl #1
++        addcs    r2, r2, r1, lsl #1
++        bcs      1f
++        @ Q0b
++        vst1.32  {d29[1]}, [r0], r1
++        vst1.32  {d29[0]}, [r2], r1
++        vst1.32  {d28[1]}, [r0], r1
++        vst1.32  {d28[0]}, [r2], r1
++1:
++        ittt mi
++        addmi    r3, r3, r1, lsl #1
++        addmi    ip, ip, r1, lsl #1
++        bmi      1f
++        @ P0b
++        vst1.32  {d27[1]}, [r3], r1
++        vst1.32  {d27[0]}, [ip], r1
++        vst1.32  {d26[1]}, [r3], r1
++        vst1.32  {d26[0]}, [ip], r1
++1:
++        lsls     lr, #2                @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
++        bcs      1f
++        @ Q0a
++        vst1.32  {d21[1]}, [r0], r1
++        vst1.32  {d21[0]}, [r2], r1
++        vst1.32  {d20[1]}, [r0]
++        vst1.32  {d20[0]}, [r2]
++1:
++        it       mi
++        popmi    {pc}
++        @ P0a
++        vst1.32  {d19[1]}, [r3], r1
++        vst1.32  {d19[0]}, [ip], r1
++        vst1.32  {d18[1]}, [r3]
++        vst1.32  {d18[0]}, [ip]
++        pop      {pc}
++
++@ Single lump (rather than double)
++10:
++        @ As we have post inced r0/r3 in the load the easiest thing to do is
++        @ to subtract and write forwards, rather than backwards (as above)
++        @ b0 (P0a) -> N, b1 (Q0a) -> C
++
++        hevc_loop_filter_uv_body1_16  q8, q9, q10, q11, \bit_depth, \
++        "ldr      lr, [sp, #4]",       \
++        "add      r3, #4",             \
++        "sub      r0, r0, r1, lsl #2", \
++        "sub      r3, r3, r1, lsl #2", \
++        "lsls     lr, #31",            \
++        "add      r2, r0, r1",         \
++        "add      ip, r3, r1",         \
++        "lsl      r1, #1"
++
++        bcs      3f
++        @ Q0a
++        vst1.32  {d20[0]}, [r0], r1
++        vst1.32  {d20[1]}, [r2], r1
++        vst1.32  {d21[0]}, [r0]
++        vst1.32  {d21[1]}, [r2]
++3:
++        it       mi
++        popmi    {pc}
++        @ P0a
++        vst1.32  {d18[0]}, [r3], r1
++        vst1.32  {d18[1]}, [ip], r1
++        vst1.32  {d19[0]}, [r3]
++        vst1.32  {d19[1]}, [ip]
++        pop      {pc}
++.endm
++
++
++@ The NEON version is faster under ideal circumstances (i.e. everything in L1)
++@ But in real world testing it is ~20% slower, presumably due to code size
++
++#if 0 // NEON version
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++ *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ *                                            int in_inc0, int in_inc1)
++ * Returns one 2-bit field per PU, each repeated dup times, first PU lowest, right-justified. */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++        mov         ip, sp                  @ ip -> first stack argument
++        push        {a1-a3,v1-v8,lr}        @ saved a1/a2 (pus, dup) are re-read from the stack later
++        ldm         ip, {v1-v6}             @ v1-v4 = the four rpl pointers, v5/v6 = in_inc0/in_inc1
++        cmp         a1, #2
++        bls         2f                      @ pus <= 2 takes the short path at 2:
++        vpush       {d8-d13}                @ q4, q5, d12 and d13 are used below
++        sub         v5, v5, #10             @ each element is read up to byte 10, so v5/v6 become
++        sub         v6, v6, #10             @ the remaining step to the next curr/neigh element
++1:
++        vld2.32     {d0[0], d2[0]}, [a3]!   @ curr mv[0] and mv[1] for lane 0
++        vld2.32     {d4[0], d6[0]}, [a4]!   @ neigh mv[0] and mv[1] for lane 0
++          vmov.u8     q12, #0
++        ldrb        a2, [a3], #1            @ curr ref_idx[0]
++        ldrb        ip, [a4], #1            @ neigh ref_idx[0]
++        ldrb        v8, [a3], #1            @ curr ref_idx[1]
++        ldrb        lr, [a4], #1            @ neigh ref_idx[1]
++        add         a2, v1, a2, lsl #2      @ address of curr_rpl0[ref_idx[0]]
++        vld1.8      {d24[0]}, [a3], v5      @ curr pred_flag, then step a3 to the next curr element
++        add         ip, v3, ip, lsl #2      @ address of neigh_rpl0[ref_idx[0]]
++        vld1.8      {d25[0]}, [a4], v6      @ neigh pred_flag, then step a4 to the next neigh element
++        add         v8, v2, v8, lsl #2      @ address of curr_rpl1[ref_idx[1]]
++        vld1.32     {d16[0]}, [a2]
++        add         lr, v4, lr, lsl #2      @ address of neigh_rpl1[ref_idx[1]]
++        vld1.32     {d20[0]}, [ip]
++        vld1.32     {d18[0]}, [v8]
++        vld1.32     {d22[0]}, [lr]
++
++        vld2.32     {d0[1], d2[1]}, [a3]!   @ same sequence again for lane 1
++        vld2.32     {d4[1], d6[1]}, [a4]!
++        ldrb        a2, [a3], #1
++          vmov.u16    d12, #1
++        ldrb        ip, [a4], #1
++          vmov.u16    d13, #2
++        ldrb        v8, [a3], #1
++          vmov.u16    d27, #4
++        ldrb        lr, [a4], #1
++        add         a2, v1, a2, lsl #2
++        vld1.8      {d24[2]}, [a3], v5
++        add         ip, v3, ip, lsl #2
++        vld1.8      {d25[2]}, [a4], v6
++        add         v8, v2, v8, lsl #2
++        vld1.32     {d16[1]}, [a2]
++        add         lr, v4, lr, lsl #2
++        vld1.32     {d20[1]}, [ip]
++        vld1.32     {d18[1]}, [v8]
++        vld1.32     {d22[1]}, [lr]
++
++        vld2.32     {d1[0], d3[0]}, [a3]!   @ lane 2
++        vld2.32     {d5[0], d7[0]}, [a4]!
++        ldrb        a2, [a3], #1
++        ldrb        ip, [a4], #1
++        ldrb        lr, [a4], #1
++        ldrb        v8, [a3], #1
++        add         a2, v1, a2, lsl #2
++        vld1.8      {d24[4]}, [a3], v5
++        add         ip, v3, ip, lsl #2
++        vld1.8      {d25[4]}, [a4], v6
++        add         v8, v2, v8, lsl #2
++        vld1.32     {d17[0]}, [a2]
++        add         lr, v4, lr, lsl #2
++        vld1.32     {d21[0]}, [ip]
++        vld1.32     {d19[0]}, [v8]
++        vld1.32     {d23[0]}, [lr]
++
++        vld2.32     {d1[1], d3[1]}, [a3]!   @ lane 3
++        vld2.32     {d5[1], d7[1]}, [a4]!
++        ldrb        a2, [a3], #1
++        ldrb        ip, [a4], #1
++        ldrb        v8, [a3], #1
++        ldrb        lr, [a4], #1
++        add         a2, v1, a2, lsl #2
++        vld1.8      {d24[6]}, [a3], v5
++        add         ip, v3, ip, lsl #2
++        vld1.8      {d25[6]}, [a4], v6
++        add         v8, v2, v8, lsl #2
++        vld1.32     {d17[1]}, [a2]
++        add         lr, v4, lr, lsl #2
++        vld1.32     {d21[1]}, [ip]
++        vld1.32     {d19[1]}, [v8]
++        vld1.32     {d23[1]}, [lr]
++
++        @ So now we have:
++        @ q0.32[i]  = curr[i].mv[0]
++        @ q1.32[i]  = curr[i].mv[1]
++        @ q2.32[i]  = neigh[i].mv[0]
++        @ q3.32[i]  = neigh[i].mv[1]
++        @ q8.32[i]  = curr_rpl0[curr[i].ref_idx[0]]
++        @ q9.32[i]  = curr_rpl1[curr[i].ref_idx[1]]
++        @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++        @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++        @ d24.16[i] = curr[i].pred_flag
++        @ d25.16[i] = neigh[i].pred_flag
++
++        vtst.16     d28, d24, d12           @ mask: curr pred_flag bit 0 set
++        vtst.16     d29, d24, d13           @ mask: curr pred_flag bit 1 set
++        vadd.i16    d8, d24, d12            @ curr pred_flag + 1
++        vadd.i16    d9, d25, d12            @ neigh pred_flag + 1
++        vtst.16     d30, d25, d12           @ mask: neigh pred_flag bit 0 set
++        vtst.16     d31, d25, d13           @ mask: neigh pred_flag bit 1 set
++        veor        d26, d8, d9             @ bit 2 ends up set when exactly one pred_flag is 3
++          ldr         lr, [sp, 6*8 + 1*4]   @ lr = dup (saved a2, above the six vpushed d registers)
++        vmovl.s16   q4, d28                 @ widen the masks to 32 bits per lane
++        vmovl.s16   q5, d29
++          teq         lr, #1                @ flags stay live until the bne 10f below
++        vmovl.s16   q14, d30
++          it ne
++          lslne       v1, lr, #1            @ v1 = 2 * dup (bits per PU)
++        vmovl.s16   q15, d31
++          it ne
++          rsbne       v2, v1, #32           @ v2 = 32 - 2 * dup
++        vbif        q0, q1, q4              @ where pred_flag bit 0 is clear use mv[1] as mv[0]
++        vbif        q2, q3, q14
++        vbif        q1, q0, q5              @ where pred_flag bit 1 is clear use mv[0] as mv[1]
++        vbif        q3, q2, q15
++        vabd.s16    q12, q0, q2             @ absolute differences per 16-bit mv component
++        vabd.s16    q2, q1
++        vabd.s16    q0, q3
++        vabd.s16    q1, q3
++        vbif        q8, q9, q4              @ same selection applied to the reference values
++        vbif        q10, q11, q14
++        vbif        q9, q8, q5
++        vbif        q11, q10, q15
++        vclt.u16    d6, d24, d27            @ component difference < 4 (d27 holds 4)
++        vclt.u16    d8, d2, d27
++        vclt.u16    d7, d25, d27
++        vclt.u16    d9, d3, d27
++        vclt.u16    d2, d0, d27
++        vclt.u16    d0, d4, d27
++        vclt.u16    d3, d1, d27
++        vclt.u16    d1, d5, d27
++        vceq.i32    q12, q10, q8            @ reference equality masks
++        vceq.i32    q10, q9
++        vceq.i32    q8, q11
++        vceq.i32    q9, q11
++        vshrn.i32   d6, q3, #8
++        vshrn.i32   d7, q4, #8
++        vshrn.i32   d8, q1, #8
++        vshrn.i32   d9, q0, #8
++        vmovn.i32   d4, q12
++        vmovn.i32   d2, q10
++        vmovn.i32   d3, q8
++        vmovn.i32   d5, q9
++        vand        q2, q3
++        vrev16.8    q3, q3
++        vand        q2, q3
++        vand        q1, q4
++        vrev16.8    q4, q4
++        vand        q1, q4
++        vand        d4, d5
++        vand        d2, d3
++        vbic        d0, d12, d4             @ start from 1 and clear it where the first test set matched
++        vshr.u16    d26, #2                 @ bring bit 2 of the pred_flag xor down to bit 0
++        vbic        d0, d2                  @ also clear where the second test set matched
++        vmov.i16    d1, #0x5555             @ multiplier that repeats a 2-bit value across 16 bits
++        vorr        d0, d26
++          bne         10f                   @ dup != 1 (flags from the teq above)
++
++        @ Merge results into result word, no duplicates
++        vmov        a2, s0
++        vmov        v8, s1
++        vmov.u16    ip, d0[1]
++        vmov.u16    lr, d0[3]
++        lsl         a2, #30                 @ keep only the low 2 bits of each lane, at the top
++        lsl         v8, #30
++        lsl         ip, #30
++        lsl         lr, #30
++        orr         a2, ip, a2, lsr #2
++        orr         v8, lr, v8, lsr #2
++        orr         a2, v8, a2, lsr #4
++        subs        a1, #4                  @ four PUs handled per pass
++        orr         v7, a2, v7, lsr #8      @ shift 8 new bits in at the top of v7
++        bhi         1b
++
++        mov         a1, #32
++        ldr         a3, [sp, #6*8]          @ a3 = original pus (saved a1)
++        vpop        {d8-d13}
++        sub         a1, a1, a3, lsl #1      @ a1 = 32 - 2 * pus
++        mov         a1, v7, lsr a1          @ right-justify the result
++        pop         {a2-a4,v1-v8,pc}
++10:
++        @ Merge results into result word, with duplicates
++        vmul.i16    d0, d1                  @ replicate each 2-bit value across its 16-bit lane
++        vmov        a2, s0
++        vmov        v8, s1
++        vmov.u16    ip, d0[1]
++        vmov.u16    lr, d0[3]
++        lsl         a2, v2
++        subs        a1, #4                  @ flags consumed by the bhi 1b below
++        lsl         v8, v2
++        lsl         ip, v2
++        lsl         lr, v2
++        ldr         v2, [sp, #6*8 + 12*4 + 1*4]  @ reload curr_rpl1 (v2 was used as a shift count)
++T       lsr         a2, v1
++T       orr         a2, ip, a2
++A       orr         a2, ip, a2, lsr v1
++        lsl         ip, v1, #1
++T       lsr         v8, v1
++T       orr         v8, lr, v8
++A       orr         v8, lr, v8, lsr v1
++        lsl         lr, v1, #2
++T       lsr         a2, ip
++T       orr         a2, v8, a2
++A       orr         a2, v8, a2, lsr ip
++        ldr         v1, [sp, #6*8 + 12*4]   @ reload curr_rpl0 (v1 was used as a shift count)
++T       lsr         v7, lr
++T       orr         v7, a2, v7
++A       orr         v7, a2, v7, lsr lr
++        bhi         1b
++
++        mov         a1, #32
++        ldrd        a3, a4, [sp, #6*8]      @ a3 = pus, a4 = dup (saved a1, a2)
++        vpop        {d8-d13}
++        mls         a1, a3, a4, a1          @ a1 = 32 - pus * dup
++        mls         a1, a3, a4, a1          @ a1 = 32 - 2 * pus * dup
++        mov         a1, v7, lsr a1          @ right-justify the result
++        pop         {a2-a4,v1-v8,pc}
++
++
++2:
++        sub         v5, v5, #10             @ same stride adjustment as on the main path
++        sub         v6, v6, #10
++        vmov.u8     d16, #0
++        blo         3f                      @ flags still from cmp a1, #2: pus < 2 loads lane 1 only
++        vld2.32     {d0[0], d1[0]}, [a3]!
++        vld2.32     {d2[0], d3[0]}, [a4]!
++        ldrb        a2, [a3], #1
++        ldrb        ip, [a4], #1
++        ldrb        lr, [a4], #1
++        ldrb        v8, [a3], #1
++        add         a2, v1, a2, lsl #2
++        vld1.8      {d16[0]}, [a3], v5
++        add         ip, v3, ip, lsl #2
++        vld1.8      {d16[4]}, [a4], v6
++        add         v8, v2, v8, lsl #2
++        vld1.32     {d4[0]}, [a2]
++        add         lr, v4, lr, lsl #2
++        vld1.32     {d5[0]}, [ip]
++        vld1.32     {d6[0]}, [v8]
++        vld1.32     {d7[0]}, [lr]
++
++3:
++        vld2.32     {d0[1], d1[1]}, [a3]!
++        vld2.32     {d2[1], d3[1]}, [a4]!
++        ldrb        a2, [a3], #1
++          vmov.u16    d17, #1
++        ldrb        ip, [a4], #1
++          vmov.u16    d18, #2
++        ldrb        v8, [a3], #1
++          vmov.u16    d19, #4
++        ldrb        lr, [a4], #1
++        add         a2, v1, a2, lsl #2
++        vld1.8      {d16[2]}, [a3], v5
++        add         ip, v3, ip, lsl #2
++        vld1.8      {d16[6]}, [a4], v6
++        add         v8, v2, v8, lsl #2
++        vld1.32     {d4[1]}, [a2]
++        add         lr, v4, lr, lsl #2
++        vld1.32     {d5[1]}, [ip]
++        vld1.32     {d6[1]}, [v8]
++        vld1.32     {d7[1]}, [lr]
++
++        @ So now we have:
++        @ d0.32[i]  = curr[i].mv[0]
++        @ d1.32[i]  = curr[i].mv[1]
++        @ d2.32[i]  = neigh[i].mv[0]
++        @ d3.32[i]  = neigh[i].mv[1]
++        @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
++        @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
++        @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
++        @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
++        @ d16.16[i] = curr[i].pred_flag
++        @ d16.16[2+i] = neigh[i].pred_flag
++
++        vtst.16     d20, d16, d17           @ mask: pred_flag bit 0 set (curr and neigh lanes)
++        vtst.16     d22, d16, d18           @ mask: pred_flag bit 1 set
++        vadd.i16    d30, d16, d17           @ pred_flag + 1
++        vswp        d2, d3
++        ldr         lr, [sp, #1*4]          @ lr = dup (saved a2, nothing vpushed on this path)
++        vmovl.s16   q10, d20
++          teq         lr, #1                @ flags stay live until the bne 10f below
++        vmovl.s16   q11, d22
++          it ne
++          lslne       v1, lr, #1            @ v1 = 2 * dup
++        vbif        d0, d1, d20
++        vbif        d4, d6, d20
++        vbif        d3, d2, d21
++        vbif        d5, d7, d21
++        vbif        d1, d0, d22
++        vbif        d6, d4, d22
++        vbif        d2, d3, d23
++        vbif        d7, d5, d23
++        vshr.u16    d30, #2
++        vabd.s16    d24, d0, d3
++        vabd.s16    d25, d1, d2
++        vabd.s16    q0, q0, q1
++        vceq.i32    d2, d4, d5
++        vceq.i32    d20, d5, d6
++        vceq.i32    d21, d4, d7
++        vceq.i32    d3, d6, d7
++        vclt.u16    d6, d24, d19            @ component difference < 4 (d19 holds 4)
++        vclt.u16    d7, d25, d19
++        vclt.u16    d22, d1, d19
++        vclt.u16    d23, d0, d19
++        vshrn.i32   d6, q3, #8
++        vmovn.i32   d2, q1
++        vshrn.i32   d7, q11, #8
++        vmovn.i32   d3, q10
++        vand        q0, q3, q1
++          it ne
++          rsbne       v2, v1, #32           @ v2 = 32 - 2 * dup
++        vrev16.8    q3, q3
++        vand        q0, q3
++        vsra.u64    d30, #32
++        vshr.u64    q1, q0, #32
++        vand        q0, q1
++        vbic        d0, d17, d0
++        vand        d30, d30, d17
++        vbic        d0, d1
++        vmov.i16    d1, #0x5555             @ multiplier that repeats a 2-bit value across 16 bits
++        vorr        d0, d30
++          bne         10f                   @ dup != 1 (flags from the teq above)
++
++        @ Construct result word, no duplicates
++        cmp         a1, #2
++        vmov.u16    a1, d0[1]
++        vmov.u16    a2, d0[0]
++        it eq
++        orreq       a1, a2, a1, lsl #2      @ pus == 2: lane 0 goes below lane 1
++        pop         {a2-a4,v1-v8,pc}
++10:
++        @ Construct result word, with duplicates
++        cmp         a1, #2
++        vmul.i16    d0, d1                  @ replicate each 2-bit value across its 16-bit lane
++        vmov        a2, s0
++        vmov.u16    a1, d0[1]
++        lsl         a2, #16
++        pkhbt       a1, a1, a1, lsl #16
++        lsr         a2, v2                  @ keep 2 * dup bits of lane 0
++        lsr         a1, v2                  @ keep 2 * dup bits of lane 1
++T       itt eq
++T       lsleq       a1, v1
++T       orreq       a1, a2, a1
++A       orreq       a1, a2, a1, lsl v1
++        pop         {a2-a4,v1-v8,pc}
++endfunc
++
++
++
++#else // non-NEON version
++
++
++/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++ *                                            const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++ *                                            int in_inc0, int in_inc1)
++ * Scalar version: handles one PU per pass of loop 1 and emits its 2-bit field dup times. */
++function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
++        add         ip, sp, #4*4            @ ip -> just past the four rpl pointers on the stack
++        push        {a2-a4,v1-v8,lr}        @ saved a2 (dup) sits at [sp]
++        mov         v6, #32                 @ final right shift, reduced by 2 per emitted field
++1:      ldmdb       ip, {v1-v4}             @ v1-v4 = curr_rpl0, curr_rpl1, neigh_rpl0, neigh_rpl1
++        ldrsb       v5, [a3, #8]    @ curr->ref_idx
++        ldrsb       v8, [a3, #9]
++        ldrsb       ip, [a4, #8]    @ neigh->ref_idx
++        ldrsb       lr, [a4, #9]
++        ldr         v1, [v1, v5, lsl #2]    @ curr_rpl0[ref_idx[0]]
++        ldrb        v5, [a3, #10]   @ curr->pred_flag
++        ldr         v2, [v2, v8, lsl #2]    @ curr_rpl1[ref_idx[1]]
++        ldrb        v8, [a4, #10]   @ neigh->pred_flag
++        ldr         v3, [v3, ip, lsl #2]    @ neigh_rpl0[ref_idx[0]]
++        ldr         v4, [v4, lr, lsl #2]    @ neigh_rpl1[ref_idx[1]]
++        teq         v5, #3
++        beq         20f                     @ curr pred_flag == 3
++        teq         v8, #3
++        beq         90f                     @ only neigh has pred_flag == 3
++
++        tst         v5, #1
++        itee        ne
++        ldrne       v5, [a3, #0]    @ curr->mv[0]
++        moveq       v1, v2                  @ bit 0 clear: compare the list 1 reference instead
++        ldreq       v5, [a3, #4]    @ curr->mv[1]
++        tst         v8, #1
++        itee        ne
++        ldrne       v8, [a4, #0]    @ neigh->mv[0]
++        moveq       v3, v4
++        ldreq       v8, [a4, #4]    @ neigh->mv[1]
++        teq         v1, v3
++        bne         10f                     @ references differ: flags are ne so the field becomes 1
++        ldr         lr, =0xFFFCFFFC         @ per-halfword mask: nonzero result means a difference >= 4
++        ssub16      ip, v8, v5
++        ssub16      v5, v5, v8              @ sets the GE flags that sel uses
++        sel         v5, v5, ip              @ absolute difference of each 16-bit component
++        ands        v5, v5, lr
++        @ drop through
++10:     it          ne
++        movne       v5, #1<<30              @ field value 1, placed at the top of the word
++11:
++        sub         v6, v6, #2
++T       mov         v7, v7, lsr #2
++        subs        a2, a2, #1              @ emit the field dup times
++A       orr         v7, v5, v7, lsr #2
++T       orr         v7, v5, v7
++        bhi         11b
++
++        ldrd        v3, v4, [sp, #16*4]     @ v3 = in_inc0, v4 = in_inc1
++        ldr         a2, [sp]                @ a2 = dup again
++        add         ip, sp, #16*4           @ ip was clobbered: point past the rpl pointers again
++        subs        a1, a1, #1              @ one PU done
++        add         a3, a3, v3
++        add         a4, a4, v4
++        bhi         1b
++        mov         a1, v7, lsr v6          @ right-justify the collected fields
++        pop         {a2-a4,v1-v8,pc}
++
++20:     teq         v8, #3
++        bne         10b                     @ neigh pred_flag != 3: flags are ne so the field becomes 1
++
++        teq         v1, v3
++        it          eq
++        teqeq       v2, v4
++        bne         40f                     @ no list-for-list reference match: try the crossed pairing
++        teq         v1, v2
++        bne         30f                     @ curr refs differ from each other: straight compare only
++
++        ldrd        v1, v2, [a3]    @ curr->mv
++        ldrd        v3, v4, [a4]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        ssub16      ip, v3, v1
++        ssub16      v5, v1, v3
++        sel         v5, v5, ip
++        ands        v5, v5, lr
++        bne         25f                     @ straight pairing failed: check the crossed pairing
++        ssub16      ip, v4, v2
++        ssub16      v5, v2, v4
++        sel         v5, v5, ip
++        ands        v5, v5, lr
++        beq         11b                     @ straight pairing within range: field is 0
++        @ drop through
++25:     ssub16      ip, v4, v1              @ crossed pairing: curr mv[0] against neigh mv[1]
++        ssub16      v5, v1, v4
++        sel         v5, v5, ip
++        ands        v5, v5, lr
++        bne         10b
++        ssub16      ip, v3, v2              @ and curr mv[1] against neigh mv[0]
++        ssub16      v5, v2, v3
++        sel         v5, v5, ip
++        ands        v5, v5, lr
++        b           10b
++
++30:     ldrd        v1, v2, [a3]    @ curr->mv
++        ldrd        v3, v4, [a4]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        ssub16      ip, v3, v1
++        ssub16      v5, v1, v3
++        sel         v5, v5, ip
++        ands        v5, v5, lr
++        bne         10b
++        ssub16      ip, v4, v2
++        ssub16      v5, v2, v4
++        sel         v5, v5, ip
++        ands        v5, v5, lr
++        b           10b
++
++40:     teq         v1, v4                  @ crossed reference match: curr list 0 with neigh list 1
++        ite         eq
++        teqeq       v2, v3
++        bne         10b
++
++        ldrd        v1, v2, [a3]    @ curr->mv
++        ldrd        v3, v4, [a4]    @ neigh->mv
++        ldr         lr, =0xFFFCFFFC
++        b           25b
++
++90:
++        mov         v5, #1<<30              @ field value 1 without any mv comparison
++        b           11b
++endfunc
++
++
++#endif
++
++
++@ =============================================================================
++@
++@ 10 bit
++
++function hevc_loop_filter_luma_body_10
++        m_filter_luma 10, q11, q15          @ whole body is the m_filter_luma macro, first argument 10 as in the section title
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
++        hevc_loop_filter_luma_start
++        b        .Lh_loop_luma_common_10   @ continue in the body shared with the luma2 entry below
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1
++        cmp      r3, #0                 @ r3 is the fourth argument (tc2 in the C prototype)
++        it       eq
++        bxeq     lr                     @ return at once when it is zero
++        push     {r4-r10,lr}            @ 32 bytes
++        ldr      r10, [sp, #32]         @ first stack argument (no_f in the C prototype)
++.Lh_loop_luma_common_10:
++        m_filter_h_luma_16 10
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1
++        hevc_loop_filter_luma_start
++        sub      r4, r0, #8             @ r4 = r0 - 8 bytes, the register the luma2 entry loads _pix_l into
++        b        .Lv_loop_luma_common_10   @ continue in the body shared with the luma2 entry below
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1
++        cmp      r3, #0                 @ r3 is the fourth argument (tc2 in the C prototype)
++        it       eq
++        bxeq     lr                     @ return at once when it is zero
++        push     {r4-r10,lr}            @ 32 bytes
++        ldr      r4, [sp, #36]          @ second stack argument (_pix_l in the C prototype)
++        ldr      r10, [sp, #32]         @ first stack argument (no_f in the C prototype)
++
++.Lv_loop_luma_common_10:
++        m_filter_v_luma_16 10
++endfunc
++
++function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
++        m_filter_h_uv_16 10             @ whole body is the m_filter_h_uv_16 macro with argument 10
++endfunc
++
++function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1
++        m_filter_v_uv2_16 10            @ whole body is the m_filter_v_uv2_16 macro with argument 10
++endfunc
++
+diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
+new file mode 100644
+index 0000000000..7ed5c7dc52
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
+@@ -0,0 +1,184 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++/* 4-point luma transform of src0..src3 in d28..d31, results narrowed back into d28..d31; uses registers q8 - q13 for temp values */
++.macro tr4_luma_shift shift
++        vaddl.s16   q8, d28, d30    // c0 = src0 + src2
++        vaddl.s16   q9, d30, d31    // c1 = src2 + src3
++        vsubl.s16   q10, d28, d31   // c2 = src0 - src3
++        vaddl.s16   q11, d28, d31   // src0 + src3
++
++        vmul.i32    q12, q8, d1[0]  // 29 * c0
++        vmul.i32    q13, q10, d2[0] // 55 * c2
++        vmul.i32    q8, q8, d2[0]   // 55 * c0
++        vmull.s16   q14, d29, d0[0] // c3 = 74 * src1 (overwrites src0/src1, no longer needed)
++
++        vsubw.s16   q11, q11, d30   // src0 - src2 + src3
++        vmla.i32    q12, q9, d2[0]  // 29 * c0 + 55 * c1
++        vmls.i32    q13, q9, d1[0]  // 55 * c2 - 29 * c1
++        vmla.i32    q8, q10, d1[0]  // 55 * c0 + 29 * c2
++
++        vmul.i32    q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3)
++        vadd.i32    q12, q12, q14   // dst0 = 29 * c0 + 55 * c1 + c3
++        vadd.i32    q13, q13, q14   // dst1 = 55 * c2 - 29 * c1 + c3
++        vsub.i32    q8, q8, q14     // dst3 = 55 * c0 + 29 * c2 - c3
++
++        vqrshrn.s32 d28, q12, \shift   // saturating rounding narrow of dst0
++        vqrshrn.s32 d29, q13, \shift   // dst1
++        vqrshrn.s32 d30, q11, \shift   // dst2
++        vqrshrn.s32 d31, q8, \shift    // dst3
++.endm
++
++/* 4-point transform of src0..src3 in d28..d31 with d0[0] = 83 and d0[1] = 36, results back in d28..d31; uses registers q8 - q11 for temp values */
++.macro tr4_shift shift
++        vmull.s16   q9, d29, d0[0]   // 83 * src1
++        vmull.s16   q8, d29, d0[1]   // 36 * src1
++        vshll.s16   q14, d28, #6     // 64 * src0
++        vshll.s16   q10, d30, #6     // 64 * src2
++        vmlal.s16   q9, d31, d0[1]   // 83 * src1 + 36 * src3  o0
++        vmlsl.s16   q8, d31, d0[0]   // 36 * src1 - 83 * src3  o1
++        vadd.s32    q11, q14, q10    // 64 * (src0 + src2)     e0
++        vsub.s32    q10, q14, q10    // 64 * (src0 - src2)     e1
++        vadd.s32    q14, q11, q9     // e0 + o0
++        vadd.s32    q15, q10, q8     // e1 + o1
++        vsub.s32    q8, q10, q8      // e1 - o1
++        vsub.s32    q9, q11, q9      // e0 - o0
++
++        vqrshrn.s32 d28, q14, \shift   // dst0 = e0 + o0, saturating rounding narrow
++        vqrshrn.s32 d29, q15, \shift   // dst1 = e1 + o1
++        vqrshrn.s32 d30, q8, \shift    // dst2 = e1 - o1
++        vqrshrn.s32 d31, q9, \shift    // dst3 = e0 - o0
++.endm
++
++.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7,                         \
++                   tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \
++                   tmp1, /* Q reg which doesn't alias with d7 or d0     */ \
++                   shift, I1, I2, I3
++        // 8-point transform in place on \d0..\d7; odd inputs accumulate in q4-q7, even inputs use the extra-indented tr4 stage; I1-I3 are optional instructions slotted between the first multiplies
++        vmull.s16  q4, \d1, d1[1]        // 89 * src1
++        \I1
++        vmull.s16  q5, \d1, d1[0]        // 75 * src1
++        \I2
++        vmull.s16  q6, \d1, d1[3]        // 50 * src1
++        \I3
++        vmull.s16  q7, \d1, d1[2]        // 18 * src1
++        vmlal.s16  q4, \d3, d1[0]        // 75 * src3
++        vmlsl.s16  q5, \d3, d1[2]        //-18 * src3
++        vmlsl.s16  q6, \d3, d1[1]        //-89 * src3
++        vmlsl.s16  q7, \d3, d1[3]        //-50 * src3
++
++          // tr4
++          vmull.s16  q1, \d2, d0[0]      // 83 * src(1*2)
++          vmull.s16  q2, \d2, d0[1]      // 36 * src(1*2)
++
++        vmlal.s16  q4, \d5, d1[3]        // 50 * src5
++        vmlsl.s16  q5, \d5, d1[1]        //-89 * src5
++        vmlal.s16  q6, \d5, d1[2]        // 18 * src5
++        vmlal.s16  q7, \d5, d1[0]        // 75 * src5
++
++          vshll.s16  q3, \d0, #6         // 64 * src(0*2)
++          vshll.s16  \tmp0, \d4, #6      // 64 * src(2*2)
++          vmlal.s16  q1, \d6, d0[1]      // 83 * src(1*2) + 36 * src(3*2)  o0
++          vmlsl.s16  q2, \d6, d0[0]      // 36 * src(1*2) - 83 * src(3*2)  o1
++          vadd.i32   \tmp1, q3, \tmp0    // 64 * (src(0*2) + src(2*2))     e0
++          vsub.i32   \tmp0, q3, \tmp0    // 64 * (src(0*2) - src(2*2))     e1
++
++        vmlal.s16  q4, \d7, d1[2]        // 18 * src7
++        vmlsl.s16  q5, \d7, d1[3]        //-50 * src7
++        vmlal.s16  q6, \d7, d1[0]        // 75 * src7
++        vmlsl.s16  q7, \d7, d1[1]        //-89 * src7
++
++          vsub.i32   q3, \tmp1, q1       // e0 - o0
++          vadd.i32   \tmp1, \tmp1, q1    // e0 + o0
++          vadd.i32   q1, \tmp0, q2       // e1 + o1
++          vsub.i32   q2, \tmp0, q2       // e1 - o1
++
++        vadd.i32   \tmp0, \tmp1, q4      // e_8[0] + o_8[0], dst[0]
++        vsub.i32   q4, \tmp1, q4         // e_8[0] - o_8[0], dst[7]
++        vsub.i32   \tmp1, q3, q7         // e_8[3] - o_8[3], dst[4]
++        vadd.i32   q7, q3, q7            // e_8[3] + o_8[3], dst[3]
++        vadd.i32   q3, q1, q5            // e_8[1] + o_8[1], dst[1]
++        vsub.i32   q5, q1, q5            // e_8[1] - o_8[1], dst[6]
++        vsub.i32   q1, q2, q6            // e_8[2] - o_8[2], dst[5]
++        vadd.i32   q6, q2, q6            // e_8[2] + o_8[2], dst[2]
++        vqrshrn.s32   \d0, \tmp0, #\shift   // dst[0], saturating rounding narrow
++        vqrshrn.s32   \d4, \tmp1, #\shift   // dst[4]
++        vqrshrn.s32   \d1, q3, #\shift      // dst[1]
++        vqrshrn.s32   \d5, q1, #\shift      // dst[5]
++        vqrshrn.s32   \d2, q6, #\shift      // dst[2]
++        vqrshrn.s32   \d6, q5, #\shift      // dst[6]
++        vqrshrn.s32   \d3, q7, #\shift      // dst[3]
++        vqrshrn.s32   \d7, q4, #\shift      // dst[7]
++.endm
++
++.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
++        vld1.16     {\d0}, [r0 :64], r3   // even-numbered inputs are read via r0, odd via r2; both step by r3
++        vld1.16     {\d1}, [r2 :64], r3
++        vld1.16     {\d2}, [r0 :64], r3
++        vld1.16     {\d3}, [r2 :64], r3
++        vld1.16     {\d4}, [r0 :64], r3
++        vld1.16     {\d5}, [r2 :64], r3
++        vld1.16     {\d6}, [r0 :64], r3
++        vld1.16     {\d7}, [r2 :64], r3
++
++        tr8_process \
++            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
++            \q01, \q23, 7, "\I1", "\I2", "\I3"   // this pass always uses a shift of 7
++.endm
++
++.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
++        tr8_process \
++            \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
++            \q01, \q23, \shift   // no I1-I3 instructions are interleaved in this pass
++
++        vzip.16    \d0, \d4      // interleave each result with the one four rows on
++        vzip.16    \d1, \d5
++        vzip.16    \d2, \d6
++        vzip.16    \d3, \d7
++        vst4.16    {\d0-\d3}, [r0 :128], r3   // 4-way interleaved store, then step r0 by r3
++        vst4.16    {\d4-\d7}, [r2 :128], r3   // second half goes out via r2
++.endm
++
++#define BIT_DEPTH 8
++#include "rpi_hevc_idct_fn_neon.S"
++
++.text
++
++.align 4
++tr4f:
++.word 0x00240053  // 36 and 83: tr4_shift and tr8_process read these as d0[1] = 36, d0[0] = 83
++.word 0x00000000
++tr8f:
++.word 0x0059004b  // 89, 75: tr8_process reads these as d1[1] = 89, d1[0] = 75
++.word 0x00320012  // 50, 18: tr8_process reads these as d1[3] = 50, d1[2] = 18
++tr16:
++.word 0x005a0057  // 90, d2[0] = 87
++.word 0x00500046  // 80, d2[2] = 70
++.word 0x0039002b  // 57, 43 (second 64 bits of tr16; presumably d3[0] = 43 - TODO confirm against the loader)
++.word 0x00190009  // 25, 9  (presumably d3[2] = 9 - TODO confirm against the loader)
++
++#undef BIT_DEPTH
++#define BIT_DEPTH 10
++#include "rpi_hevc_idct_fn_neon.S"
++
+diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c
+new file mode 100644
+index 0000000000..109fa98c29
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
+@@ -0,0 +1,32 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/arm/cpu.h"
++#include "libavcodec/rpi_hevcdsp.h"
++#include "rpi_hevcdsp_arm.h"
++
++av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
++{
++    const int flags = av_get_cpu_flags();   /* runtime CPU feature mask */
++    if (!have_neon(flags))                  /* no NEON reported: leave c untouched */
++        return;
++    ff_hevcdsp_rpi_init_neon(c, bit_depth);
++}
+diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
+new file mode 100644
+index 0000000000..9294ab8010
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
+@@ -0,0 +1,467 @@
++/*
++ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++#include "libavutil/attributes.h"
++#include "libavutil/arm/cpu.h"
++#include "libavcodec/rpi_hevcdsp.h"
++#include "rpi_hevcdsp_arm.h"
++#include "libavcodec/avcodec.h"
++#include "libavcodec/bit_depth_template.c"
++
++// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
++// have been removed from head as we never use them.
++
++void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++
++void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
++
++void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r,
++                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
++                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++                             uint8_t * _pix_l);
++void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
++                             unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                             uint8_t * src_l,
++                             unsigned int no_f);
++
++void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r,
++                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
++                             unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++                             uint8_t * _pix_l);
++void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
++                             unsigned int no_f);
++void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                             uint8_t * src_l,
++                             unsigned int no_f);
++
++void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs);
++void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs);
++
++void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
++void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs);
++void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs);
++
++void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                     ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                     ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
++
++void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
++                                     ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
++                                     ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
++                                       ptrdiff_t stride);
++
++void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++
++
++void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_v);
++void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride, int dc_u);
++void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
++                                       ptrdiff_t stride);
++void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
++
++void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height);
++
++void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height);
++void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height);
++
++void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height);
++void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height);
++void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height);
++
++void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height);
++void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height);
++void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height);
++
++void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++
++void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height);
++
++
++uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
++                                                const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++                                                int in_inc0, int in_inc1);
++void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
++
++
++static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 48-wide SAO edge filter composed from the 32-wide and 16-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); /* first part, fixed width 32 */
++    ff_hevc_rpi_sao_edge_16_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 16, height); /* second part, 32 bytes into dst and src */
++}
++static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 48-wide SAO edge filter (_10 variant) composed from the 32-wide and 16-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height); /* first part, fixed width 32 */
++    ff_hevc_rpi_sao_edge_16_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val, eo, 16, height); /* second part, 64 bytes in: presumably 32 samples of 2 bytes each - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 48-wide SAO band filter composed from the 32-wide and 16-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); /* first part, fixed width 32 */
++    ff_hevc_rpi_sao_band_16_neon_8(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* second part, 32 bytes into dst and src */
++}
++static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 48-wide SAO band filter (_10 variant) composed from the 32-wide and 16-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 32, height); /* first part, fixed width 32 */
++    ff_hevc_rpi_sao_band_16_neon_10(_dst + 64, _src + 64, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* second part, 64 bytes in: presumably 32 samples of 2 bytes each - confirm */
++}
++
++#if SAO_FILTER_N == 6
++static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 24-wide SAO edge filter composed from the 16-wide and 8-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_edge_8_neon_8(_dst + 16, _src + 16, stride_dst, _sao_offset_val, eo, 8, height); /* second part, 16 bytes into dst and src */
++}
++static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height)
++{ /* 24-wide SAO edge filter (_10 variant) composed from the 16-wide and 8-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_edge_8_neon_10(_dst + 32, _src + 32, stride_dst, _sao_offset_val, eo, 8, height); /* second part, 32 bytes in: presumably 16 samples of 2 bytes each - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 24-wide SAO band filter composed from the 16-wide and 8-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_band_8_neon_8(_dst + 16, _src + 16, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); /* second part, 16 bytes into dst and src */
++}
++static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                int16_t *sao_offset_val, int sao_left_class, int width, int height)
++{ /* 24-wide SAO band filter (_10 variant) composed from the 16-wide and 8-wide NEON routines; the width argument is not used. */
++    ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src, sao_offset_val, sao_left_class, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_band_8_neon_10(_dst + 32, _src + 32, stride_dst, stride_src, sao_offset_val, sao_left_class, 8, height); /* second part, 32 bytes in: presumably 16 samples of 2 bytes each - confirm */
++}
++
++static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height)
++{ /* 24-wide chroma SAO edge filter composed from the 16-wide and 8-wide _c routines; the width argument is not used. */
++    ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_edge_c_8_neon_8(_dst + 32, _src + 32, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); /* second part, 32 bytes in: presumably 16 interleaved U/V byte pairs - confirm */
++}
++static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
++                                  int eo, int width, int height)
++{ /* 24-wide chroma SAO edge filter (_10 variant) composed from the 16-wide and 8-wide _c routines; the width argument is not used. */
++    ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_edge_c_8_neon_10(_dst + 64, _src + 64, stride_dst, _sao_offset_val_u, _sao_offset_val_v, eo, 8, height); /* second part, 64 bytes in: presumably 16 interleaved U/V pairs of 2-byte samples - confirm */
++}
++
++static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height)
++{ /* 24-wide chroma SAO band filter composed from the 16-wide and 8-wide _c routines; the width argument is not used. */
++    ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
++                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_band_c_8_neon_8(_dst + 32, _src + 32, stride_dst, stride_src,
++                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); /* second part, 32 bytes in: presumably 16 interleaved U/V byte pairs - confirm */
++}
++static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height)
++{ /* 24-wide chroma SAO band filter (_10 variant) composed from the 16-wide and 8-wide _c routines; the width argument is not used. */
++    ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
++                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 16, height); /* first part, fixed width 16 */
++    ff_hevc_rpi_sao_band_c_8_neon_10(_dst + 64, _src + 64, stride_dst, stride_src,
++                                sao_offset_val_u, sao_left_class_u, sao_offset_val_v, sao_left_class_v, 8, height); /* second part, 64 bytes in: presumably 16 interleaved U/V pairs of 2-byte samples - confirm */
++}
++#endif
++
++
++
++#if RPI_HEVC_SAO_BUF_STRIDE != 160
++#error SAO edge src stride not 160 - value used in .S
++#endif /* build-time guard: per the error text above, the assembly uses the value 160 for this stride */
++/* Install the NEON routines into *c for bit_depth 8 or 10; for any other depth only the two unconditional pointers at the end are set. */
++av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
++{
++    if (bit_depth == 8) {
++        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_8;
++        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_8; /* same routine as the non-_c slot */
++        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_8;
++        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_8; /* same routine as the non-_c slot */
++        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
++        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
++        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_8;
++        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
++        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_8;
++        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_8;
++        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_8;
++        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_8;
++        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_8;
++        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_8;
++        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_8;
++        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_8;
++        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_8;
++        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_8;
++        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
++        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
++        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
++        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
++        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_8;
++        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_8;
++        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_8;
++        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_8;
++        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_8;
++        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_8;
++        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_8;
++        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_8;
++        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_8;
++        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
++        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
++        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
++        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_8;
++        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_8;
++        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_8;
++        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_8;
++        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_8; /* static C wrapper in this file: 32-wide + 16-wide */
++        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_8;
++        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_8;
++        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_8;
++        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_8;
++        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_8; /* static C wrapper in this file: 32-wide + 16-wide */
++        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_8;
++#if SAO_FILTER_N == 6 /* slot 5 (the 24-wide wrappers) is only filled in when SAO_FILTER_N is 6 */
++        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_8;
++        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_8;
++#endif
++        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_8;
++        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_8;
++        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_8;
++
++        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_8;
++        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_8;
++        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_8;
++        /* NOTE(review): slots [3] and [4] of sao_band_filter_c and sao_edge_filter_c are not assigned in this function. */
++#if SAO_FILTER_N == 6
++        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_8;
++        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_8;
++#endif
++    }
++    else if (bit_depth == 10) {
++        c->hevc_v_loop_filter_luma     = ff_hevc_rpi_v_loop_filter_luma_neon_10;
++        c->hevc_v_loop_filter_luma_c   = ff_hevc_rpi_v_loop_filter_luma_neon_10; /* same routine as the non-_c slot */
++        c->hevc_h_loop_filter_luma     = ff_hevc_rpi_h_loop_filter_luma_neon_10;
++        c->hevc_h_loop_filter_luma_c   = ff_hevc_rpi_h_loop_filter_luma_neon_10; /* same routine as the non-_c slot */
++        c->hevc_h_loop_filter_luma2    = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
++        c->hevc_v_loop_filter_luma2    = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
++        c->hevc_h_loop_filter_uv       = ff_hevc_rpi_h_loop_filter_uv_neon_10;
++        c->hevc_v_loop_filter_uv2      = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
++        c->idct[0]                     = ff_hevc_rpi_transform_4x4_neon_10;
++        c->idct[1]                     = ff_hevc_rpi_transform_8x8_neon_10;
++        c->idct_dc[0]                  = ff_hevc_rpi_idct_4x4_dc_neon_10;
++        c->idct_dc[1]                  = ff_hevc_rpi_idct_8x8_dc_neon_10;
++        c->idct_dc[2]                  = ff_hevc_rpi_idct_16x16_dc_neon_10;
++        c->idct_dc[3]                  = ff_hevc_rpi_idct_32x32_dc_neon_10;
++        c->add_residual[0]             = ff_hevc_rpi_add_residual_4x4_neon_10;
++        c->add_residual[1]             = ff_hevc_rpi_add_residual_8x8_neon_10;
++        c->add_residual[2]             = ff_hevc_rpi_add_residual_16x16_neon_10;
++        c->add_residual[3]             = ff_hevc_rpi_add_residual_32x32_neon_10;
++        c->add_residual_dc[0]          = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
++        c->add_residual_dc[1]          = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
++        c->add_residual_dc[2]          = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
++        c->add_residual_dc[3]          = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
++        c->add_residual_u[0]           = ff_hevc_rpi_add_residual_4x4_u_neon_10;
++        c->add_residual_u[1]           = ff_hevc_rpi_add_residual_8x8_u_neon_10;
++        c->add_residual_u[2]           = ff_hevc_rpi_add_residual_16x16_u_neon_10;
++        c->add_residual_v[0]           = ff_hevc_rpi_add_residual_4x4_v_neon_10;
++        c->add_residual_v[1]           = ff_hevc_rpi_add_residual_8x8_v_neon_10;
++        c->add_residual_v[2]           = ff_hevc_rpi_add_residual_16x16_v_neon_10;
++        c->add_residual_c[0]           = ff_hevc_rpi_add_residual_4x4_c_neon_10;
++        c->add_residual_c[1]           = ff_hevc_rpi_add_residual_8x8_c_neon_10;
++        c->add_residual_c[2]           = ff_hevc_rpi_add_residual_16x16_c_neon_10;
++        c->add_residual_dc_c[0]        = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
++        c->add_residual_dc_c[1]        = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
++        c->add_residual_dc_c[2]        = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
++        c->transform_4x4_luma          = ff_hevc_rpi_transform_luma_4x4_neon_10;
++        c->sao_band_filter[0]          = ff_hevc_rpi_sao_band_8_neon_10;
++        c->sao_band_filter[1]          = ff_hevc_rpi_sao_band_16_neon_10;
++        c->sao_band_filter[2]          = ff_hevc_rpi_sao_band_32_neon_10;
++        c->sao_band_filter[3]          = ff_hevc_rpi_sao_band_48_neon_10; /* static C wrapper in this file: 32-wide + 16-wide */
++        c->sao_band_filter[4]          = ff_hevc_rpi_sao_band_64_neon_10;
++
++        c->sao_edge_filter[0]          = ff_hevc_rpi_sao_edge_8_neon_10;
++        c->sao_edge_filter[1]          = ff_hevc_rpi_sao_edge_16_neon_10;
++        c->sao_edge_filter[2]          = ff_hevc_rpi_sao_edge_32_neon_10;
++        c->sao_edge_filter[3]          = ff_hevc_rpi_sao_edge_48_neon_10; /* static C wrapper in this file: 32-wide + 16-wide */
++        c->sao_edge_filter[4]          = ff_hevc_rpi_sao_edge_64_neon_10;
++#if SAO_FILTER_N == 6 /* slot 5 (the 24-wide wrappers) is only filled in when SAO_FILTER_N is 6 */
++        c->sao_band_filter[5]          = ff_hevc_rpi_sao_band_24_neon_10;
++        c->sao_edge_filter[5]          = ff_hevc_rpi_sao_edge_24_neon_10;
++#endif
++        c->sao_band_filter_c[0]        = ff_hevc_rpi_sao_band_c_8_neon_10;
++        c->sao_band_filter_c[1]        = ff_hevc_rpi_sao_band_c_16_neon_10;
++        c->sao_band_filter_c[2]        = ff_hevc_rpi_sao_band_c_32_neon_10;
++
++        c->sao_edge_filter_c[0]        = ff_hevc_rpi_sao_edge_c_8_neon_10;
++        c->sao_edge_filter_c[1]        = ff_hevc_rpi_sao_edge_c_16_neon_10;
++        c->sao_edge_filter_c[2]        = ff_hevc_rpi_sao_edge_c_32_neon_10;
++        /* NOTE(review): slots [3] and [4] of sao_band_filter_c and sao_edge_filter_c are not assigned in this function. */
++#if SAO_FILTER_N == 6
++        c->sao_band_filter_c[5]        = ff_hevc_rpi_sao_band_c_24_neon_10;
++        c->sao_edge_filter_c[5]        = ff_hevc_rpi_sao_edge_c_24_neon_10;
++#endif
++    }
++
++    assert(offsetof(HEVCRpiMvField, mv) == 0); /* layout checks: presumably the NEON boundary-strength routine hard-codes these offsets - confirm */
++    assert(offsetof(HEVCRpiMvField, ref_idx) == 8); /* NOTE(review): if this is the standard assert it is compiled out under NDEBUG */
++    assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
++    c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon; /* set for every bit depth */
++    c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon; /* set for every bit depth */
++}
+diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
+new file mode 100644
+index 0000000000..93876d14c0
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
+@@ -0,0 +1,620 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++ .arch_extension mp @ enable PLDW
++
++#define BIT_DEPTH 10
++
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
++        vmax.s16  \Q0, \Q_MIN           @ clamp the signed 16-bit lanes of the four data registers from below
++        vmax.s16  \Q1, \Q_MIN
++        vmax.s16  \Q2, \Q_MIN
++        vmax.s16  \Q3, \Q_MIN
++        vmin.s16  \Q0, \Q_MAX           @ then clamp the same four registers from above
++        vmin.s16  \Q1, \Q_MAX
++        vmin.s16  \Q2, \Q_MAX
++        vmin.s16  \Q3, \Q_MAX
++.endm
++
++@ add_residual4x4(
++@  uint16_t *_dst,    [r0]
++@  int16_t *res,      [r1]
++@  ptrdiff_t stride)  [r2]
++@ Adds the 16 residuals at res to the 4x4 block at _dst and clamps each sum to 0 .. (1 << depth) - 1.
++function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1
++        add         ip, r0, r2                  @ ip = row 1 (stride is applied as a byte offset)
++        vld1.16     {q10, q11}, [r1]            @ all 16 residuals
++        lsl         r2, #1                      @ step two rows at a time: r0 walks even rows, ip odd rows
++        vld1.16     {d0}, [r0 :64], r2          @ row 0
++        vld1.16     {d1}, [ip :64], r2          @ row 1
++        vld1.16     {d2}, [r0 :64]              @ row 2
++        vld1.16     {d3}, [ip :64]              @ row 3
++        sub         r0, r2                      @ back to row 0 for the stores
++        vqadd.s16   q0,  q10                    @ rows 0-1 plus residuals, signed saturating
++        sub         ip, r2                      @ back to row 1
++        vqadd.s16   q1,  q11                    @ rows 2-3 plus residuals
++        vmov.i16    q8,  #0                     @ lower clamp bound
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ upper clamp bound
++        vmax.s16    q0,  q0,  q8
++        vmax.s16    q1,  q1,  q8
++        vmin.s16    q0,  q0,  q9
++        vmin.s16    q1,  q1,  q9
++        vst1.16     {d0}, [r0 :64], r2          @ write rows back in the order they were loaded
++        vst1.16     {d1}, [ip :64], r2
++        vst1.16     {d2}, [r0 :64]
++        vst1.16     {d3}, [ip :64]
++        bx          lr
++
++endfunc
++
++@ add_residual4x4_dc(
++@  uint16_t *_dst,    [r0]
++@  ptrdiff_t stride,  [r1]
++@  int dc)            [r2]
++@ Adds the same dc value to every sample of the 4x4 block at _dst and clamps each sum to 0 .. (1 << depth) - 1.
++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
++        add         ip, r0, r1                  @ ip = row 1 (stride is applied as a byte offset)
++        vdup.16     q15, r2                     @ low 16 bits of dc replicated into all 8 lanes
++        lsl         r1, #1                      @ step two rows at a time: r0 walks even rows, ip odd rows
++        vld1.16     {d0}, [r0 :64], r1          @ row 0
++        vld1.16     {d1}, [ip :64], r1          @ row 1
++        vld1.16     {d2}, [r0 :64]              @ row 2
++        vld1.16     {d3}, [ip :64]              @ row 3
++        sub         r0, r1                      @ back to row 0 for the stores
++        vqadd.s16   q0,  q15                    @ rows 0-1 plus dc, signed saturating
++        sub         ip, r1                      @ back to row 1
++        vqadd.s16   q1,  q15                    @ rows 2-3 plus dc
++        vmov.i16    q8,  #0                     @ lower clamp bound
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ upper clamp bound
++        vmax.s16    q0,  q0,  q8
++        vmax.s16    q1,  q1,  q8
++        vmin.s16    q0,  q0,  q9
++        vmin.s16    q1,  q1,  q9
++        vst1.16     {d0}, [r0 :64], r1          @ write rows back in the order they were loaded
++        vst1.16     {d1}, [ip :64], r1
++        vst1.16     {d2}, [r0 :64]
++        vst1.16     {d3}, [ip :64]
++        bx          lr
++
++endfunc
++
++
++@ add_residual8x8(
++@  uint16_t *_dst,    [r0]
++@  int16_t *res,      [r1]
++@  ptrdiff_t stride)  [r2]
++@ Adds 64 residuals from res to the 8x8 block at _dst, four rows per loop pass, clamping each sum to 0 .. (1 << depth) - 1.
++function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1
++        mov         r3, #8                      @ rows remaining
++        vmov.i64    q8,  #0                     @ lower clamp bound
++        add         ip, r0, r2                  @ ip = odd rows, r0 = even rows
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ upper clamp bound
++        lsl         r2, #1                      @ each pointer steps two rows at a time
++1:
++        vldm        r1!, {q10-q13}              @ residuals for the next four rows; r1 advances past them
++        vld1.16     {q0}, [r0 :128], r2         @ row n
++        vld1.16     {q1}, [ip :128], r2         @ row n+1
++        vld1.16     {q2}, [r0 :128]             @ row n+2
++        vld1.16     {q3}, [ip :128]             @ row n+3
++        sub         r0, r2                      @ back to row n for the stores
++        vqadd.s16   q0,  q10
++        sub         ip, r2                      @ back to row n+1
++        vqadd.s16   q1,  q11
++        subs        r3, #4                      @ four rows done per pass; flags are tested by the bne below
++        vqadd.s16   q2,  q12
++        vqadd.s16   q3,  q13
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst1.16     {q0}, [r0 :128], r2
++        vst1.16     {q1}, [ip :128], r2
++        vst1.16     {q2}, [r0 :128], r2         @ leaves r0 at row n+4 for the next pass
++        vst1.16     {q3}, [ip :128], r2         @ leaves ip at row n+5 for the next pass
++        bne         1b
++        bx          lr
++
++endfunc
++
++@ add_residual4x4_dc_c(
++@  uint16_t *_dst,    [r0]
++@  ptrdiff_t stride,  [r1]
++@  int dc_uv)         [r2]
++@ Sets up a 4-row count and the DC vector, then continues in the loop of the 8x8 dc function that follows.
++function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
++        mov         r3, #4                      @ row count consumed by the shared loop
++        vdup.32     q15, r2                     @ all 32 bits replicated: adjacent 16-bit lanes alternate low and high half of dc_uv
++        b           9f                          @ next local label 9, which lies inside the following function
++endfunc
++
++@ add_residual8x8_dc(
++@  uint16_t *_dst,    [r0]
++@  ptrdiff_t stride,  [r1]
++@  int dc)            [r2]
++@ Adds dc to every sample of an 8-wide block, four rows per pass; label 9 is also entered from the 4x4 chroma dc function above.
++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
++        vdup.16     q15, r2                     @ low 16 bits of dc replicated into all 8 lanes
++        mov         r3, #8                      @ rows remaining
++9:                                              @ shared entry: expects r3 = row count and q15 = value to add
++        vmov.i16    q8,  #0                     @ lower clamp bound
++        add         ip, r0, r1                  @ ip = odd rows, r0 = even rows
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ upper clamp bound
++        lsl         r1, #1                      @ each pointer steps two rows at a time
++1:
++        vld1.16     {q0}, [r0 :128], r1         @ row n
++        vld1.16     {q1}, [ip :128], r1         @ row n+1
++        vld1.16     {q2}, [r0 :128]             @ row n+2
++        vld1.16     {q3}, [ip :128]             @ row n+3
++        sub         r0, r1                      @ back to row n for the stores
++        vqadd.s16   q0,  q15
++        sub         ip, r1                      @ back to row n+1
++        vqadd.s16   q1,  q15
++        subs        r3, #4                      @ four rows done per pass; flags are tested by the bne below
++        vqadd.s16   q2,  q15
++        vqadd.s16   q3,  q15
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst1.16     {q0}, [r0 :128], r1
++        vst1.16     {q1}, [ip :128], r1
++        vst1.16     {q2}, [r0 :128], r1         @ leaves r0 at row n+4 for the next pass
++        vst1.16     {q3}, [ip :128], r1         @ leaves ip at row n+5 for the next pass
++        bne         1b
++        bx          lr
++
++endfunc
++
++@ add_residual16x16(
++@  uint16_t *_dst,    [r0]
++@  int16_t *res,      [r1]
++@  ptrdiff_t stride)  [r2]
++@ Adds 256 residuals from res to the 16x16 block at _dst, two rows per loop pass, clamping each sum to 0 .. (1 << depth) - 1.
++function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1
++        add         ip, r0, r2                  @ ip = odd rows, r0 = even rows
++        vmov.i16    q8,  #0                     @ lower clamp bound
++        lsl         r2, #1                      @ each pointer steps two rows at a time
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1  @ upper clamp bound
++        mov         r3, #16                     @ rows remaining
++1:
++        vldm        r1!, {q10-q13}              @ residuals for the next two rows; r1 advances past them
++        @ For RPI Sand we could guarantee :256 but not for general
++        @ non-RPI allocation. :128 is as good as we can claim
++        vld1.16     {q0, q1}, [r0 :128]         @ even row
++        subs        r3, #2                      @ two rows done per pass; flags are tested by the bne below
++        vld1.16     {q2, q3}, [ip :128]         @ odd row
++        vqadd.s16   q0,  q10
++        vqadd.s16   q1,  q11
++        vqadd.s16   q2,  q12
++        vqadd.s16   q3,  q13
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst1.16     {q0, q1}, [r0 :128], r2     @ write back and step to the next even row
++        vst1.16     {q2, q3}, [ip :128], r2     @ write back and step to the next odd row
++        bne         1b
++        bx          lr
++endfunc
++
++@ add_residual8x8_dc_c(
++@  uint16_t *_dst,    [r0]
++@  ptrdiff_t stride,  [r1]
++@  int dc_uv)         [r2]
++@ Entry stub: sets the row count, splats dc_uv and joins the 16x16 DC body at 9:
++function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
++        mov         r3, #8                @ r3 = rows the shared loop will process
++        vdup.32     q15, r2               @ splat the whole 32-bit dc_uv word (presumably packed U/V 16-bit halves - confirm)
++        b           9f                    @ nearest following 9: is inside the 16x16_dc function
++endfunc
++
++@ add_residual16x16_dc(
++@  uint16_t *_dst,    [r0]
++@  ptrdiff_t stride,  [r1]
++@  int dc)            [r2]
++@ Adds q15 to every sample, two 32-byte rows per pass. The 8x8_dc_c stub joins at 9:
++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
++        vdup.16     q15, r2               @ q15 = dc in all eight 16-bit lanes
++        mov         r3, #16               @ r3 = rows remaining
++9:
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         ip, r0, r1            @ ip = dst + stride, walks the odd rows
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        lsl         r1, #1                @ r1 = 2 * stride
++1:
++        @ For RPI Sand we could guarantee :256 but not for general
++        @ non-RPI allocation. :128 is as good as we can claim
++        vld1.16     {q0, q1}, [r0 :128]   @ even row
++        subs        r3, #2                @ flags are tested by bne below
++        vqadd.s16   q0,  q15
++        vqadd.s16   q1,  q15
++        vld1.16     {q2, q3}, [ip :128]   @ odd row
++        vqadd.s16   q2,  q15
++        vqadd.s16   q3,  q15
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst1.16     {q0, q1}, [r0 :128], r1
++        vst1.16     {q2, q3}, [ip :128], r1
++        bne         1b
++        bx          lr
++
++endfunc
++
++
++@ add_residual32x32(
++@  uint16_t *_dst,    [r0]
++@  int16_t *res,      [r1]
++@  ptrdiff_t stride)  [r2]
++@ One 64-byte row per pass. lr runs one row ahead of r0 and is used only by pldw.
++function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1
++        push        {lr}                  @ lr is reused below as the prefetch pointer
++        mov         r3, #32               @ r3 = rows remaining
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         lr, r0, r2            @ lr = row after the current one
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        add         ip, r0, #32           @ ip = second 32-byte half of the current row
++1:
++        vldm        r1!, {q10-q13}        @ 32 residuals
++        vldm        r0,  {q0-q3}          @ 32 dst samples, r0 is not written back
++        vqadd.s16   q0,  q10
++          pldw        [lr]
++        vqadd.s16   q1,  q11
++          add         lr, r2
++        vqadd.s16   q2,  q12
++        subs        r3, #1                @ flags are tested by bne below
++        vqadd.s16   q3,  q13
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst1.16     {q0-q1}, [r0], r2     @ first half of the row, r0 advances one row
++        vst1.16     {q2-q3}, [ip], r2     @ second half of the row
++        bne         1b
++        pop         {pc}                  @ return by loading the saved lr into pc
++
++endfunc
++
++@ add_residual16x16_dc_c(
++@  uint16_t *_dst,    [r0]
++@  ptrdiff_t stride,  [r1]
++@  int dc_uv)         [r2]
++@ Entry stub: sets the row count, splats dc_uv and joins the 32x32 DC body at 9:
++function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
++        mov         r3, #16               @ r3 = rows the shared loop will process
++        vdup.32     q15, r2               @ splat the whole 32-bit dc_uv word (presumably packed U/V 16-bit halves - confirm)
++        b           9f                    @ nearest following 9: is inside the 32x32_dc function
++endfunc
++
++@ add_residual32x32_dc(
++@  uint16_t *_dst,    [r0]
++@  ptrdiff_t stride,  [r1]
++@  int dc)            [r2]
++@ Adds q15 to every sample, one 64-byte row per pass. The 16x16_dc_c stub joins at 9:
++function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
++        vdup.16     q15, r2               @ q15 = dc in all eight 16-bit lanes
++        mov         r3, #32               @ r3 = rows remaining
++9:
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        add         ip, r0, #32           @ ip = second 32-byte half of the current row
++1:
++        vldm        r0,  {q0-q3}          @ whole row, r0 is not written back
++        vqadd.s16   q0,  q15
++        subs        r3, #1                @ flags are tested by bne below
++        vqadd.s16   q1,  q15
++        vqadd.s16   q2,  q15
++        vqadd.s16   q3,  q15
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst1.16     {q0-q1}, [r0], r1     @ first half of the row, r0 advances one row
++        vst1.16     {q2-q3}, [ip], r1     @ second half of the row
++        bne         1b
++        bx          lr
++
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc)               [r3]
++@ dst rows are interleaved pairs. Even samples get res added, odd samples get dc added.
++function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
++        vdup.16     q15, r3               @ q15 = dc in all eight lanes
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vld1.16     {q10, q11}, [r1 :256] @ all sixteen residuals
++        lsl         r2, #1                @ r2 = 2 * stride
++        vld2.16     {d0, d2}, [r0 :128], r2   @ row 0: even samples to d0, odd samples to d2
++        vld2.16     {d1, d3}, [ip :128], r2   @ row 1
++        vld2.16     {d4, d6}, [r0 :128]       @ row 2
++        vld2.16     {d5, d7}, [ip :128]       @ row 3
++        sub         r0, r2                @ rewind r0 to row 0 for the stores
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        sub         ip, r2                @ rewind ip to row 1
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++
++        vqadd.s16   q0,  q10              @ even samples of rows 0-1 += res
++        vqadd.s16   q1,  q15              @ odd samples of rows 0-1 += dc
++        vqadd.s16   q2,  q11              @ even samples of rows 2-3 += res
++        vqadd.s16   q3,  q15              @ odd samples of rows 2-3 += dc
++        clip16_4    q0, q1, q2, q3, q8, q9
++
++        vst2.16     {d0, d2}, [r0 :128], r2   @ re-interleave on store
++        vst2.16     {d1, d3}, [ip :128], r2
++        vst2.16     {d4, d6}, [r0 :128]
++        vst2.16     {d5, d7}, [ip :128]
++        bx          lr
++endfunc
++
++@ add_residual8x8_u(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc)               [r3]
++@ Two rows per pass. Even samples get res added, odd samples get dc added.
++function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
++        vdup.16     q15, r3               @ q15 = dc in all eight lanes
++        mov         r3, #8                @ r3 is free after the splat and becomes the row counter
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        lsl         r2, #1                @ r2 = 2 * stride
++1:
++        vld2.16     {q0, q1}, [r0 :256]   @ even row: even samples to q0, odd samples to q1
++        subs        r3, #2                @ flags are tested by bne below
++        vld2.16     {q2, q3}, [ip :256]   @ odd row
++        vld1.16     {q10, q11}, [r1 :256]!    @ two rows of eight residuals
++        vqadd.s16   q0,  q10
++        vqadd.s16   q1,  q15
++        vqadd.s16   q2,  q11
++        vqadd.s16   q3,  q15
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst2.16     {q0, q1}, [r0 :256], r2
++        vst2.16     {q2, q3}, [ip :256], r2
++        bne         1b
++        bx          lr
++endfunc
++
++@ add_residual16x16_u(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc)               [r3]
++@ One 64-byte row per pass. Even samples get res added, odd samples get dc added.
++function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
++        push        {lr}                  @ lr is reused below as the prefetch pointer
++        vdup.16     q15, r3               @ q15 = dc in all eight lanes
++        mov         r3, #16               @ r3 becomes the row counter
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         lr, r0, r2            @ lr = row after the current one
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        add         ip, r0, #32           @ ip = second 32-byte half of the current row
++1:
++        vld2.16     {q0, q1}, [r0 :256]   @ first half: even samples to q0, odd samples to q1
++        vld2.16     {q2, q3}, [ip :256]   @ second half
++        vld1.16     {q10, q11}, [r1 :256]!    @ sixteen residuals for this row
++        vqadd.s16   q0,  q10
++          pldw        [lr]
++        vqadd.s16   q1,  q15
++          add         lr, r2
++        vqadd.s16   q2,  q11
++        subs        r3, #1                @ flags are tested by bne below
++        vqadd.s16   q3,  q15
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst2.16     {q0, q1}, [r0 :256], r2
++        vst2.16     {q2, q3}, [ip :256], r2
++        bne         1b
++        pop         {pc}                  @ return by loading the saved lr into pc
++endfunc
++
++@ ============================================================================
++@ V add
++
++@ add_residual4x4_v(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc)               [r3]
++@ dst rows are interleaved pairs. Even samples get dc added, odd samples get res added.
++function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
++        vdup.16     q15, r3               @ q15 = dc in all eight lanes
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vld1.16     {q10, q11}, [r1 :256] @ all sixteen residuals
++        lsl         r2, #1                @ r2 = 2 * stride
++        vld2.16     {d0, d2}, [r0 :128], r2   @ row 0: even samples to d0, odd samples to d2
++        vld2.16     {d1, d3}, [ip :128], r2   @ row 1
++        vld2.16     {d4, d6}, [r0 :128]       @ row 2
++        vld2.16     {d5, d7}, [ip :128]       @ row 3
++        sub         r0, r2                @ rewind r0 to row 0 for the stores
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        sub         ip, r2                @ rewind ip to row 1
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++
++        vqadd.s16   q0,  q15              @ even samples of rows 0-1 += dc
++        vqadd.s16   q1,  q10              @ odd samples of rows 0-1 += res
++        vqadd.s16   q2,  q15              @ even samples of rows 2-3 += dc
++        vqadd.s16   q3,  q11              @ odd samples of rows 2-3 += res
++        clip16_4    q0, q1, q2, q3, q8, q9
++
++        vst2.16     {d0, d2}, [r0 :128], r2   @ re-interleave on store
++        vst2.16     {d1, d3}, [ip :128], r2
++        vst2.16     {d4, d6}, [r0 :128]
++        vst2.16     {d5, d7}, [ip :128]
++        bx          lr
++endfunc
++
++@ add_residual8x8_v(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc)               [r3]
++@ Two rows per pass. Even samples get dc added, odd samples get res added.
++function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
++        vdup.16     q15, r3               @ q15 = dc in all eight lanes
++        mov         r3, #8                @ r3 is free after the splat and becomes the row counter
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        lsl         r2, #1                @ r2 = 2 * stride
++1:
++        vld2.16     {q0, q1}, [r0 :256]   @ even row: even samples to q0, odd samples to q1
++        subs        r3, #2                @ flags are tested by bne below
++        vld2.16     {q2, q3}, [ip :256]   @ odd row
++        vld1.16     {q10, q11}, [r1 :256]!    @ two rows of eight residuals
++        vqadd.s16   q0,  q15
++        vqadd.s16   q1,  q10
++        vqadd.s16   q2,  q15
++        vqadd.s16   q3,  q11
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst2.16     {q0, q1}, [r0 :256], r2
++        vst2.16     {q2, q3}, [ip :256], r2
++        bne         1b
++        bx          lr
++endfunc
++
++@ add_residual16x16_v(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc)               [r3]
++@ One 64-byte row per pass. Even samples get dc added, odd samples get res added.
++function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
++        push        {lr}                  @ lr is reused below as the prefetch pointer
++        vdup.16     q15, r3               @ q15 = dc in all eight lanes
++        mov         r3, #16               @ r3 becomes the row counter
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         lr, r0, r2            @ lr = row after the current one
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        add         ip, r0, #32           @ ip = second 32-byte half of the current row
++1:
++        vld2.16     {q0, q1}, [r0 :256]   @ first half: even samples to q0, odd samples to q1
++        vld2.16     {q2, q3}, [ip :256]   @ second half
++        vld1.16     {q10, q11}, [r1 :256]!    @ sixteen residuals for this row
++        vqadd.s16   q0,  q15
++          pldw        [lr]
++        vqadd.s16   q1,  q10
++          add         lr, r2
++        vqadd.s16   q2,  q15
++        subs        r3, #1                @ flags are tested by bne below
++        vqadd.s16   q3,  q11
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst2.16     {q0, q1}, [r0 :256], r2
++        vst2.16     {q2, q3}, [ip :256], r2
++        bne         1b
++        pop         {pc}                  @ return by loading the saved lr into pc
++endfunc
++
++@ ============================================================================
++@ U & V add
++
++@ add_residual4x4_c(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride)     [r2]
++@ res holds 16 values for the even samples followed by 16 values for the odd samples.
++function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        lsl         r2, #1                @ r2 = 2 * stride
++        vldm        r1, {q10-q13}         @ q10-q11 = first 16 residuals, q12-q13 = second 16
++        vld2.16     {d0, d2}, [r0 :128], r2   @ row 0: even samples to d0, odd samples to d2
++        vld2.16     {d1, d3}, [ip :128], r2   @ row 1
++        vld2.16     {d4, d6}, [r0 :128]       @ row 2
++        vld2.16     {d5, d7}, [ip :128]       @ row 3
++
++        sub         r0, r2                @ rewind r0 to row 0 for the stores
++        vqadd.s16   q0,  q10              @ even samples of rows 0-1
++        sub         ip, r2                @ rewind ip to row 1
++        vqadd.s16   q1,  q12              @ odd samples of rows 0-1
++        vqadd.s16   q2,  q11              @ even samples of rows 2-3
++        vqadd.s16   q3,  q13              @ odd samples of rows 2-3
++        clip16_4    q0, q1, q2, q3, q8, q9
++
++        vst2.16     {d0, d2}, [r0 :128], r2   @ re-interleave on store
++        vst2.16     {d1, d3}, [ip :128], r2
++        vst2.16     {d4, d6}, [r0 :128]
++        vst2.16     {d5, d7}, [ip :128]
++        bx          lr
++endfunc
++
++@ add_residual8x8_c(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride)     [r2]
++@ Two rows per pass. r1 feeds the even samples, r3 (r1 + 8*8*2 bytes) feeds the odd samples.
++function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
++        push        {lr}                  @ lr is reused below as the row counter
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        lsl         r2, #1                @ r2 = 2 * stride
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         r3, r1, #(8*8*2)  @ Offset to V
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        mov         lr, #8                @ lr = rows remaining
++1:
++        vld1.16     {q10, q11}, [r1 :256]!    @ two rows of residual for the even samples
++        subs        lr, #2                @ flags are tested by bne below
++        vld2.16     {q0, q1}, [r0 :256]   @ even row: even samples to q0, odd samples to q1
++        vld2.16     {q2, q3}, [ip :256]   @ odd row
++        vld1.16     {q12, q13}, [r3 :256]!    @ two rows of residual for the odd samples
++        vqadd.s16   q0,  q10
++        vqadd.s16   q1,  q12
++        vqadd.s16   q2,  q11
++        vqadd.s16   q3,  q13
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst2.16     {q0, q1}, [r0 :256], r2
++        vst2.16     {q2, q3}, [ip :256], r2
++        bne         1b
++        pop         {pc}                  @ return by loading the saved lr into pc
++endfunc
++
++@ add_residual16x16_c(
++@   uint16_t *_dst,       [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride)     [r2]
++@ One 64-byte row per pass. r1 feeds the even samples, r3 (r1 + 16*16*2 bytes) the odd ones.
++function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
++        push        {r4, lr}              @ r4 = prefetch pointer, lr = row counter below
++        vmov.i16    q8,  #0               @ q8 = 0 and q9 below feed clip16_4 (macro defined outside this chunk)
++        add         r3,  r1, #(16*16*2)  @ Offset to V
++        vmov.i16    q9,  #(1 << BIT_DEPTH) - 1
++        add         ip, r0, #32           @ ip = second 32-byte half of the current row
++        add         r4, r0, r2            @ r4 = row after the current one
++        mov         lr, #16               @ lr = rows remaining
++1:
++        vld2.16     {q0, q1}, [r0 :256]   @ first half: even samples to q0, odd samples to q1
++        vld2.16     {q2, q3}, [ip :256]   @ second half
++        vld1.16     {q10, q11}, [r1 :256]!    @ sixteen residuals for the even samples
++        vld1.16     {q12, q13}, [r3 :256]!    @ sixteen residuals for the odd samples
++        vqadd.s16   q0,  q10
++          pldw        [r4]
++        vqadd.s16   q1,  q12
++          add         r4, r2
++        vqadd.s16   q2,  q11
++        subs        lr, #1                @ flags are tested by bne below
++        vqadd.s16   q3,  q13
++        clip16_4    q0, q1, q2, q3, q8, q9
++        vst2.16     {q0, q1}, [r0 :256], r2
++        vst2.16     {q2, q3}, [ip :256], r2
++        bne         1b
++        pop         {r4,pc}
++endfunc
++
+diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
+new file mode 100644
+index 0000000000..d9a1d7d98c
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
+@@ -0,0 +1,741 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++ .arch_extension mp @ enable PLDW
++
++@ General notes:
++@
++@ Residual is generally only guaranteed to be clipped to 16 bits.
++@ This means that we do need to do vmovl, vqadd, vqmovun
++@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away
++@ with this).
++@
++@ There is an exception for the DC case because its transform is guaranteed
++@ to be small enough that overflow cannot occur during the first add.
++
++@ ============================================================================
++@ Y add
++@ 4x4: r0 = 8-bit dst, r1 = sixteen int16 residuals, r2 = stride (as used by the code below)
++function ff_hevc_rpi_add_residual_4x4_neon_8, export=1
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vld1.16     {q0, q1}, [r1]        @ q0 = residual rows 0-1, q1 = residual rows 2-3
++        lsl         r2, #1                @ r2 = 2 * stride
++        vld1.32     d4[0], [r0], r2       @ row 0, r0 moves on to row 2
++        rsb         r3, r2, #0            @ r3 = -2 * stride, used to step the pointers back
++        vld1.32     d4[1], [ip], r2       @ row 1, ip moves on to row 3
++        vld1.32     d5[0], [r0], r3       @ row 2, r0 returns to row 0
++        vld1.32     d5[1], [ip], r3       @ row 3, ip returns to row 1
++        vmovl.u8    q8, d4                @ widen rows 0-1 to 16 bits
++        vmovl.u8    q9, d5                @ widen rows 2-3 to 16 bits
++        vqadd.s16   q0, q8
++        vqadd.s16   q1, q9
++        vqmovun.s16 d0, q0                @ saturate back to unsigned 8 bits
++        vqmovun.s16 d1, q1
++        vst1.32     d0[0], [r0], r2
++        vst1.32     d0[1], [ip], r2
++        vst1.32     d1[0], [r0]
++        vst1.32     d1[1], [ip]
++        bx          lr
++endfunc
++@ 8x8: software pipelined, loads for the next two rows overlap the stores of the previous two
++function ff_hevc_rpi_add_residual_8x8_neon_8, export=1
++        push        {r4, lr}
++        vld1.16     {q0, q1}, [r1]!       @ residuals for rows 0 and 1
++        add         ip, r0, r2            @ ip = row 1, store pointer for odd rows
++        vld1.8      {d6}, [r0]
++        add         r4, r0, r2, lsl #1    @ r4 = row 2, load pointer for even rows
++        vld1.8      {d7}, [ip]
++        add         lr, ip, r2, lsl #1    @ lr = row 3, load pointer for odd rows
++        lsl         r2, #1                @ r2 = 2 * stride
++        mov         r3, #8-2              @ the final row pair is stored after the loop
++        vmovl.u8    q2, d6
++        vmovl.u8    q3, d7
++        vqadd.s16   q2, q0
++        vqadd.s16   q3, q1
++1:
++          vld1.16     {q0, q1}, [r1]!
++        subs        r3, #2                @ flags are tested by bne below
++        vqmovun.s16 d4, q2                @ narrow the previous pair of rows
++        vqmovun.s16 d5, q3
++          vld1.8      {d6}, [r4], r2
++          vld1.8      {d7}, [lr], r2
++        vst1.8      {d4}, [r0], r2
++        vst1.8      {d5}, [ip], r2
++          vmovl.u8    q2, d6
++            pldw        [r4]
++          vmovl.u8    q3, d7
++          vqadd.s16   q2, q0
++          vqadd.s16   q3, q1
++        bne         1b
++
++          vqmovun.s16 d4, q2              @ epilogue: narrow and store the last two rows
++          vqmovun.s16 d5, q3
++          vst1.8      {d4}, [r0]
++          vst1.8      {d5}, [ip]
++          pop         {r4, pc}
++endfunc
++@ 16x16: software pipelined, one row per pass. r0 stores, ip loads one row ahead
++function ff_hevc_rpi_add_residual_16x16_neon_8, export=1
++        vld1.16     {q0, q1}, [r1]!       @ residuals for row 0
++        add         ip, r0, r2            @ ip = row 1, load pointer
++        vld1.8      {q3}, [r0]            @ row 0 of dst
++        mov         r3, #16-1             @ the final row is stored after the loop
++        vmovl.u8    q2, d6
++        vmovl.u8    q3, d7
++        vqadd.s16   q2, q0
++        vqadd.s16   q3, q1
++1:
++          vld1.16     {q0, q1}, [r1]!
++        subs        r3, #1                @ flags are tested by bne below
++        vqmovun.s16 d4, q2                @ narrow the previous row into q2
++        vqmovun.s16 d5, q3
++          vld1.8      {q3}, [ip], r2
++        vst1.8      {q2}, [r0], r2
++          vmovl.u8    q2, d6
++            pldw        [ip]
++          vmovl.u8    q3, d7
++          vqadd.s16   q2, q0
++          vqadd.s16   q3, q1
++        bne         1b
++
++          vqmovun.s16 d4, q2              @ epilogue: narrow and store the last row
++          vqmovun.s16 d5, q3
++          vst1.8      {q2}, [r0]
++          bx          lr
++endfunc
++@ 32x32: software pipelined, one 32-byte row per pass. r0 stores, ip loads one row ahead
++function ff_hevc_rpi_add_residual_32x32_neon_8, export=1
++        vldm        r1!, {q0-q3}          @ residuals for row 0
++        vld1.8      {q8, q9}, [r0]        @ row 0 of dst
++        add         ip, r0, r2            @ ip = row 1, load pointer
++        vmovl.u8    q10, d16
++        mov         r3, #32-1             @ the final row is stored after the loop
++        vmovl.u8    q11, d17
++        vmovl.u8    q12, d18
++        vmovl.u8    q13, d19
++        vqadd.s16   q10, q0
++        vqadd.s16   q11, q1
++        vqadd.s16   q12, q2
++        vqadd.s16   q13, q3
++1:
++          vldm        r1!, {q0-q3}
++        vqmovun.s16 d20, q10              @ narrow the previous row into q10-q11
++        vqmovun.s16 d21, q11
++        vqmovun.s16 d22, q12
++        vqmovun.s16 d23, q13
++          vld1.8      {q8, q9}, [ip], r2
++        subs        r3, #1                @ flags are tested by bne below
++        vst1.8      {q10, q11}, [r0], r2
++          vmovl.u8    q10, d16
++            pldw        [ip]
++          vmovl.u8    q11, d17
++          vmovl.u8    q12, d18
++          vmovl.u8    q13, d19
++          vqadd.s16   q10, q0
++          vqadd.s16   q11, q1
++          vqadd.s16   q12, q2
++          vqadd.s16   q13, q3
++        bne     1b
++
++          vqmovun.s16 d20, q10            @ epilogue: narrow and store the last row
++          vqmovun.s16 d21, q11
++          vqmovun.s16 d22, q12
++          vqmovun.s16 d23, q13
++          vst1.8      {q10, q11}, [r0]
++          bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_4x4_dc_neon_8(
++@   uint8_t * dst,              // [r0]
++@   unsigned int stride,        // [r1]
++@   int dc)                     // [r2]
++@ Adds dc to all sixteen bytes of a 4x4 block, using a widening add then a saturating narrow
++function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1
++        add         ip, r0, r1            @ ip = dst + stride, walks the odd rows
++        vdup.16     q15, r2               @ q15 = dc in all eight 16-bit lanes
++        lsl         r1, #1                @ r1 = 2 * stride
++        vld1.32     d4[0], [r0], r1       @ row 0, r0 moves on to row 2
++        rsb         r3, r1, #0            @ r3 = -2 * stride, used to step the pointers back
++        vld1.32     d4[1], [ip], r1       @ row 1, ip moves on to row 3
++        vld1.32     d5[0], [r0], r3       @ row 2, r0 returns to row 0
++        vld1.32     d5[1], [ip], r3       @ row 3, ip returns to row 1
++        vaddw.u8    q0, q15, d4           @ q0 = dc + widened rows 0-1
++        vaddw.u8    q1, q15, d5           @ q1 = dc + widened rows 2-3
++        vqmovun.s16 d0, q0                @ saturate back to unsigned 8 bits
++        vqmovun.s16 d1, q1
++        vst1.32     d0[0], [r0], r1
++        vst1.32     d0[1], [ip], r1
++        vst1.32     d1[0], [r0]
++        vst1.32     d1[1], [ip]
++        bx          lr
++endfunc
++
++@ ============================================================================
++@ DC Y or C add
++
++@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(
++@   uint8_t * dst,              // [r0]
++@   unsigned int stride,        // [r1]
++@   int dc)                     // [r2]
++@ Entry stub: branches to the first 1: label of ff_hevc_rpi_add_residual_8x8_dc_neon_8
++function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1
++        mov         r3,  #4-2             @ 4 rows, less the 2 that the shared epilogue stores
++        vdup.32     q15, r2               @ splat the whole 32-bit dc word (presumably packed U/V 16-bit halves - confirm)
++        b           1f
++endfunc
++
++@ ff_hevc_rpi_add_residual_8x8_dc_neon_8(
++@   uint8_t * dst,              // [r0]
++@   unsigned int stride,        // [r1]
++@   int dc)                     // [r2]
++@ Adds q15 to 8-byte rows, two rows per pass. The 4x4_dc_c stub joins at the first 1:
++function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1
++        vdup.16     q15, r2               @ q15 = dc in all eight 16-bit lanes
++        mov         r3, #8-2              @ the final row pair is stored after the loop
++1:      vld1.8      d16, [r0]
++        add         ip, r0, r1            @ ip = row 1, store pointer for odd rows
++        push        {r4, lr}              @ placed after the shared label so both entries save them
++        vld1.8      d17, [ip]
++        add         r4, r0, r1, lsl #1    @ r4 = row 2, load pointer for even rows
++        vaddw.u8    q0, q15, d16
++        lsl         r1, #1                @ r1 = 2 * stride
++        vaddw.u8    q1, q15, d17
++        add         lr, ip, r1            @ lr = row 3, load pointer for odd rows
++1:
++          vld1.8      {d16}, [r4], r1
++          vld1.8      {d17}, [lr], r1
++        subs        r3, #2                @ flags are tested by bne below
++        vqmovun.s16 d4, q0                @ narrow the previous pair of rows
++        vqmovun.s16 d5, q1
++          vaddw.u8    q0, q15, d16
++          vaddw.u8    q1, q15, d17
++        vst1.8      {d4}, [r0], r1
++        vst1.8      {d5}, [ip], r1
++        bne         1b
++
++          vqmovun.s16 d4, q0              @ epilogue: narrow and store the last two rows
++          vqmovun.s16 d5, q1
++          vst1.8      {d4}, [r0]
++          vst1.8      {d5}, [ip]
++          pop         {r4, pc}
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(
++@   uint8_t * dst,              // [r0]
++@   unsigned int stride,        // [r1]
++@   int dc)                     // [r2]
++@ Entry stub: branches to the first 1: label of ff_hevc_rpi_add_residual_16x16_dc_neon_8
++function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1
++        mov         r3,  #8-1             @ 8 rows, less the 1 that the shared epilogue stores
++        vdup.32     q15, r2               @ splat the whole 32-bit dc word (presumably packed U/V 16-bit halves - confirm)
++        b           1f
++endfunc
++
++@ ff_hevc_rpi_add_residual_16x16_dc_neon_8(
++@   uint8_t * dst,              // [r0]
++@   unsigned int stride,        // [r1]
++@   int dc)                     // [r2]
++@ Adds q15 to 16-byte rows, one row per pass. The 8x8_dc_c stub joins at the first 1:
++function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1
++        vdup.16     q15, r2               @ q15 = dc in all eight 16-bit lanes
++        mov         r3,  #16-1            @ the final row is stored after the loop
++1:      vld1.8      {q8}, [r0]
++        add         ip, r0, r1            @ ip = row 1, load pointer running one row ahead
++        vaddw.u8    q0, q15, d16
++        vaddw.u8    q1, q15, d17
++1:
++          vld1.8      {q8}, [ip], r1
++        subs        r3, #1                @ flags are tested by bne below
++        vqmovun.s16 d4, q0                @ narrow the previous row into q2
++        vqmovun.s16 d5, q1
++          vaddw.u8    q0, q15, d16
++          vaddw.u8    q1, q15, d17
++        vst1.8      {q2}, [r0], r1
++        bne         1b
++
++          vqmovun.s16 d4, q0              @ epilogue: narrow and store the last row
++          vqmovun.s16 d5, q1
++          vst1.8      {q2}, [r0]
++          bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(
++@   uint8_t * dst,              // [r0]
++@   unsigned int stride,        // [r1]
++@   int dc)                     // [r2]
++@ Entry stub: branches to the first 1: label of ff_hevc_rpi_add_residual_32x32_dc_neon_8
++function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1
++        mov         r3,  #16-1            @ 16 rows, less the 1 that the shared epilogue stores
++        vdup.32     q15, r2               @ splat the whole 32-bit dc word (presumably packed U/V 16-bit halves - confirm)
++        b           1f
++endfunc
++
++@ ff_hevc_rpi_add_residual_32x32_dc_neon_8(
++@   uint8_t * dst,              // [r0]
++@   unsigned int stride,        // [r1]
++@   int dc)                     // [r2]
++@ Adds q15 to 32-byte rows, one row per pass. The 16x16_dc_c stub joins at the first 1:
++function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1
++        vdup.16     q15, r2               @ q15 = dc in all eight 16-bit lanes
++        mov         r3, #32-1             @ the final row is stored after the loop
++1:      vld1.8      {q8, q9}, [r0]
++        add         ip, r0, r1            @ ip = row 1, load pointer running one row ahead
++        vaddw.u8    q0, q15, d16
++        vaddw.u8    q1, q15, d17
++        vaddw.u8    q2, q15, d18
++        vaddw.u8    q3, q15, d19
++1:
++        vqmovun.s16 d20, q0               @ narrow the previous row into q10-q11
++        vqmovun.s16 d21, q1
++        vqmovun.s16 d22, q2
++        vqmovun.s16 d23, q3
++          vld1.8      {q8, q9}, [ip], r1
++        subs        r3, #1                @ flags are tested by bne below
++          vaddw.u8    q0, q15, d16
++          vaddw.u8    q1, q15, d17
++          vaddw.u8    q2, q15, d18
++          vaddw.u8    q3, q15, d19
++        vst1.8      {q10, q11}, [r0], r1
++        bne     1b
++
++          vqmovun.s16 d20, q0             @ epilogue: narrow and store the last row
++          vqmovun.s16 d21, q1
++          vqmovun.s16 d22, q2
++          vqmovun.s16 d23, q3
++          vst1.8      {q10, q11}, [r0]
++          bx          lr
++endfunc
++
++@ ============================================================================
++@ U add
++
++@ add_residual4x4_u(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc_v)             [r3]
++@ Each row is 8 interleaved bytes. Even bytes get res added, odd bytes get dc_v added.
++function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vld1.16     {q0, q1}, [r1]        @ all sixteen residuals
++        lsl         r2, #1                @ r2 = 2 * stride
++        vld1.8      {d16}, [r0 :64], r2   @ row 0
++        vld1.8      {d17}, [ip :64], r2   @ row 1
++        vld1.8      {d18}, [r0 :64]       @ row 2
++        sub         r0, r2                @ rewind r0 to row 0 for the stores
++        vld1.8      {d19}, [ip :64]       @ row 3
++        sub         ip, r2                @ rewind ip to row 1
++        vdup.16     q2, r3                @ q2 and q3 = dc_v in every lane
++        vdup.16     q3, r3
++        vmovl.u8    q10, d16
++        vmovl.u8    q11, d17
++        vmovl.u8    q12, d18
++        vmovl.u8    q13, d19
++        vzip.16     q0, q2                @ q0 = res,dc pairs for row 0, q2 = the same for row 1
++        vzip.16     q1, q3                @ q1 = res,dc pairs for row 2, q3 = the same for row 3
++        vqadd.s16   q0,  q10
++        vqadd.s16   q2,  q11
++        vqadd.s16   q1,  q12
++        vqadd.s16   q3,  q13
++        vqmovun.s16 d0,  q0               @ saturate back to unsigned 8 bits
++        vqmovun.s16 d1,  q2
++        vqmovun.s16 d2,  q1
++        vqmovun.s16 d3,  q3
++        vst1.8      {d0}, [r0 :64], r2
++        vst1.8      {d1}, [ip :64], r2
++        vst1.8      {d2}, [r0 :64]
++        vst1.8      {d3}, [ip :64]
++        bx          lr
++endfunc
++
++@ add_residual8x8_u(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc_v)             [r3]
++@ Software pipelined, two rows per pass. Even bytes get res added, odd bytes get dc_v added.
++function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1
++        vdup.16     q15, r3               @ q15 = dc_v in all eight 16-bit lanes
++        add         ip, r0, r2            @ ip = row 1, store pointer for odd rows
++        push        {r4, lr}
++        vld2.8      {d16, d17}, [r0 :128] @ row 0: even bytes to d16, odd bytes to d17
++        lsl         r2, #1                @ r2 = 2 * stride
++        vld2.8      {d18, d19}, [ip :128] @ row 1
++        mov         r3, #8-2              @ the final row pair is stored after the loop
++        vld1.16     {q0, q1}, [r1 :256]!  @ residuals for rows 0 and 1
++        add         r4, r0, r2            @ r4 = row 2, load pointer for even rows
++        vmovl.u8    q10, d16
++        add         lr, ip, r2            @ lr = row 3, load pointer for odd rows
++        vmovl.u8    q11, d18
++        vqadd.s16   q0,  q10
++        vaddw.u8    q2,  q15, d17
++        vqadd.s16   q1,  q11
++        vaddw.u8    q3,  q15, d19
++1:
++        vqmovun.s16 d20,  q0              @ narrow the previous pair of rows
++        vqmovun.s16 d21,  q2
++          vld2.8      {d16, d17}, [r4 :128], r2
++        subs        r3, #2                @ flags are tested by bne below
++        vqmovun.s16 d22,  q1
++        vqmovun.s16 d23,  q3
++        vst2.8      {d20, d21}, [r0 :128], r2
++          vld2.8      {d18, d19}, [lr :128], r2
++        vst2.8      {d22, d23}, [ip :128], r2
++          vld1.16     {q0, q1}, [r1 :256]!
++          vmovl.u8    q10, d16
++          vmovl.u8    q11, d18
++          vqadd.s16   q0,  q10
++          vaddw.u8    q2,  q15, d17
++          vqadd.s16   q1,  q11
++          vaddw.u8    q3,  q15, d19
++        bne         1b
++
++          vqmovun.s16 d20,  q0            @ epilogue: narrow and store the last two rows
++          vqmovun.s16 d21,  q2
++          vqmovun.s16 d22,  q1
++          vqmovun.s16 d23,  q3
++          vst2.8      {d20, d21}, [r0 :128]
++          vst2.8      {d22, d23}, [ip :128]
++          pop         {r4, pc}
++endfunc
++
++@ add_residual16x16_u(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc_v)             [r3]
++@ Software pipelined, one row per pass. Even bytes get res added, odd bytes get dc_v added.
++function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1
++        vdup.16     q15, r3               @ q15 = dc_v in all eight 16-bit lanes
++        add         ip, r0, r2            @ ip = row 1, load pointer running one row ahead
++        vld2.8      {q8, q9}, [r0 :256]   @ row 0: even bytes to q8, odd bytes to q9
++        mov         r3, #16-1             @ the final row is stored after the loop
++        vld1.16     {q0, q1}, [r1 :256]!  @ residuals for row 0
++        vmovl.u8    q11, d16
++        vmovl.u8    q12, d17
++        vqadd.s16   q0,  q11
++        vaddw.u8    q11, q15, d18         @ q11 is reused: now dc_v + odd bytes, low half
++        vqadd.s16   q1,  q12
++        vaddw.u8    q12, q15, d19         @ q12 is reused: now dc_v + odd bytes, high half
++1:
++          vld2.8      {q8, q9}, [ip :256], r2
++        subs        r3, #1                @ flags are tested by bne below
++        vqmovun.s16 d20, q0               @ q10 = narrowed even bytes
++        vqmovun.s16 d22, q11              @ q11 = narrowed odd bytes
++        vqmovun.s16 d21, q1
++        vqmovun.s16 d23, q12
++          vld1.16     {q0, q1}, [r1 :256]!
++        vst2.8      {q10, q11}, [r0 :256], r2
++          vmovl.u8    q11, d16
++            pldw        [ip]
++          vmovl.u8    q12, d17
++          vqadd.s16   q0,  q11
++          vaddw.u8    q11, q15, d18
++          vqadd.s16   q1,  q12
++          vaddw.u8    q12, q15, d19
++        bne         1b
++
++          vqmovun.s16 d20, q0             @ epilogue: narrow and store the last row
++          vqmovun.s16 d22, q11
++          vqmovun.s16 d21, q1
++          vqmovun.s16 d23, q12
++          vst2.8      {q10, q11}, [r0 :256]
++          bx          lr
++endfunc
++
++@ ============================================================================
++@ V add
++
++@ add_residual4x4_v(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc_u)             [r3]  added to the even bytes (name by analogy with dc_v in the _u variant)
++function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1
++        add         ip, r0, r2            @ ip = dst + stride, walks the odd rows
++        vld1.16     {q2, q3}, [r1]        @ all sixteen residuals
++        lsl         r2, #1                @ r2 = 2 * stride
++        vld1.8      {d16}, [r0 :64], r2   @ row 0
++        vld1.8      {d17}, [ip :64], r2   @ row 1
++        vld1.8      {d18}, [r0 :64]       @ row 2
++        sub         r0, r2                @ rewind r0 to row 0 for the stores
++        vld1.8      {d19}, [ip :64]       @ row 3
++        sub         ip, r2                @ rewind ip to row 1
++        vdup.16     q0, r3                @ q0 and q1 = r3 value in every lane
++        vdup.16     q1, r3
++        vmovl.u8    q10, d16
++        vmovl.u8    q11, d17
++        vmovl.u8    q12, d18
++        vmovl.u8    q13, d19
++        vzip.16     q0, q2                @ q0 = dc,res pairs for row 0, q2 = the same for row 1
++        vzip.16     q1, q3                @ q1 = dc,res pairs for row 2, q3 = the same for row 3
++        vqadd.s16   q0,  q10
++        vqadd.s16   q2,  q11
++        vqadd.s16   q1,  q12
++        vqadd.s16   q3,  q13
++        vqmovun.s16 d0,  q0               @ saturate back to unsigned 8 bits
++        vqmovun.s16 d1,  q2
++        vqmovun.s16 d2,  q1
++        vqmovun.s16 d3,  q3
++        vst1.8      {d0}, [r0 :64], r2
++        vst1.8      {d1}, [ip :64], r2
++        vst1.8      {d2}, [r0 :64]
++        vst1.8      {d3}, [ip :64]
++        bx          lr
++endfunc
++
++@ add_residual8x8_v(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride,     [r2]
++@   int dc_u)             [r3]  added to the even bytes (name by analogy with dc_v in the _u variant)
++function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1
++        vdup.16     q15, r3               @ q15 = r3 value in all eight 16-bit lanes
++        add         ip, r0, r2            @ ip = row 1, store pointer for odd rows
++        push        {r4, lr}
++        vld2.8      {d16, d17}, [r0 :128] @ row 0: even bytes to d16, odd bytes to d17
++        lsl         r2, #1                @ r2 = 2 * stride
++        vld2.8      {d18, d19}, [ip :128] @ row 1
++        mov         r3, #8-2              @ the final row pair is stored after the loop
++        vld1.16     {q0, q1}, [r1 :256]!  @ residuals for rows 0 and 1
++        add         r4, r0, r2            @ r4 = row 2, load pointer for even rows
++        vmovl.u8    q10, d17
++        add         lr, ip, r2            @ lr = row 3, load pointer for odd rows
++        vmovl.u8    q11, d19
++        vqadd.s16   q0,  q10              @ odd bytes += res
++        vaddw.u8    q2,  q15, d16         @ even bytes += q15
++        vqadd.s16   q1,  q11
++        vaddw.u8    q3,  q15, d18
++1:
++        vqmovun.s16 d20,  q2              @ narrow the previous pair of rows
++        vqmovun.s16 d21,  q0
++          vld2.8      {d16, d17}, [r4 :128], r2
++        subs        r3, #2                @ flags are tested by bne below
++        vqmovun.s16 d22,  q3
++        vqmovun.s16 d23,  q1
++        vst2.8      {d20, d21}, [r0 :128], r2
++          vld2.8      {d18, d19}, [lr :128], r2
++        vst2.8      {d22, d23}, [ip :128], r2
++          vld1.16     {q0, q1}, [r1 :256]!
++          vmovl.u8    q10, d17
++          vmovl.u8    q11, d19
++          vqadd.s16   q0,  q10
++          vaddw.u8    q2,  q15, d16
++          vqadd.s16   q1,  q11
++          vaddw.u8    q3,  q15, d18
++        bne         1b
++
++          vqmovun.s16 d20,  q2            @ epilogue: narrow and store the last two rows
++          vqmovun.s16 d21,  q0
++          vqmovun.s16 d22,  q3
++          vqmovun.s16 d23,  q1
++          vst2.8      {d20, d21}, [r0 :128]
++          vst2.8      {d22, d23}, [ip :128]
++          pop         {r4, pc}
++endfunc
++
++@ add_residual16x16_v(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride)     [r2]
++@ Adds res[] to the odd bytes and the constant in r3 (a 4th argument, not listed above) to the even bytes of 16 rows of 16 byte pairs
++function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1
++        vdup.16     q15, r3                     @ q15 = r3 in every 16-bit lane
++        add         ip, r0, r2                  @ ip = load pointer, starts at row 1
++        vld2.8      {q8, q9}, [r0 :256]         @ row 0: q8 = even bytes, q9 = odd bytes
++        mov         r3, #16-1                   @ rows still to load
++        vld1.16     {q0, q1}, [r1 :256]!        @ 16 residuals for row 0
++        vmovl.u8    q11, d18
++        vmovl.u8    q12, d19
++        vqadd.s16   q0,  q11                    @ odd bytes + residual
++        vaddw.u8    q11, q15, d16               @ even bytes + r3 constant
++        vqadd.s16   q1,  q12
++        vaddw.u8    q12, q15, d17
++1:                                              @ store row n (r0) while loading row n+1 (ip)
++          vld2.8      {q8, q9}, [ip :256], r2
++        subs        r3, #1
++        vqmovun.s16 d20, q11                    @ q10 collects the saturated even bytes
++        vqmovun.s16 d22, q0                     @ q11 collects the saturated odd bytes
++        vqmovun.s16 d21, q12
++        vqmovun.s16 d23, q1
++          vld1.16     {q0, q1}, [r1 :256]!
++        vst2.8      {q10, q11}, [r0 :256], r2   @ re-interleave and write the row
++          vmovl.u8    q11, d18
++            pldw        [ip]                    @ prefetch, with write intent, the row loaded next pass
++          vmovl.u8    q12, d19
++          vqadd.s16   q0,  q11
++          vaddw.u8    q11, q15, d16
++          vqadd.s16   q1,  q12
++          vaddw.u8    q12, q15, d17
++        bne         1b
++
++          vqmovun.s16 d20, q11                  @ drain: convert and store the last row
++          vqmovun.s16 d22, q0
++          vqmovun.s16 d21, q12
++          vqmovun.s16 d23, q1
++          vst2.8      {q10, q11}, [r0 :256]
++          bx          lr
++endfunc
++
++@ ============================================================================
++@ U & V add
++
++@ add_residual4x4_c(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride)     [r2]
++@ Adds two 4x4 residual planes (res, then the 32 bytes after it) to the even and odd bytes of 4 rows of 4 byte pairs
++function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1
++        add         ip, r0, r2                  @ ip = dst row 1
++        vld1.16     {q0, q1}, [r1]!       @ all of U
++        lsl         r2, #1                      @ r2 = 2 * stride
++        vld1.8      {d16}, [r0 :64], r2         @ row 0, r0 moves on to row 2
++        rsb         r3, r2, #0                  @ r3 = -2 * stride, used to step back
++        vld1.8      {d17}, [ip :64], r2         @ row 1, ip moves on to row 3
++        vld1.16     {q2, q3}, [r1]        @ all of V
++        vld1.8      {d18}, [r0 :64], r3         @ row 2, r0 back at row 0
++        vld1.8      {d19}, [ip :64], r3         @ row 3, ip back at row 1
++        vmovl.u8    q10, d16
++        vmovl.u8    q11, d17
++        vmovl.u8    q12, d18
++        vmovl.u8    q13, d19
++        vzip.16     q0, q2                      @ interleave the planes: q0 = row 0, q2 = row 1
++        vzip.16     q1, q3                      @ q1 = row 2, q3 = row 3
++        vqadd.s16   q0,  q10
++        vqadd.s16   q2,  q11
++        vqadd.s16   q1,  q12
++        vqadd.s16   q3,  q13
++        vqmovun.s16 d0,  q0                     @ saturate each row to unsigned 8 bits
++        vqmovun.s16 d1,  q2
++        vqmovun.s16 d2,  q1
++        vqmovun.s16 d3,  q3
++        vst1.8      {d0}, [r0 :64], r2          @ row 0
++        vst1.8      {d1}, [ip :64], r2          @ row 1
++        vst1.8      {d2}, [r0 :64]              @ row 2
++        vst1.8      {d3}, [ip :64]              @ row 3
++        bx          lr
++endfunc
++
++@ add_residual8x8_c(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride)     [r2]
++@ Adds two 8x8 residual planes (res and res + 8*8*2 bytes) to the even and odd bytes of 8 rows of 8 byte pairs
++function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1
++        vld2.8      {d16, d17}, [r0 :128]       @ row 0: d16 = even bytes, d17 = odd bytes
++        add         r3, r1, #(8*8*2)  @ Offset to V
++        vld1.16     {q0}, [r1 :128]!            @ first-plane residuals, row 0
++        add         ip, r0, r2                  @ ip = load pointer, starts at row 1
++        vld1.16     {q1}, [r3 :128]!            @ second-plane residuals, row 0
++        vmovl.u8    q10, d16
++        push        {lr}                        @ lr is reused as the row counter
++        vmovl.u8    q8,  d17
++        mov         lr, #8-1                    @ rows still to load
++        vqadd.s16   q10, q0                     @ even bytes + first plane
++        vqadd.s16   q1,  q8                     @ odd bytes + second plane
++1:                                              @ store row n (r0) while loading row n+1 (ip)
++          vld2.8      {d16, d17}, [ip :128], r2
++        subs        lr, #1
++          vld1.16     {q0}, [r1 :128]!
++        vqmovun.s16 d20, q10                    @ saturate to unsigned 8 bits
++        vqmovun.s16 d21, q1
++          vld1.16     {q1}, [r3 :128]!
++        vst2.8      {d20, d21}, [r0 :128], r2   @ re-interleave and write the row
++          vmovl.u8    q10, d16
++            pldw        [ip]                    @ prefetch, with write intent, the row loaded next pass
++          vmovl.u8    q8,  d17
++          vqadd.s16   q10, q0
++          vqadd.s16   q1,  q8
++        bne         1b
++
++          vqmovun.s16 d20, q10                  @ drain: convert and store the last row
++          vqmovun.s16 d21, q1
++          vst2.8      {d20, d21}, [r0 :128]
++          pop         {pc}
++endfunc
++
++@ add_residual16x16_c(
++@   uint8_t *_dst,        [r0]
++@   const int16_t *res,   [r1]
++@   ptrdiff_t stride)     [r2]
++@ Adds two 16x16 residual planes (res and res + 16*16*2 bytes) to the even and odd bytes of 16 rows of 16 byte pairs
++function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1
++        vld2.8      {q8, q9}, [r0 :256]         @ row 0: q8 = even bytes, q9 = odd bytes
++        add         r3, r1, #(16*16*2)  @ Offset to V
++        vld1.16     {q0, q1}, [r1 :256]!        @ first-plane residuals, row 0
++        add         ip, r0, r2                  @ ip = load pointer, starts at row 1
++        vld1.16     {q2, q3}, [r3 :256]!        @ second-plane residuals, row 0
++        vmovl.u8    q10, d16
++        push        {lr}                        @ lr is reused as the row counter
++        vmovl.u8    q8,  d17
++        mov         lr, #16-1                   @ rows still to load
++        vmovl.u8    q11, d18
++        vmovl.u8    q9,  d19
++        vqadd.s16   q0,  q10                    @ even bytes + first plane
++        vqadd.s16   q1,  q8
++        vqadd.s16   q2,  q11                    @ odd bytes + second plane
++        vqadd.s16   q3,  q9
++1:                                              @ store row n (r0) while loading row n+1 (ip)
++          vld2.8      {q8, q9}, [ip :256], r2
++        subs        lr, #1
++        vqmovun.s16 d20, q0                     @ q10 collects the saturated even bytes
++        vqmovun.s16 d22, q2                     @ q11 collects the saturated odd bytes
++        vqmovun.s16 d21, q1
++        vqmovun.s16 d23, q3
++          vld1.16     {q0, q1}, [r1 :256]!
++        vst2.8      {d20-d23}, [r0 :256], r2    @ re-interleave and write the row
++          vld1.16     {q2, q3}, [r3 :256]!
++          vmovl.u8    q10, d16
++            pldw        [ip]                    @ prefetch, with write intent, the row loaded next pass
++          vmovl.u8    q8,  d17
++          vmovl.u8    q11, d18
++          vmovl.u8    q9,  d19
++          vqadd.s16   q0,  q10
++          vqadd.s16   q1,  q8
++          vqadd.s16   q2,  q11
++          vqadd.s16   q3,  q9
++        bne         1b
++
++          vqmovun.s16 d20, q0                   @ drain: convert and store the last row
++          vqmovun.s16 d22, q2
++          vqmovun.s16 d21, q1
++          vqmovun.s16 d23, q3
++          vst2.8      {d20-d23}, [r0 :256]
++          pop         {pc}
++endfunc
++
++@ 32x32 chroma never occurs so NIF
++
++@ ============================================================================
+diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
+new file mode 100644
+index 0000000000..b56e0f9644
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
+@@ -0,0 +1,2245 @@
++/*
++ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
++ *               2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.set EDGE_SRC_STRIDE, 160      @ NOTE(review): no use is visible in this part of the file - presumably a row pitch for the SAO edge code, confirm
++
++@ PIC jump tables are fractionally more expensive than absolute in our code
++.set jent_pic, CONFIG_PIC
++@ Band offset on the 64 bytes in q8-q11: each byte selects table entry (byte >> 3); q8, q10 look up in XLAT0 and q9, q11 in XLAT1.
++@ The signed table value is added with saturation in a Q_K128-biased domain. I1-I4 are optional instructions interleaved for scheduling. Clobbers q12, q13.
++.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4
++        vshr.u8   q12, q8, #3             @ table index = top 5 bits of each byte
++        \I1
++        vadd.i8   q8, \Q_K128             @ bias (128 in every caller here) so the signed saturation below matches the unsigned 0-255 range
++        \I2
++        vshr.u8   q13, q9, #3
++        \I3
++        vadd.i8   q9, \Q_K128
++        \I4
++        vtbl.8    d24, \XLAT0, d24        @ index -> signed offset
++        vtbl.8    d25, \XLAT0, d25
++        vtbl.8    d26, \XLAT1, d26
++        vtbl.8    d27, \XLAT1, d27
++
++        vqadd.s8  q8, q12                 @ saturating add of the offset
++        vshr.u8   q12, q10, #3            @ second half: same steps for q10, q11
++        vadd.i8   q10, \Q_K128
++        vqadd.s8  q9, q13
++        vshr.u8   q13, q11, #3
++        vadd.i8   q11, \Q_K128
++
++        vtbl.8    d24, \XLAT0, d24
++        vtbl.8    d25, \XLAT0, d25
++        vtbl.8    d26, \XLAT1, d26
++        vtbl.8    d27, \XLAT1, d27
++        vqadd.s8  q10, q12
++        vsub.i8   q8, \Q_K128             @ remove the bias again
++        vqadd.s8  q11, q13
++        vsub.i8   q9, \Q_K128
++        vsub.i8   q10, \Q_K128
++        vsub.i8   q11, \Q_K128
++.endm
++@ Pipelined row loop on the 16 bytes in q8: L1-L5 load the data and must set the N flag from the row counter, S1-S4 store the result from q13. d16 looks up in XLAT0, d17 in XLAT1. Clobbers q12, q13.
++.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4
++        \L1
++        \L2
++        \L3
++        \L4
++        \L5
++        vadd.i8   q12, q8, \Q_K128        @ biased copy of the loaded bytes
++        vshr.u8   q8, #3                  @ table index = top 5 bits of each byte
++        vtbl.8    d16, \XLAT0, d16
++        vtbl.8    d17, \XLAT1, d17
++        vqadd.s8  q12, q8                 @ saturating add of the signed offset
++        bmi       2f                      @ counter already negative: nothing more to load
++1:        \L1
++          \L2
++          \L3
++          \L4
++          \L5
++        vsub.i8   q13, q12, \Q_K128       @ unbias the previous pass result for storing
++          vadd.i8   q12, q8, \Q_K128
++          vshr.u8   q8, #3
++        \S1
++        \S2
++        \S3
++        \S4
++          vtbl.8    d16, \XLAT0, d16
++          vtbl.8    d17, \XLAT1, d17
++          vqadd.s8  q12, q8
++          bpl       1b
++2:        vsub.i8   q13, q12, \Q_K128     @ drain: unbias and store the final pass
++          \S1
++          \S2
++          \S3
++          \S4
++.endm
++
++@ Clamps every signed 16-bit lane of Q0-Q3 to the range [Q_MIN, Q_MAX]
++.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
++        vmax.s16  \Q0, \Q_MIN             @ raise anything below the lower limit
++        vmax.s16  \Q1, \Q_MIN
++        vmax.s16  \Q2, \Q_MIN
++        vmax.s16  \Q3, \Q_MIN
++        vmin.s16  \Q0, \Q_MAX             @ lower anything above the upper limit
++        vmin.s16  \Q1, \Q_MAX
++        vmin.s16  \Q2, \Q_MAX
++        vmin.s16  \Q3, \Q_MAX
++.endm
++@ Band offset on 32 16-bit samples in Q0-Q3: index = sample >> (bit_depth - 5); Q0, Q2 look up in XLAT0 and Q1, Q3 in XLAT1; results are clamped to [Q_MIN, Q_MAX]. I1, I2 are optional interleaved instructions.
++@ Clobbers q12, q13
++.macro sao_band_64b_16  Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2
++        vshrn.i16 d24, \Q0, #(\bit_depth - 5)   @ narrow to a byte holding the top 5 bits = table index
++        vshrn.i16 d25, \Q1, #(\bit_depth - 5)
++        vshrn.i16 d26, \Q2, #(\bit_depth - 5)
++        \I1
++        vtbl.8    d24, \XLAT0, d24              @ index -> signed 8-bit offset
++        vshrn.i16 d27, \Q3, #(\bit_depth - 5)
++        vtbl.8    d25, \XLAT1, d25
++        \I2
++        vtbl.8    d26, \XLAT0, d26
++        vtbl.8    d27, \XLAT1, d27
++        vaddw.s8  \Q0, d24                      @ sign-extend the offset and add it to the sample
++        vaddw.s8  \Q1, d25
++        vaddw.s8  \Q2, d26
++        vaddw.s8  \Q3, d27
++        clip16_4   \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
++.endm
++@ Pipelined row loop on 16 16-bit samples in Q0, Q1: L1-L5 load the data and must set the N flag from the row counter, S1-S4 store the clamped result from q10, q11. Q0 looks up in XLAT0, Q1 in XLAT1.
++@ Clobbers q10, q11, q12
++.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4
++        \L1
++        \L2
++        \L3
++        \L4
++        \L5
++        vshrn.i16 d24, \Q0, #\bit_depth - 5     @ table index = top 5 bits of each sample
++        vshrn.i16 d25, \Q1, #\bit_depth - 5
++        vtbl.8    d24, \XLAT0, d24
++        vtbl.8    d25, \XLAT1, d25
++        vaddw.s8  q10, \Q0, d24                 @ sample + sign-extended offset, not yet clamped
++        vaddw.s8  q11, \Q1, d25
++        bmi       2f                            @ counter already negative: nothing more to load
++1:        \L1
++          \L2
++          \L3
++          \L4
++          \L5
++        vmax.s16  q10, \Q_MIN                   @ clamp the previous pass result
++        vmax.s16  q11, \Q_MIN
++          vshrn.i16 d24, \Q0, #\bit_depth - 5
++          vshrn.i16 d25, \Q1, #\bit_depth - 5
++        vmin.s16  q10, \Q_MAX
++        vmin.s16  q11, \Q_MAX
++        \S1
++        \S2
++        \S3
++        \S4
++          vtbl.8    d24, \XLAT0, d24
++          vtbl.8    d25, \XLAT1, d25
++          vaddw.s8  q10, \Q0, d24
++          vaddw.s8  q11, \Q1, d25
++          bpl       1b
++2:        vmax.s16  q10, \Q_MIN                 @ drain: clamp and store the final pass
++          vmax.s16  q11, \Q_MIN
++          vmin.s16  q10, \Q_MAX
++          vmin.s16  q11, \Q_MAX
++          \S1
++          \S2
++          \S3
++          \S4
++.endm
++
++
++@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
++@ so we are quite safe stuffing it into a byte array
++@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
++@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
++@ precision
++
++@ This, somewhat nasty, bit of code builds the {d0-d3} translation
++@ array via the stack
++@ Given that sao_left_class > 28 can cause wrap we can't just poke
++@ all 4 bytes in at once
++@
++@ It also loads other common regs
++@ In: r1 = src, r3 = src stride, caller args 16 bytes above sp. Out: d0-d3 = 32-byte offset table, ip = height - 1, r4 = r1 + r3. Clobbers q8 and d4 (d4 keeps the wrapped bytes, so it is non-zero when the class wraps). Flags are not touched.
++@ Beware that the offset read here overreads by 6 bytes so source must be sized appropriately
++function band_load_y
++        ldr       ip, [sp, #16]         @ &sao_offset_val[0]
++        ldr       r4, [sp, #20]         @ sao_left_class
++        vmov.i64  d4, #0
++        vmov.i64  q0, #0
++        pld       [r1]
++        vld2.8    {q8}, [ip]            @ splits 16 bytes into even (d16) and odd (d17) bytes - the low bytes of the int16 offsets, assuming little-endian data
++        sub       ip, sp, #8*5          @ ip = where the 40-byte scratch array will start after the vpush
++        vmov.i64  q1, #0
++        add       r4, ip, r4            @ r4 = &scratch[sao_left_class]
++        vpush     {d0-d4}               @ Put zero array on stack
++        vshr.u64  d16, d16, #8          @ 1st interesting val is [1]
++        ldr       ip, [ip, #8*5 + 28]   @ height
++        vst1.32   {d16[0]}, [r4]        @ drop 4 offset bytes into the zeroed array
++        add       r4, r1, r3            @ r4 = src + stride, used by callers as a prefetch pointer
++        vpop      {d0-d4}               @ Pop modified array
++        sub       ip, ip, #1            @ ip = height - 1, so callers can loop until it goes negative
++        vorr      d0, d0, d4            @ fold scratch bytes 32-39 back onto 0-7 (the index wraps at 32)
++        bx        lr
++endfunc
++@ Two-table variant. In: r1 = src, r3 = src stride, caller args 16 bytes above sp. Out: d0-d3 = table 1, d4-d7 = table 2, ip = height - 1, r4 = r1 + r3. Clobbers q8-q11 and d24. Flags are not touched.
++@ Beware that offset reads here overread by 6 bytes so source must be sized appropriately
++function band_load_c
++        ldr       ip, [sp, #16]         @ &sao_offset_val1[0]
++        ldr       r4, [sp, #20]         @ sao_left_class1
++        vmov.i64  d24, #0
++        vmov.i64  q10, #0
++        pld       [r1]
++        vld2.8    {q8}, [ip]            @ d16 = even bytes of the 16 bytes read (low bytes of the int16 offsets, assuming little-endian data)
++        sub       ip, sp, #8*5          @ ip = where the 40-byte scratch array will start after the vpush
++        vmov.i64  q11, #0
++        add       r4, ip, r4            @ r4 = &scratch[sao_left_class1]
++        ldr       ip, [sp, #24]         @ &sao_offset_val2[0]
++        vpush     {d20-d24}             @ Put zero array on stack
++        vld2.8    {q9}, [ip]            @ d18 = even bytes of the second offset array
++        vshr.u64  d16, d16, #8          @ 1st interesting val is [1]
++        ldr       ip, [sp, #8*5 + 28]   @ sao_left_class2
++        vst1.32   {d16[0]}, [r4]        @ drop 4 offset bytes into the zeroed array
++        add       ip, sp, ip            @ ip = &scratch[sao_left_class2]
++        vshr.u64  d18, d18, #8          @ 1st interesting val is [1]
++        vldmia    sp, {d0-d3}           @ Load modified array
++        vldr      d16, [sp, #8*4]       @ d16 = scratch bytes 32-39 (the wrapped part) of table 1
++        add       r4, r1, r3            @ r4 = src + stride, used by callers as a prefetch pointer
++        vstmia    sp, {d20-d24}         @ Put zero array on stack (again)
++        vst1.32   {d18[0]}, [ip]
++        vorr      d0, d0, d16           @ fold the wrapped bytes of table 1 back onto entries 0-7
++        vldmia    sp, {d4-d7}           @ Load modified array
++        vldr      d18, [sp, #8*4]       @ d18 = scratch bytes 32-39 (the wrapped part) of table 2
++        ldr       ip, [sp, #8*5 + 36]   @ height
++        add       sp, sp, #8*5          @ release the scratch array
++        vorr      d4, d4, d18           @ fold the wrapped bytes of table 2 back onto entries 0-7
++        sub       ip, ip, #1            @ ip = height - 1, so callers can loop until it goes negative
++        bx        lr
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_64_neon_8 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ 8-bit SAO band filter: 64 bytes per row, one row per pass
++function ff_hevc_rpi_sao_band_64_neon_8, export=1
++        push      {r4-r6, lr}
++        vmov.u8   q15, #128             @ bias constant for the saturating add
++        bl        band_load_y           @ d0-d3 = offset table, ip = height - 1, r4 = src + stride
++
++1:      vldmia    r1, {q8-q11}
++        sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \
++            "pld       [r4]",                 \
++            "subs      ip, #1",               \
++            "it ne; addne r4, r3",            \
++            "add       r1, r3"
++        vstmia    r0, {q8-q11}
++        add       r0, r2
++        bpl       1b                    @ N comes from the subs passed into the macro
++
++        pop       {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_32_neon_8 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ 8-bit SAO band filter: 32 bytes per row, two rows per pass
++function ff_hevc_rpi_sao_band_32_neon_8, export=1
++        push      {r4-r6, lr}
++        add       r5, r0, r2            @ r5, r6 = dst and src pointers for the odd rows
++        add       r6, r1, r3
++        lsl       r2, #1                @ strides doubled: each pointer skips a row
++        lsl       r3, #1
++        vmov.u8   q15, #128             @ bias constant for the saturating add
++        bl        band_load_y           @ d0-d3 = offset table, ip = height - 1
++
++1:      vld1.8    { q8, q9 }, [r1, :128], r3
++        subs      ip, #2
++        vld1.8    {q10, q11}, [r6, :128], r3
++
++        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
++
++        vst1.8    { q8, q9 }, [r0, :128], r2
++        vst1.8    {q10, q11}, [r5, :128], r2
++        bpl       1b                    @ N still holds the result of the subs above
++
++        pop       {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_16_neon_8 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ 8-bit SAO band filter: 16 bytes per row, four rows per pass
++function ff_hevc_rpi_sao_band_16_neon_8, export=1
++        push      {r4-r6, lr}
++        add       r5, r0, r2            @ r5, r6 = dst and src pointers for the odd rows
++        add       r6, r1, r3
++        lsl       r2, #1                @ strides doubled: each pointer skips a row
++        lsl       r3, #1
++        vmov.u8   q15, #128             @ bias constant for the saturating add
++        bl        band_load_y           @ d0-d3 = offset table, ip = height - 1
++
++1:      vld1.8    { q8}, [r1, :128], r3
++        subs      ip, #4
++        vld1.8    { q9}, [r6, :128], r3
++        vld1.8    {q10}, [r1, :128], r3
++        vld1.8    {q11}, [r6, :128], r3
++
++        sao_band_64b_8 {d0-d3}, {d0-d3}, q15
++
++        vst1.8    { q8}, [r0, :128], r2
++        vst1.8    { q9}, [r5, :128], r2
++        vst1.8    {q10}, [r0, :128], r2
++        vst1.8    {q11}, [r5, :128], r2
++        bpl       1b                    @ N still holds the result of the subs above
++
++        pop       {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_8_neon_8 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ 8-bit SAO band filter for narrow blocks: width >= 8 does 8 bytes per row, two rows per pass; otherwise 4 bytes per row, four rows per pass
++function ff_hevc_rpi_sao_band_8_neon_8, export=1
++        ldr       ip, [sp, #8]          @ width
++        push      {r4-r6, lr}
++        vmov.u8   q15, #128             @ bias constant for the saturating add
++        cmp       ip, #8                @ result is consumed by the blt below; nothing in between sets flags
++        bl        band_load_y           @ d0-d3 = offset table, ip = height - 1
++        add       r5, r0, r2            @ r5, r6 = dst and src pointers for the odd rows
++        add       r6, r1, r3
++        lsl       r2, #1                @ strides doubled: each pointer skips a row
++        lsl       r3, #1
++        blt       4f
++
++        sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
++            "vld1.8    {d16}, [r1, :64], r3", \
++            "subs      ip, #2",               \
++            "vld1.8    {d17}, [r6, :64], r3", \
++            "",                               \
++            "",                               \
++            "vst1.8 {d26}, [r0, :64], r2",    \
++            "vst1.8 {d27}, [r5, :64], r2"
++        pop       {r4-r6, pc}
++4:                                      @ width < 8: rows are gathered 32 bits at a time into d16, d17
++        sao_band_16b_8 {d0-d3}, {d0-d3}, q15,    \
++            "vld1.32   {d16[0]}, [r1, :32], r3", \
++            "subs      ip, #4",                  \
++            "vld1.32   {d16[1]}, [r6, :32], r3", \
++            "vld1.32   {d17[0]}, [r1, :32], r3", \
++            "vld1.32   {d17[1]}, [r6, :32], r3", \
++            "vst1.32   {d26[0]}, [r0, :32], r2", \
++            "vst1.32   {d26[1]}, [r5, :32], r2", \
++            "vst1.32   {d27[0]}, [r0, :32], r2", \
++            "vst1.32   {d27[1]}, [r5, :32], r2"
++        pop       {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_32_neon_8(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++@ 8-bit two-table SAO band filter on interleaved byte pairs: 64 bytes per row, one row per pass; even bytes use table 1, odd bytes table 2
++function ff_hevc_rpi_sao_band_c_32_neon_8, export=1
++        push      {r4-r6, lr}
++        add       r5, r0, #32           @ r5, r6 = second 32-byte half of each dst and src row
++        add       r6, r1, #32
++        vmov.u8   q15, #128             @ bias constant for the saturating add
++        bl        band_load_c           @ d0-d3 = table 1, d4-d7 = table 2, ip = height - 1, r4 = src + stride
++
++1:      vld2.8    { q8, q9 }, [r1, :128], r3
++        subs      ip, #1
++        vld2.8    {q10, q11}, [r6, :128], r3
++
++        sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
++            "pld       [r4]",                 \
++            "it ne; addne r4, r3"
++
++        vst2.8    { q8, q9 }, [r0, :128], r2
++        vst2.8    {q10, q11}, [r5, :128], r2
++        bpl       1b                    @ N still holds the result of the subs above
++
++        pop     {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_16_neon_8(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++@ 8-bit two-table SAO band filter on interleaved byte pairs: 32 bytes per row, two rows per pass
++function ff_hevc_rpi_sao_band_c_16_neon_8, export=1
++        push      {r4-r6, lr}
++        add       r5, r0, r2            @ r5, r6 = dst and src pointers for the odd rows
++        add       r6, r1, r3
++        lsl       r2, #1                @ strides doubled: each pointer skips a row
++        lsl       r3, #1
++        vmov.u8   q15, #128             @ bias constant for the saturating add
++        bl        band_load_c           @ d0-d3 = table 1, d4-d7 = table 2, ip = height - 1
++
++1:      vld2.8    { q8, q9 }, [r1, :128], r3
++        subs      ip, #2
++        vld2.8    {q10, q11}, [r6, :128], r3
++
++        sao_band_64b_8 {d0-d3}, {d4-d7}, q15
++
++        vst2.8    { q8, q9 }, [r0, :128], r2
++        vst2.8    {q10, q11}, [r5, :128], r2
++        bpl       1b                    @ N still holds the result of the subs above
++
++        pop     {r4-r6, pc}
++endfunc
++
++@ ff_hevc_rpi_sao_band_c_8_neon_8(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++@ 8-bit two-table SAO band filter for narrow blocks: width >= 8 does 16 bytes per row, one row per pass; otherwise 8 bytes per row, two rows per pass
++function ff_hevc_rpi_sao_band_c_8_neon_8, export=1
++        ldr       ip, [sp, #16]         @ width
++        push      {r4-r6, lr}
++        vmov.u8   q15, #128             @ bias constant for the saturating add
++        cmp       ip, #8                @ result is consumed by the blt below; band_load_c leaves flags alone
++        bl        band_load_c           @ d0-d3 = table 1, d4-d7 = table 2, ip = height - 1
++        blt       4f
++
++        sao_band_16b_8 {d0-d3}, {d4-d7}, q15,      \
++            "vld2.8    {d16-d17}, [r1, :128], r3", \
++            "subs      ip, #1",                    \
++            "",                                    \
++            "",                                    \
++            "",                                    \
++            "vst2.8    {d26-d27}, [r0, :128], r2"
++        pop       {r4-r6, pc}
++4:                                      @ width < 8: two rows per pass, split and re-merged with vuzp and vzip
++        add       r5, r0, r2            @ r5, r6 = dst and src pointers for the odd rows
++        add       r6, r1, r3
++        lsl       r2, #1                @ strides doubled: each pointer skips a row
++        lsl       r3, #1
++        sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
++            "vld1.8    {d16}, [r1, :64], r3", \
++            "subs      ip, #2",               \
++            "vld1.8    {d17}, [r6, :64], r3", \
++            "vuzp.8    d16, d17",             \
++            "",                               \
++            "vzip.8    d26, d27",             \
++            "vst1.8    {d26}, [r0, :64], r2", \
++            "vst1.8    {d27}, [r5, :64], r2"
++        pop       {r4-r6, pc}
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_64_neon_10 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ SAO band filter on 16-bit samples: 128 bytes per row, one row per pass, results clamped to [0, (1 << bit_depth) - 1]
++.macro band_64_16 bit_depth
++        push      {r4-r6, lr}
++        vmov.i16  q3, #(1 << \bit_depth) - 1    @ q3 = upper clip limit
++        bl        band_load_y                   @ d0-d3 = offset table, ip = height - 1
++        vmov.i64  q2, #0                        @ q2 = lower clip limit; zeroed after the call because band_load_y leaves the wrapped table bytes in d4
++        vpush     {q4-q7}                       @ q4-q7 are callee-saved and used as data registers below
++
++1:      vldm      r1, {q4-q11}
++        sao_band_64b_16 q4,  q5,  q6,  q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
++            "subs      ip, #1",                                                  \
++            "add       r1, r3"
++        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
++        vstm      r0, {q4-q11}
++        add       r0, r2
++        bpl       1b                            @ N comes from the subs passed into the first macro call
++
++        vpop      {q4-q7}
++        pop       {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_64_neon_10, export=1
++        band_64_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_32_neon_10 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ SAO band filter on 16-bit samples: 64 bytes per row, one row per pass, results clamped to [0, (1 << bit_depth) - 1]
++.macro band_32_16 bit_depth
++        push      {r4-r6, lr}
++        vmov.i16  q3, #(1 << \bit_depth) - 1    @ q3 = upper clip limit
++        bl        band_load_y                   @ d0-d3 = offset table, ip = height - 1
++        vmov.i64  q2, #0                        @ q2 = lower clip limit; zeroed after the call because band_load_y leaves the wrapped table bytes in d4
++
++1:      vldm      r1, {q8-q11}
++        sao_band_64b_16 q8,  q9,  q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
++            "subs      ip, #1",                                                   \
++            "add       r1, r3"
++        vstm      r0, {q8-q11}
++        add       r0, r2
++        bpl       1b                            @ N comes from the subs passed into the macro
++
++        pop       {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_32_neon_10, export=1
++        band_32_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_16_neon_10 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ SAO band filter on 16-bit samples: 32 bytes per row, two rows per pass, results clamped to [0, (1 << bit_depth) - 1]
++.macro band_16_16 bit_depth
++        push      {r4-r6, lr}
++        add       r5, r0, r2                    @ r5, r6 = dst and src pointers for the odd rows
++        add       r6, r1, r3
++        lsl       r2, #1                        @ strides doubled: each pointer skips a row
++        lsl       r3, #1
++        vmov.i64  q14, #0                       @ q14 = lower clip limit
++        vmov.i16  q15, #(1 << \bit_depth) - 1   @ q15 = upper clip limit
++        bl        band_load_y                   @ d0-d3 = offset table, ip = height - 1
++
++1:      vld1.16   { q8, q9 }, [r1, :128], r3
++        subs      r12, #2                       @ r12 is ip, the row counter
++        vld1.16   {q10, q11}, [r6, :128], r3
++        sao_band_64b_16 q8,  q9,  q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
++        vst1.16   { q8, q9 }, [r0, :128], r2
++        vst1.16   {q10, q11}, [r5, :128], r2
++        bpl       1b                            @ N still holds the result of the subs above
++
++        pop       {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_16_neon_10, export=1
++        band_16_16 10
++endfunc
++
++@ ff_hevc_rpi_sao_band_8_neon_10 (
++@   uint8_t *_dst,              [r0]
++@   uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,       [r2]
++@   ptrdiff_t stride_src,       [r3]
++@   int16_t *sao_offset_val,    [sp, #0]
++@   int sao_left_class,         [sp, #4]
++@   int width,                  [sp, #8]
++@   int height)                 [sp, #12]
++@ SAO band filter on 16-bit samples for narrow blocks: width >= 8 does 16 bytes per row, two rows per pass; otherwise 8 bytes per row, four rows per pass
++.macro band_8_16 bit_depth
++        ldr       ip, [sp, #8]          @ width
++        push      {r4-r6, lr}
++        vmov.i64  q14, #0                       @ q14 = lower clip limit
++        cmp       ip, #8                        @ result is consumed by the blt below; nothing in between sets flags
++        vmov.i16  q15, #(1 << \bit_depth) - 1   @ q15 = upper clip limit
++        bl        band_load_y                   @ d0-d3 = offset table, ip = height - 1
++        add       r5, r0, r2                    @ r5, r6 = dst and src pointers for the odd rows
++        add       r6, r1, r3
++        lsl       r2, #1                        @ strides doubled: each pointer skips a row
++        lsl       r3, #1
++        blt       4f
++
++        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
++            "vld1.16   {q8}, [r1, :128], r3",                           \
++            "subs      ip, #2",                                         \
++            "vld1.16   {q9}, [r6, :128], r3",                           \
++            "",                                                         \
++            "",                                                         \
++            "vst1.16   {q10}, [r0, :128], r2",                          \
++            "vst1.16   {q11}, [r5, :128], r2"
++        pop       {r4-r6, pc}
++4:                                              @ width < 8: four 8-byte rows are packed into q8, q9
++        sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
++            "vld1.16   {d16}, [r1, :64], r3",                           \
++            "subs      ip, #4",                                         \
++            "vld1.16   {d17}, [r6, :64], r3",                           \
++            "vld1.16   {d18}, [r1, :64], r3",                           \
++            "vld1.16   {d19}, [r6, :64], r3",                           \
++            "vst1.16   {d20}, [r0, :64], r2",                           \
++            "vst1.16   {d21}, [r5, :64], r2",                           \
++            "vst1.16   {d22}, [r0, :64], r2",                           \
++            "vst1.16   {d23}, [r5, :64], r2"
++        pop       {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_8_neon_10, export=1
++        band_8_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_32_neon_10(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++@ Two-table SAO band filter on interleaved 16-bit sample pairs: 128 bytes per row, one row per pass; even samples use table 1, odd samples table 2
++.macro band_c_32_16 bit_depth
++        push      {r4-r6, lr}
++        add       r5, r0, #32                   @ r5, r6 = pointers 32 bytes into each dst and src row
++        add       r6, r1, #32
++        sub       r2, #64                       @ strides minus the 64 bytes already stepped within a row
++        sub       r3, #64                       @ NOTE(review): band_load_c derives the prefetch pointer r4 from this reduced r3 - confirm that is intended
++        vmov.i64  q14, #0                       @ q14 = lower clip limit
++        vmov.i16  q15, #(1 << \bit_depth) - 1   @ q15 = upper clip limit
++        bl        band_load_c                   @ d0-d3 = table 1, d4-d7 = table 2, ip = height - 1
++        mov       lr, #64                       @ lr = step between the two 64-byte halves of a row
++        vpush     {q4-q7}                       @ q4-q7 are callee-saved and used as data registers below
++
++1:      vld2.16   { q4, q5 }, [r1, :128], lr
++        subs      ip, #1
++        vld2.16   { q6, q7 }, [r6, :128], lr
++        vld2.16   { q8, q9 }, [r1, :128], r3
++        vld2.16   {q10, q11}, [r6, :128], r3
++
++        sao_band_64b_16 q4,  q5,  q6,  q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++            "pld       [r4]",                                                      \
++            "it ne; addne r4, r3"
++        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++        vst2.16   { q4, q5 }, [r0, :128], lr
++        vst2.16   { q6, q7 }, [r5, :128], lr
++        vst2.16   { q8, q9 }, [r0, :128], r2
++        vst2.16   {q10, q11}, [r5, :128], r2
++
++        bpl       1b                            @ N still holds the result of the subs above
++
++        vpop      {q4-q7}
++        pop       {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
++        band_c_32_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_16_neon_10(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++@ Two-table SAO band filter on interleaved 16-bit sample pairs: 64 bytes per row, one row per pass; even samples use table 1, odd samples table 2
++.macro band_c_16_16 bit_depth
++        push      {r4-r6, lr}
++        add       r5, r0, #32                   @ r5, r6 = second 32-byte half of each dst and src row
++        add       r6, r1, #32
++        vmov.i64  q14, #0                       @ q14 = lower clip limit
++        vmov.i16  q15, #(1 << \bit_depth) - 1   @ q15 = upper clip limit
++        bl        band_load_c                   @ d0-d3 = table 1, d4-d7 = table 2, ip = height - 1
++
++1:      vld2.16   { q8, q9 }, [r1, :128], r3
++        subs      ip, #1
++        vld2.16   {q10, q11}, [r6, :128], r3
++
++        @ Only q8-q11 carry data in this width; q4-q7 are never loaded and are callee-saved, so they must be left alone
++        sao_band_64b_16 q8,  q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
++
++        vst2.16   { q8, q9 }, [r0, :128], r2
++        vst2.16   {q10, q11}, [r5, :128], r2
++
++        bpl       1b                            @ N still holds the result of the subs above
++        pop       {r4-r6, pc}
++.endm
++
++function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
++        band_c_16_16 10
++endfunc
++
++
++@ ff_hevc_rpi_sao_band_c_8_neon_10(
++@   uint8_t * dst          [r0]
++@   uint8_t * src          [r1]
++@   uint32_t dst_stride    [r2]
++@   uint32_t src_stride    [r3]
++@   const int16_t * table1 sp[0]
++@   uint32_t offset1       sp[4]
++@   const int16_t * table2 sp[8]
++@   uint32_t offset2       sp[12]
++@   int width              sp[16]
++@   int height             sp[20]
++@ Body of the function documented above, parameterised on bit_depth (16-bit loads and stores)
++.macro band_c_8_16 bit_depth
++        ldr       ip, [sp, #16]         @ width
++        push      {r4-r6, lr}
++        vmov.i64  q14, #0               @ q14 = 0, handed to sao_band_32b_16 below
++        cmp       ip, #8                @ flags are consumed by the blt after the call
++        vmov.i16  q15, #(1 << \bit_depth) - 1   @ q15 = largest sample value for this bit depth
++        bl        band_load_c           @ NOTE(review): must leave the cmp flags intact - confirm
++        blt       4f                    @ width < 8 -> path that works on two rows at once
++
++        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++            "vld2.16   {q8,q9}, [r1, :128], r3",                        \
++            "subs      ip, #1",                                         \
++            "",                                                         \
++            "",                                                         \
++            "",                                                         \
++            "vst2.16   {q10,q11}, [r0, :128], r2"
++        pop       {r4-r6, pc}           @ restore and return
++4:
++        add       r5, r0, r2            @ r5 = dst + dst_stride (second row)
++        add       r6, r1, r3            @ r6 = src + src_stride (second row)
++        lsl       r2, #1                @ both strides doubled: each pointer now skips a row
++        lsl       r3, #1
++        sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
++            "vld2.16   {d16,d18}, [r1, :128], r3",                      \
++            "subs      ip, #2",                                         \
++            "vld2.16   {d17,d19}, [r6, :128], r3",                      \
++            "",                                                         \
++            "",                                                         \
++            "vst2.16   {d20,d22}, [r0, :128], r2",                      \
++            "vst2.16   {d21,d23}, [r5, :128], r2"
++        pop       {r4-r6, pc}           @ restore and return
++.endm
++@ Exported 10-bit entry point: the whole body is band_c_8_16 expanded with bit_depth = 10
++function ff_hevc_rpi_sao_band_c_8_neon_10, export=1
++        band_c_8_16 10
++endfunc
++
++
++@ =============================================================================
++@ SAO EDGE
++
++@ r0    destination address
++@ r2    stride to post-increment r0 with
++@ [r5]  translate values
++@
++@ a <- c <- b
++@ a in q0 - q3
++@ c in q4 - q7
++@ b in q8 - q11
++@
++@ q12-15 used as temp
++@
++@ Can be used for both Y & C as we unzip/zip the deltas and
++@ transform "u/v" separately via d26/d27.  For Y d26=d27
++
++function edge_64b_body_8
++@ q0-q3 <- clip_u8(c + xlat[2 + sign(c-a) + sign(c-b)]) for 64 bytes; no core register or flag is written
++        vcgt.u8 q12,  q4,  q0   @ c > a -> -1 , otherwise 0
++        vcgt.u8 q13,  q5,  q1
++        vcgt.u8 q14,  q6,  q2
++        vcgt.u8 q15,  q7,  q3
++
++        vcgt.u8  q0,  q4        @ a > c -> -1 , otherwise 0
++        vcgt.u8  q1,  q5
++        vcgt.u8  q2,  q6
++        vcgt.u8  q3,  q7
++
++        vsub.s8  q0,  q12       @ a = sign(c-a)
++        vsub.s8  q1,  q13
++        vsub.s8  q2,  q14
++        vsub.s8  q3,  q15
++
++        vcgt.u8  q12, q4,  q8   @ c > b -> -1 , otherwise 0
++        vcgt.u8  q13, q5,  q9
++        vcgt.u8  q14, q6,  q10
++        vcgt.u8  q15, q7,  q11
++
++        vsub.s8  q0,  q12       @ subtracting -1 adds 1 where c > b
++        vsub.s8  q1,  q13
++        vsub.s8  q2,  q14
++        vsub.s8  q3,  q15
++
++        vcgt.u8  q12, q8,  q4   @ c < b -> -1 , otherwise 0
++        vcgt.u8  q13, q9,  q5
++        vcgt.u8  q14, q10, q6
++        vcgt.u8  q15, q11, q7
++
++        vadd.s8  q0,  q12       @ a = sign(c-a) + sign(c-b)
++        vadd.s8  q1,  q13
++        vmov.u8  q12, #2        @ bias that turns -2..2 into table index 0..4
++        vadd.s8  q2,  q14
++        vadd.s8  q3,  q15
++
++        vadd.s8  q0,  q12
++        vadd.s8  q1,  q12
++
++        vld1.8   {d26, d27}, [r5]       @ xlat tables: d26 for even bytes, d27 for odd bytes
++
++        vadd.s8  q2,  q12
++        vuzp.8   q0,  q1        @ q0 = even-byte indices, q1 = odd-byte indices
++        vmov.u8  q15, #128
++        vadd.s8  q3,  q12       @ a = 2 + sign(c-a) + sign(c-b)
++
++        vtbl.8   d0,  {d26}, d0 @ index -> offset
++        vadd.s8  q12, q4, q15   @ Add -128 so we can use saturating signed add
++
++        vtbl.8   d1,  {d26}, d1
++        vadd.s8  q14, q5, q15
++
++        vtbl.8   d2,  {d27}, d2
++        vuzp.8   q2,  q3        @ same even / odd split for the second 32 bytes
++
++        vtbl.8   d3,  {d27}, d3
++
++        vtbl.8   d4,  {d26}, d4
++        vzip.8   q0,  q1        @ offsets back in original byte order
++
++        vtbl.8   d5,  {d26}, d5
++        vqadd.s8 q0,  q12       @ (c - 128) + offset with signed saturation
++        vqadd.s8 q1,  q14
++        vadd.s8  q12, q6, q15   @ Add -128 so we can use saturating signed add
++
++        vtbl.8   d6,  {d27}, d6
++        vtbl.8   d7,  {d27}, d7
++        vadd.s8  q14, q7, q15   @ Add -128 so we can use saturating signed add
++        vzip.8   q2,  q3
++
++        vsub.s8  q0,  q15       @ remove the -128 bias again
++        vqadd.s8 q2,  q12
++        vqadd.s8 q3,  q14
++        vsub.s8  q1,  q15
++        vsub.s8  q2,  q15
++        vsub.s8  q3,  q15
++
++        bx      lr
++endfunc
++
++@ r0    destination address
++@ r2    stride to post-increment r0 with
++@ r4    upper clip value
++@ [r5]  translate values
++@
++@ a <- c <- b
++@ a in q0 - q3
++@ c in q4 - q7
++@ b in q8 - q11
++@
++@ q12-15 used as temp
++@
++@ Can be used for both Y & C as we unzip/zip the deltas and
++@ transform "u/v" separately via d26/d27.  For Y d26=d27
++
++function edge_64b_body_16
++@ q0-q3 <- c + xlat[2 + sign(c-a) + sign(c-b)] per 16-bit lane, then clip16_4; no core register or flag is written here
++        vcgt.u16 q12, q4, q0  // c > a -> -1 , otherwise 0
++        vcgt.u16 q13, q5, q1
++        vcgt.u16 q14, q6, q2
++        vcgt.u16 q15, q7, q3
++
++        vcgt.u16 q0, q0, q4  // a > c -> -1 , otherwise 0
++        vcgt.u16 q1, q1, q5
++        vcgt.u16 q2, q2, q6
++        vcgt.u16 q3, q3, q7
++
++        vsub.s16 q0, q0, q12 // a = sign(c-a)
++        vsub.s16 q1, q1, q13
++        vsub.s16 q2, q2, q14
++        vsub.s16 q3, q3, q15
++
++        vcgt.u16 q12, q4, q8  // c > b -> -1 , otherwise 0
++        vcgt.u16 q13, q5, q9
++        vcgt.u16 q14, q6, q10
++        vcgt.u16 q15, q7, q11
++
++        vsub.s16 q0, q0, q12  // subtracting -1 adds 1 where c > b
++        vsub.s16 q1, q1, q13
++        vsub.s16 q2, q2, q14
++        vsub.s16 q3, q3, q15
++
++        vcgt.u16 q12, q8, q4  // c < b -> -1 , otherwise 0
++        vcgt.u16 q13, q9, q5
++        vcgt.u16 q14, q10, q6
++        vcgt.u16 q15, q11, q7
++
++        vadd.s16 q0, q0, q12  // a = sign(c-a) + sign(c-b)
++        vadd.s16 q1, q1, q13
++        vadd.s16 q2, q2, q14
++        vadd.s16 q3, q3, q15
++
++        vmov.u8  q12, #2      // bias that turns -2..2 into table index 0..4
++
++        vmovn.s16 d0, q0      // narrow the 32 lane values to bytes in d0-d3
++        vmovn.s16 d1, q1
++        vmovn.s16 d2, q2
++        vmovn.s16 d3, q3
++
++        vldr     d26, [r5]    // xlat table applied to even bytes
++
++        vuzp.8   q0, q1       // q0 = even bytes, q1 = odd bytes
++
++        vldr     d27, [r5, #8] // xlat table applied to odd bytes
++
++        vadd.s8  q0, q0, q12
++        vadd.s8  q1, q1, q12
++
++        vmov.i64 q12, #0      // zero vector passed to clip16_4 (presumably the lower bound)
++
++        vtbl.8   d0, {d26}, d0
++        vtbl.8   d1, {d26}, d1
++        vtbl.8   d2, {d27}, d2
++        vtbl.8   d3, {d27}, d3
++
++        vdup.i16 q13, r4      // r4 = upper clip value in every lane
++
++        vzip.8   q0, q1       // offsets back in original order
++
++        @ Avoid overwrite whilst widening
++        vaddw.s8 q2, q6, d2
++        vaddw.s8 q3, q7, d3
++        vaddw.s8 q1, q5, d1
++        vaddw.s8 q0, q4, d0
++
++        @ now clip
++        clip16_4 q2, q3, q1, q0, q12, q13
++
++        bx       lr
++endfunc
++
++
++@ a <- c <- b
++@ a in q0
++@ c in q1
++@ b in q2
++@ Temp q3, q9, q10
++@
++@ d16, d17 (q8) xlat U, V
++@ q14.u8 #2
++@ q15.u8 #128
++@ q0 <- clip_u8(c + xlat[2 + sign(c-a) + sign(c-b)]) for 16 bytes; no core register or flag is written
++function edge_16b_body_8
++        vcgt.u8  q9,  q0,  q1   @ a > c -> -1 , otherwise 0
++        vadd.u8  q9,  q14, q9   @ running index starts at 2 (q14)
++        vcgt.u8  q0,  q1,  q0   @ c > a -> -1 , otherwise 0
++        vsub.u8  q9,  q9,  q0
++        vcgt.u8  q0,  q2,  q1   @ c < b -> -1 , otherwise 0
++        vadd.u8  q9,  q9,  q0
++        vcgt.u8  q0,  q1,  q2   @ c > b -> -1 , otherwise 0
++        vsub.u8  q0,  q9,  q0   @ q0 = 2 + sign(c-a) + sign(c-b)
++
++        vadd.s8  q3,  q1, q15   @ Add -128 so we can use saturating signed add
++
++        vuzp.8   d0,  d1        @ d0 = even-byte indices, d1 = odd-byte indices
++
++        vtbl.8   d0,  {d16}, d0 @ even bytes go through d16
++        vtbl.8   d1,  {d17}, d1 @ odd bytes go through d17
++
++        vzip.8   d0,  d1        @ offsets back in original byte order
++        vqadd.s8 q0,  q3        @ (c - 128) + offset with signed saturation
++        vsub.s8  q0,  q15       @ remove the -128 bias again
++
++        bx      lr
++endfunc
++
++@ a <- c <- b
++@ a in q0
++@ c in q1
++@ b in q2
++@ Temp q9 (q3 is not written by this body)
++@
++@ q12, #0
++@ d16, d17 xlat U, V
++@ q14.u16 #2
++@ q15.u16 max
++function edge_16b_body_16
++        vcgt.u16 q9, q0, q1     @ a > c -> -1 , otherwise 0
++        vadd.u16 q9, q14, q9    @ running index starts at 2 (q14)
++        vcgt.u16 q0, q1, q0     @ c > a -> -1 , otherwise 0
++        vsub.u16 q9, q9, q0
++        vcgt.u16 q0, q2, q1     @ c < b -> -1 , otherwise 0
++        vadd.u16 q9, q9, q0
++        vcgt.u16 q0, q1, q2     @ c > b -> -1 , otherwise 0
++        vsub.u16 q0, q9, q0     @ q0 = 2 + sign(c-a) + sign(c-b)
++
++        vmovn.s16 d0, q0        @ 8 indices narrowed to bytes in d0
++        @ d1 will have random contents that we transform but
++        @ that doesn't matter as we then discard them
++        vuzp.8   d0, d1
++
++        vtbl.8   d0, {d16}, d0
++        vtbl.8   d1, {d17}, d1
++
++        vzip.8   d0, d1         @ d0 = the 8 offsets in original order
++
++        vaddw.s8 q0, q1, d0     @ c + sign-extended offset
++
++        @ now clip
++        vmax.s16 q0, q12        @ lower bound (q12 = 0)
++        vmin.s16 q0, q15        @ upper bound (q15 = max)
++        bx       lr
++endfunc
++
++
++@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]   // Chroma only
++@   int eo,                           [sp, #sp_base + 0]
++@   int width,                        [sp, #sp_base + 4]
++@   int height)                       [sp, #sp_base + 8]
++
++@ Jumps via jump_tab with
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   EDGE_SRC_STRIDE                   [r3]
++@   (1 << \bit_depth) - 1             [r4]
++@   * xlat_table                      [r5]  // setup_64b only
++@   int height                        [r12]
++@
++@   0                                 [q12] // > 8 bit
++@   2                                 [q14]
++@   128                               [q15] // = 8 bit
++@   r4                                [q15] // > 8 bit
++
++.macro  edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0
++@ Common prologue / epilogue: builds the xlat table in d16-d17, picks the routine from jump_tab by eo and calls it
++@ Build translate registers
++@ As translate values can only be 0-4 we don't care about junk in the rest
++@ of the register
++.if \is_chroma
++        ldr      ip, [sp, #0]   @ _sao_offset_val_v (chroma-only stack argument)
++        push     {r4-r6, lr}    @ 16 bytes
++        vld1.8   {d16[2]}, [r3] @ byte at entry 0 of the U table -> index 2
++        add      r3, r3, #2     @ entries are int16_t, step to the next one
++        vld1.8   {d17[2]}, [ip] @ same for the V table
++        add      ip, ip, #2
++        vld1.8   {d16[0]}, [r3] @ entry 1 -> index 0
++        add      r3, r3, #2
++        vld1.8   {d17[0]}, [ip]
++        add      ip, ip, #2
++        vld1.8   {d16[1]}, [r3] @ entry 2 -> index 1
++        add      r3, r3, #2
++        vld1.8   {d17[1]}, [ip]
++        add      ip, ip, #2
++        vld1.8   {d16[3]}, [r3] @ entry 3 -> index 3
++        add      r3, r3, #2
++        vld1.8   {d17[3]}, [ip]
++        add      ip, ip, #2
++        vld1.8   {d16[4]}, [r3] @ entry 4 -> index 4
++        vld1.8   {d17[4]}, [ip]
++        movw     r3, EDGE_SRC_STRIDE    @ r3 is reused as the source stride from here on
++.set sp_base, 20
++.else
++        add      ip, r3, #4     @ ip -> entry 2
++        vld1.8   {d16[1]}, [r3] @ entry 0
++        add      r3, r3, #2
++        vld1.8   {d17[0]}, [ip] @ entry 2
++        add      ip, ip, #2
++        vld1.8   {d16[0]}, [r3] @ entry 1
++        add      r3, r3, #6     @ r3 -> entry 4
++        vld1.8   {d17[1]}, [ip] @ entry 3
++        vld1.8   {d16[2]}, [r3] @ entry 4
++        movw     r3, EDGE_SRC_STRIDE    @ r3 is reused as the source stride from here on
++        push     {r4-r6, lr}    @ 16 bytes
++        vzip.8   d16, d17       @ d16 = entries 1, 2, 0, 3, 4 followed by junk
++        vmov     d17, d16       @ luma: same table for even and odd bytes
++.set sp_base, 16
++.endif
++
++@ If setup_64b we need the xlat table on the stack
++.if \setup_64b
++        sub      r5, sp, #16    @ address the vpush below stores q8 (d16-d17) at
++.endif
++
++@ Get jump address
++@ We have a special case for width 4 as the calling code doesn't detect it
++@ If we may have w4 then we add a 2nd jump table after the 1st
++.if \check_w4
++        ldr      r12, [sp, #sp_base + 4]        @ width
++        adr      r6, \jump_tab
++        ldr      lr, [sp, #sp_base + 0]        @ eo
++        cmp      r12, #8
++        it lt
++        addlt    r6, #16        @ width < 8: skip the first table (4 entries of 4 bytes)
++.else
++        ldr      lr, [sp, #sp_base + 0]        @ eo
++        adr      r6, \jump_tab
++.endif
++
++        ldr      r12, [sp, #sp_base + 8]        @ height
++
++.if \bit_depth > 8
++        movw     r4, (1 << \bit_depth) - 1      @ largest sample value
++.endif
++.if \setup_16b
++.if \bit_depth > 8
++        vmov.i64 q12, #0
++        vdup.16  q15, r4
++        vmov.u16 q14, #2
++.else
++        vmov.u8  q15, #128
++        vmov.u8  q14, #2
++.endif
++.endif
++
++@ If setup_64b we need q4-q7 saved.
++.if \setup_64b
++        vpush    {q4-q8}        @ 80 bytes, q8 pushed first
++.set sp_base, sp_base + 80
++.endif
++
++        ldr      r6, [r6, lr, lsl #2]   @ r6 = table entry number eo
++
++@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
++.if \do2
++        push     {r0, r1, r6, r12}      @ keep dst, src, table entry and height for pass 2
++.if jent_pic
++        bl       98f
++.else
++        blx      r6
++.endif
++        pop      {r0, r1, r6, r12}
++
++        add      r0, #64        @ pass 2 starts 64 bytes further along dst
++        add      r1, #64        @ and 64 bytes further along src
++.endif
++
++.if jent_pic
++        bl       98f
++.else
++        blx      r6
++.endif
++
++@ Tidy up & return
++.if \setup_64b
++        vpop     {q4-q8}        @ spurious but harmless load of q8
++.endif
++        pop      {r4-r6, pc}
++
++.if jent_pic && !\xjump
++@ Magic label - used as 98b in jent macro
++98:
++        add      pc, r6         @ r6 is an offset relative to the pc read by this instruction
++.endif
++.endm
++
++@ Wrapper: edge_xxb_init with setup_16b=1 and the caller supplied check_w4
++.macro  edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
++        edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
++.endm
++@ Wrapper: edge_xxb_init with setup_64b=1 and the caller supplied do2 / xjump
++.macro  edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0
++        edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump
++.endm
++
++@ eo 0, 64 bytes per row: a = c displaced pb bytes back along the row, b = c displaced pb bytes forward
++.macro  edge_64b_e0, body_fn, pb
++        sub      r1, #8         @ start 8 bytes early so the load also covers the bytes before c
++        mov      r6, lr         @ lr is overwritten by the bl below
++1:      vldm     r1, {d7-d16}   @ d7 = 8 bytes before, q4-q7 = c, d16 = 8 bytes after
++        // load a
++        vext.8   q0,  q3,  q4, #(16 - \pb)
++        add      r1, r3         @ next source row
++        vext.8   q1,  q4,  q5, #(16 - \pb)
++        subs     r12, #1        @ rows left; flags stay live until the bgt
++        vext.8   q2,  q5,  q6, #(16 - \pb)
++        vext.8   q3,  q6,  q7, #(16 - \pb)
++        pld      [r1]
++        // load b
++        vext.8   q11, q7,  q8, #\pb     @ Avoid overwrite
++        pld      [r1, #64]
++        vext.8   q8,  q4,  q5, #\pb
++        vext.8   q9,  q5,  q6, #\pb
++        vext.8   q10, q6,  q7, #\pb
++        bl       \body_fn
++        vstm     r0, {q0-q3}    @ store the 64 result bytes
++        add      r0, r0, r2     @ next destination row
++        bgt      1b
++        bx       r6             @ return via the saved lr
++.endm
++@ eo 0, two rows of 32 bytes per pass: first row via r1 / r0, second row via r6 / r7
++.macro  edge_32bx2_e0, body_fn, pb
++        add      r6, r1, r3     @ r6 = second source row
++        push     {r7,lr}
++        sub      r1, #8         @ start 8 bytes early so the load also covers the bytes before c
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++1:      vldmia   r1, {d7-d12}   @ d7 = 8 bytes before, q4-q5 = c, d12 = 8 bytes after
++        // load a
++        vext.8   q0, q3, q4, #16 - \pb
++        add      r1, r1, r3, lsl #1
++        vext.8   q1, q4, q5, #16 - \pb
++        subs     r12, #2        @ two rows consumed; flags stay live until the bgt
++        // load b
++        vext.8   q8, q4, q5, #\pb
++        vext.8   q9, q5, q6, #\pb
++        vldr     d25, [r6, #-8] @ second row: 8 bytes before
++        vldmia   r6, {d12-d15}  @ second row: c in q6-q7
++        vldr     d26, [r6, #32] @ second row: 8 bytes after
++        // load a
++        vext.8   q2, q12, q6, #16 - \pb
++        add      r6, r6, r3, lsl #1
++        vext.8   q3, q6, q7, #16 - \pb
++        // load b
++        vext.8   q10, q6, q7, #\pb
++        vext.8   q11, q7, q13, #\pb
++        bl       \body_fn
++        vst1.8   {q0-q1}, [r0, :256], r2
++        vst1.8   {q2-q3}, [r7, :256], r2
++        bgt      1b
++        pop      {r7,pc}
++.endm
++@ eo 0, 16 bytes per row: a = c displaced pb bytes back along the row, b = c displaced pb bytes forward
++.macro  edge_16b_e0, body_fn, pb
++        sub      r1, #8         @ start 8 bytes early so the load also covers the bytes before c
++        mov      r6, lr         @ lr is overwritten by the bl below
++1:      vldmia   r1, {d1-d4}    @ d1 = 8 bytes before, q1 = c, d4 = 8 bytes after
++        add      r1, r3         @ next source row
++        subs     r12, #1        @ rows left; flags stay live until the bgt
++        vext.8   q0, q0, q1, #16 - \pb  @ a
++        vext.8   q2, q1, q2, #\pb       @ b
++
++        bl       \body_fn
++        vst1.8   {q0}, [r0, :128], r2
++        bgt      1b
++        bx       r6             @ return via the saved lr
++.endm
++@ eo 0, two rows of 8 bytes per pass: first row via r1 / r0, second row via r6 / r7
++.macro  edge_8bx2_e0, body_fn, pb
++        add      r6, r1, r3     @ r6 = second source row
++        push     {r7,lr}
++        sub      r1, #8         @ start 8 bytes early so the load also covers the bytes before c
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++1:      vldmia   r1, {d1-d2}    @ first row: d1 = 8 bytes before, d2 = c
++        vldmia   r6, {d3-d4}    @ second row: d3 = c, d4 = 8 bytes after
++        vldr     d6, [r1, #16]  @ first row: 8 bytes after
++        subs     r12, #2        @ two rows consumed; flags stay live until the bgt
++        vldr     d7, [r6, #-8]  @ second row: 8 bytes before
++        add      r1, r1, r3, lsl #1
++        vext.8   d0, d1, d2, #8 - \pb   @ a, first row
++        add      r6, r6, r3, lsl #1
++        vext.8   d5, d3, d4, #\pb       @ b, second row
++        vext.8   d4, d2, d6, #\pb       @ b, first row
++        vext.8   d1, d7, d3, #8 - \pb   @ a, second row
++
++        bl       \body_fn
++        vst1.8   {d0}, [r0, :64], r2
++        vst1.8   {d1}, [r7, :64], r2
++        bgt      1b
++        pop      {r7,pc}
++.endm
++@ eo 0, four rows of 4 bytes per pass, packed into one 16-byte a / c / b set; separate loops by alignment of r1
++.macro  edge_4bx4_e0, body_fn, pb
++        add      r6, r1, r3     @ r6 = odd source rows, r1 = even source rows
++        push     {r7,lr}
++        add      r7, r0, r2     @ r7 = odd destination rows, r0 = even destination rows
++        lsl      r2, #1
++
++        tst      r1, #4
++        bne      2f
++1:      // r1 (and assumed r6) are 64-bit aligned
++        vldr     d2, [r1]
++        vldr     d0, [r1, #-8]
++        add      r1, r1, r3, lsl #1
++        vldr     d20, [r6]
++        subs     r12, #4        @ four rows consumed; flags stay live until the bgt
++        vldr     d18, [r6, #-8]
++        add      r6, r6, r3, lsl #1
++        vldr     d3, [r1]
++        vshr.u64 d4, d2, #\pb * 8       @ b: row moved down by pb bytes
++        vldr     d1, [r1, #-8]
++        add      r1, r1, r3, lsl #1
++        vldr     d21, [r6]
++        vext.8   d0, d0, d2, #8 - \pb   @ a: last pb bytes of the previous 8 then the row
++        vldr     d19, [r6,#-8]
++        add      r6, r6, r3, lsl #1
++        vshr.u64 d22, d20, #\pb * 8
++        vext.8   d18, d18, d20, #8 - \pb
++        vshr.u64 d5, d3, #\pb * 8
++        vext.8   d1, d1, d3, #8 - \pb
++        vshr.u64 d23, d21, #\pb * 8
++        vext.8   d19, d19, d21, #8 - \pb
++        vsli.64  q1, q10, #32   @ c: odd-row words inserted above the even-row words
++        vsli.64  q2, q11, #32   @ b packed the same way
++        vsli.64  q0, q9, #32    @ a packed the same way
++
++        bl       \body_fn
++        vst1.32  {d0[0]}, [r0, :32], r2
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vst1.32  {d1[1]}, [r7, :32], r2
++        bgt      1b
++        pop      {r7,pc}
++
++2:      // r1 (and assumed r6) are 32-bit but not 64-bit aligned
++        vldr     d20, [r1, #-4] @ loads are moved by 4 bytes so they stay 64-bit aligned
++        vldr     d22, [r1, #4]
++        add      r1, r1, r3, lsl #1
++        vldr     d2, [r6, #-4]
++        subs     r12, #4        @ four rows consumed; flags stay live until the bgt
++        vldr     d4, [r6, #4]
++        add      r6, r6, r3, lsl #1
++        vldr     d21, [r1, #-4]
++        vshl.i64 d18, d20, #\pb * 8
++        vldr     d23, [r1, #4]
++        add      r1, r1, r3, lsl #1
++        vldr     d3, [r6, #-4]
++        vext.8   d22, d20, d22, #\pb
++        vldr     d5, [r6, #4]
++        add      r6, r6, r3, lsl #1
++        vshl.i64 d0, d2, #\pb * 8
++        vext.8   d4, d2, d4, #\pb
++        vshl.i64 d19, d21, #\pb * 8
++        vext.8   d23, d21, d23, #\pb
++        vshl.i64 d1, d3, #\pb * 8
++        vext.8   d5, d3, d5, #\pb
++        vsri.64  q1, q10, #32   @ here the even-row words are inserted below the odd-row words
++        vsri.64  q0, q9, #32
++        vsri.64  q2, q11, #32
++
++        bl       \body_fn
++        vst1.32  {d0[0]}, [r0, :32], r2
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vst1.32  {d1[1]}, [r7, :32], r2
++        bgt      2b
++        pop      {r7,pc}
++.endm
++
++@ eo 1, 64 bytes per row: a = row before c, b = row after c; c and b are carried over as the next a and c
++.macro  edge_64b_e1, body_fn
++        sub      r1, r3         @ start one row before the first c row
++        push     {lr}
++        add      r6, r1, #32    @ r6 walks the second 32 bytes of each row
++        // load a
++        vld1.8   {q0-q1}, [r1, :256], r3
++        vld1.8   {q2-q3}, [r6, :256], r3
++        // load c
++        vld1.8   {q4-q5}, [r1, :256], r3
++        vld1.8   {q6-q7}, [r6, :256], r3
++1:      // load b
++        vld1.8   {q8-q9}, [r1, :256], r3
++        subs     r12, #1        @ rows left; flags stay live until the conditional return
++        vld1.8   {q10-q11}, [r6, :256], r3
++        bl       \body_fn
++        vstm     r0, {q0-q3}
++        // copy c to a
++        vmov.64  q0, q4
++        pld      [r1, r3]
++        vmov.64  q1, q5
++        it       le
++        pople    {lr}           @ last row done: recover the return address
++        vmov.64  q2, q6
++        it       le
++        bxle     lr             @ and return
++        vmov.64  q3, q7
++        add      r0, r0, r2     @ next destination row
++        // copy b to c
++        vmov.64  q4, q8
++        vmov.64  q5, q9
++        vmov.64  q6, q10
++        vmov.64  q7, q11
++        b        1b
++.endm
++@ eo 1, two rows of 32 bytes per pass; both result rows are stored through r0
++.macro  edge_32bx2_e1, body_fn
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        vld1.8   {q2-q3}, [r1, :256], r3        @ first c row, also a for the second row
++        vld1.8   {q0-q1}, [r6, :256]            @ a for the first row
++        mov      r6, lr         @ r6 now keeps the return address
++
++1:      @ Given the data duplication here we could obviously do better than
++        @ using the generic body_fn but it almost certainly isn't worth it
++        vld1.8   {q8-q9}, [r1, :256], r3        @ b for the first row
++        subs     r12, #2        @ two rows consumed; flags stay live until the bxle
++        vmov     q4, q2         @ c, first row
++        vmov     q5, q3
++        vld1.8   {q10-q11}, [r1, :256], r3      @ b for the second row
++        vmov     q6, q8         @ c, second row
++        vmov     q7, q9
++
++        bl       \body_fn
++
++        vst1.8   {q0-q1}, [r0, :256], r2
++        // copy b to a
++        vmov     q0, q8
++        vmov     q1, q9
++        vst1.8   {q2-q3}, [r0, :256], r2
++        vmov     q2, q10
++        it       le
++        bxle     r6             @ no rows left: return via the saved lr
++        vmov     q3, q11
++        b        1b
++.endm
++@ eo 1, 16 bytes per row: a = row before c, b = row after c; c and b are carried over as the next a and c
++.macro  edge_16b_e1, body_fn
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        // load c
++        vld1.8   {q1}, [r1, :128], r3
++        // load a
++        vld1.8   {q0}, [r6, :128]
++        mov      r6, lr         @ r6 now keeps the return address
++1:      // load b
++        vld1.8   {q2}, [r1, :128], r3
++        bl       \body_fn
++        vst1.8   {q0}, [r0, :128], r2
++        subs     r12, #1        @ rows left
++        // copy c to a
++        vmov.64  q0, q1
++        it       le
++        bxle     r6             @ no rows left: return via the saved lr
++        // copy b to c
++        vmov.64  q1, q2
++        b        1b
++.endm
++@ eo 1, two rows of 8 bytes per pass: r1 and r6 each read every second source row, r0 and r7 likewise for dst
++.macro  edge_8bx2_e1, body_fn
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        lsl      r3, #1         @ each source pointer now advances two rows per load
++        push     {r7, lr}
++        vld1.8   {d1}, [r1, :64], r3    @ first c row, also a for the second row
++        vld1.8   {d0}, [r6, :64], r3    @ a for the first row
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++1:      @ Given the data duplication here we could obviously do better than
++        @ using the generic body_fn but it almost certainly isn't worth it
++        vld1.8   {d4}, [r6, :64], r3    @ b for the first row
++        vmov     d2, d1                 @ c, first row
++        vld1.8   {d5}, [r1, :64], r3    @ b for the second row
++        subs     r12, #2        @ two rows consumed; flags stay live until the bgt
++        vmov     d3, d4                 @ c, second row
++
++        bl       \body_fn
++
++        vst1.8   {d0}, [r0, :64], r2
++        vst1.8   {d1}, [r7, :64], r2
++
++        // copy b to a
++        vmov     q0, q2
++        bgt      1b
++        pop      {r7, pc}
++.endm
++@ eo 1, four rows of 4 bytes per pass; with n the first c row: a = rows n-1..n+2, c = rows n..n+3, b = rows n+1..n+4
++.macro  edge_4bx4_e1, body_fn
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        lsl      r3, #1         @ each source pointer now advances two rows per load
++        push     {r7, lr}
++        vld1.32  {d0[1]}, [r1, :32], r3
++        add      r7, r0, r2     @ r7 = second destination row
++        vld1.32  {d0[0]}, [r6, :32], r3
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++        vld1.32  {d4[1]}, [r1, :32], r3
++        vld1.32  {d4[0]}, [r6, :32], r3
++        vld1.32  {d5[1]}, [r1, :32], r3
++        vld1.32  {d5[0]}, [r6, :32], r3
++        vmov     d1, d4         @ second half of a
++        vext.32  d2, d0, d4, #1 @ first half of c
++        subs     r12, #4        @ four rows consumed
++        vmov     d22, d5        @ keep the last two loaded rows for the next pass
++        vext.32  d3, d4, d5, #1 @ second half of c
++        b        2f
++
++1:      vst1.32  {d0[0]}, [r0, :32], r2 @ stores of the previous pass interleaved with set-up of the next
++        vext.32  d2, d22, d4, #1
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vmov     d0, d22
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vext.32  d3, d4, d5, #1
++        vst1.32  {d1[1]}, [r7, :32], r2
++        vmov     d1, d4
++        vmov     d22, d5
++2:      @ Given the data duplication here we could probably do better than
++        @ using the generic body_fn but it almost certainly isn't worth it
++        bl       \body_fn
++        ble      3f             @ tests the flags of the most recent subs r12
++        vld1.32  {d4[0]}, [r6, :32], r3
++        subs     r12, #4
++        vld1.32  {d4[1]}, [r1, :32], r3
++        vld1.32  {d5[0]}, [r6, :32], r3
++        vld1.32  {d5[1]}, [r1, :32], r3
++        b        1b
++
++3:      vst1.32  {d0[0]}, [r0, :32], r2 @ final four rows
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32]
++        vst1.32  {d1[1]}, [r7, :32]
++        pop      {r7, pc}
++.endm
++@ eo 2, 64 bytes per row: a = row before c displaced pb bytes back, b = row after c displaced pb bytes forward
++.macro  edge_64b_e2, body_fn, pb
++        push     {lr}
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        // load c and a
++        vld1.8   {q4-q5}, [r1, :128]
++        vldr     d25, [r6, #-8] @ 8 bytes before that row (top half of q12)
++        vldmia   r6, {d16-d23}
++        vext.8   q0, q12, q8, #16 - \pb
++        add      r6, r1, #32
++        vext.8   q1, q8, q9, #16 - \pb
++        add      r1, r1, r3     @ r1 = row after c
++        vext.8   q2, q9, q10, #16 - \pb
++        vld1.8   {q6-q7}, [r6, :128]
++        sub      r6, r1, r3     @ r6 = current c row
++        vext.8   q3, q10, q11, #16 - \pb
++
++1:      // load b
++        vldmia   r1, {d16-d24}  @ 64 bytes of the row plus the 8 bytes after it
++        vext.8   q8, q8, q9, #\pb
++        pld      [r1, r3]
++        vext.8   q9, q9, q10, #\pb
++        subs     r12, #1        @ rows left; flags stay live until the conditional return
++        vext.8   q10, q10, q11, #\pb
++        vext.8   q11, q11, q12, #\pb
++        bl       \body_fn
++        // next a is mostly available in c
++        vldr     d25, [r6, #-8]
++        vstmia   r0, {q0-q3}
++        vext.8   q3, q6, q7, #16 - \pb
++        it       le
++        pople    {lr}           @ last row done: recover the return address
++        vext.8   q2, q5, q6, #16 - \pb
++        it       le
++        bxle     lr             @ and return
++        vext.8   q1, q4, q5, #16 - \pb
++        add      r6, r6, r3
++        vext.8   q0, q12, q4, #16 - \pb
++        add      r0, r0, r2     @ next destination row
++        // next c is mostly available in b
++        vldr     d8, [r1]       @ first 8 bytes are reloaded, the rest is rebuilt from b
++        vext.8   d9, d16, d17, #8 - \pb
++        vext.8   q5, q8, q9, #16 - \pb
++        add      r1, r1, r3
++        vext.8   q6, q9, q10, #16 - \pb
++        pld      [r6, #-8]
++        vext.8   q7, q10, q11, #16 - \pb
++        b        1b
++.endm
++@ eo 2, two rows of 32 bytes per pass: results stored via r0 (first row) and r7 (second row)
++.macro  edge_32bx2_e2, body_fn, pb
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        push     {r7, lr}
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++        // load a and first 32b of c
++        vld1.8   {q4-q5}, [r1, :256]
++        vldr     d25, [r6, #-8]
++        vld1.8   {q13-q14}, [r6, :256]
++        vldr     d31, [r1, #-8]
++        add      r6, r6, r3, lsl #1
++        vext.8   q0, q12, q13, #16 - \pb
++        add      r1, r1, r3, lsl #1
++        vext.8   q1, q13, q14, #16 - \pb
++        vext.8   q2, q15, q4, #16 - \pb
++        vext.8   q3, q4, q5, #16 - \pb
++1:
++        // load second 32b of c and second 32b of b
++        vldmia   r6, {d12-d16}
++        vldmia   r1, {d20-d24}
++        // first 32b of b is mostly available in second 32b of c
++        vext.8   q9, q7, q8, #\pb
++        subs     r12, #2        @ two rows consumed; flags stay live until the ble
++        vext.8   q8, q6, q7, #\pb
++        vext.8   q10, q10, q11, #\pb
++        vext.8   q11, q11, q12, #\pb
++
++        bl       \body_fn
++
++        vst1.8   {q0-q1}, [r0, :256], r2
++        vst1.8   {q2-q3}, [r7, :256], r2
++        ble      2f
++
++        vldr     d25, [r6, #-8]
++        add      r6, r6, r3, lsl #1
++        vldr     d8, [r1]       @ first 8 bytes are reloaded, the rest is rebuilt from b
++        vext.8   d9, d20, d21, #8 - \pb
++        vldr     d31, [r1, #-8]
++        add      r1, r1, r3, lsl #1
++        // first 32b of a is mostly available in second 32b of c
++        vext.8   q1, q6, q7, #16 - \pb
++        vext.8   q0, q12, q6, #16 - \pb
++        // first 32b of c is mostly available in second 32b of b
++        vext.8   q5, q10, q11, #16 - \pb
++        // second 32b of a is mostly available in first 32b of c
++        vext.8   q2, q15, q4, #16 - \pb
++        vext.8   q3, q4, q5, #16 - \pb
++        b        1b
++
++2:      pop      {r7, pc}
++.endm
++@ eo 2, 16 bytes per row: a = row before c displaced pb bytes back, b = row after c displaced pb bytes forward
++.macro  edge_16b_e2, body_fn, pb
++        push     {lr}
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        vld1.8   {q1}, [r1, :128], r3   @ c; r1 moves on to the row after it
++        vldr     d19, [r6, #-8]         @ 8 bytes before the a row (top half of q9)
++        vld1.8   {q10}, [r6, :128], r3  @ a row; r6 moves on to the c row
++
++1:      vldmia   r1, {d4-d6}    @ b row plus the 8 bytes after it
++        vext.8   q0, q9, q10, #16 - \pb @ a
++        subs     r12, #1        @ rows left; flags stay live until the ble
++        vext.8   q2, q2, q3, #\pb       @ b
++        bl       \body_fn
++        vst1.8   {q0}, [r0, :128], r2
++        ble      2f
++        vmov     q10, q1        @ the old c row is the next a row
++        vldr     d2, [r1]       @ first 8 bytes of the next c are reloaded
++        add      r1, r1, r3
++        vldr     d19, [r6, #-8]
++        add      r6, r6, r3
++        vext.8   d3, d4, d5, #8 - \pb   @ rest of the next c is rebuilt from b
++        b        1b
++
++2:      pop      {pc}
++.endm
++@ eo 2, two rows of 8 bytes per pass: results stored via r0 (first row) and r7 (second row)
++.macro  edge_8bx2_e2, body_fn, pb
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        push     {r7, lr}
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++        vldr     d18, [r6, #-8]
++        vldr     d19, [r6]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #-8]
++        vldr     d2, [r1]       @ c, first row
++        add      r1, r1, r3, lsl #1
++        vldmia   r6, {d3-d4}    @ d3 = c, second row; d4 = the 8 bytes after it
++        vld1.8   {d21-d22}, [r1, :128]
++
++1:      vext.8   d0, d18, d19, #8 - \pb @ a, first row
++        vext.8   d4, d3, d4, #\pb       @ b, first row
++        vext.8   d1, d20, d2, #8 - \pb  @ a, second row
++        subs     r12, #2        @ two rows consumed; flags stay live until the ble
++        vext.8   d5, d21, d22, #\pb     @ b, second row
++
++        bl       \body_fn
++
++        vst1.8   {d0}, [r0, :64], r2
++        vst1.8   {d1}, [r7, :64], r2
++        ble      2f
++
++        vldr     d18, [r6, #-8]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #-8]
++        vmov     d19, d3        @ the old second c row feeds the next first a
++        vldr     d2, [r1]
++        add      r1, r1, r3, lsl #1
++        vldmia   r6, {d3-d4}
++        vld1.8   {d21-d22}, [r1, :128]
++        b        1b
++
++2:      pop      {r7, pc}
++.endm
++@ eo 2, four rows of 4 bytes per pass: a, c and b are each read through their own pointer (r6, r1, r8)
++.macro  edge_4bx4_e2, body_fn, pb
++        sub      r6, r1, r3     @ r6 = row before c
++        push     {r7-r9, lr}
++        add      r8, r1, r3     @ r8 = row after c
++        sub      r6, r6, #\pb   @ a is read pb bytes earlier in its row
++        add      r8, r8, #\pb   @ b is read pb bytes later in its row
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++
++1:      vld1.32  {d0[0]}, [r6], r3      @ loads carry no alignment hint
++        subs     r12, #4        @ four rows consumed; flags stay live until the bgt
++        vld1.32  {d2[0]}, [r1], r3
++        vld1.32  {d4[0]}, [r8], r3
++        vld1.32  {d0[1]}, [r6], r3
++        vld1.32  {d2[1]}, [r1], r3
++        vld1.32  {d4[1]}, [r8], r3
++        vld1.32  {d1[0]}, [r6], r3
++        vld1.32  {d3[0]}, [r1], r3
++        vld1.32  {d5[0]}, [r8], r3
++        vld1.32  {d1[1]}, [r6], r3
++        vld1.32  {d3[1]}, [r1], r3
++        vld1.32  {d5[1]}, [r8], r3
++
++        bl       \body_fn
++
++        vst1.32  {d0[0]}, [r0, :32], r2
++        vst1.32  {d0[1]}, [r7, :32], r2
++        vst1.32  {d1[0]}, [r0, :32], r2
++        vst1.32  {d1[1]}, [r7, :32], r2
++        bgt      1b
++
++        pop      {r7-r9,pc}
++.endm
++@ eo 3, 64 bytes per row: a = row before c displaced pb bytes forward, b = row after c displaced pb bytes back
++.macro  edge_64b_e3, body_fn, pb
++        push     {lr}
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        // load c and a
++        vld1.8   {q4-q5}, [r1, :128]
++        vldmia   r6, {d16-d24}  @ 64 bytes of the row plus the 8 bytes after it
++        vext.8   q0, q8, q9, #\pb
++        add      r6, r1, #32
++        vext.8   q1, q9, q10, #\pb
++        add      r1, r1, r3     @ r1 = row after c
++        vext.8   q2, q10, q11, #\pb
++        vld1.8   {q6-q7}, [r6, :128]
++        sub      r6, r1, r3     @ r6 = current c row
++        vext.8   q3, q11, q12, #\pb
++
++1:      // load b
++        vldr     d17, [r1, #-8] @ 8 bytes before the row (top half of q8)
++        vldmia   r1, {d18-d25}
++        vext.8   q8, q8, q9, #16 - \pb
++        pld      [r1, r3]
++        vext.8   q9, q9, q10, #16 - \pb
++        subs     r12, #1        @ rows left; flags stay live until the conditional return
++        vext.8   q10, q10, q11, #16 - \pb
++        vext.8   q11, q11, q12, #16 - \pb
++        bl       \body_fn
++        // next a is mostly available in c
++        vldr     d24, [r6, #64] @ the 8 bytes after the c row
++        vstmia   r0, {q0-q3}
++        vext.8   q0, q4, q5, #\pb
++        it       le
++        pople    {lr}           @ last row done: recover the return address
++        vext.8   q1, q5, q6, #\pb
++        it       le
++        bxle     lr             @ and return
++        vext.8   q2, q6, q7, #\pb
++        add      r6, r6, r3
++        vext.8   q3, q7, q12, #\pb
++        add      r0, r0, r2     @ next destination row
++        // next c is mostly available in b
++        vext.8   d14, d22, d23, #\pb
++        vldr     d15, [r1, #56] @ last 8 bytes are reloaded, the rest is rebuilt from b
++        vext.8   q4, q8, q9, #\pb
++        add      r1, r1, r3
++        vext.8   q5, q9, q10, #\pb
++        vext.8   q6, q10, q11, #\pb
++        b        1b
++.endm
++@ eo 3, two rows of 32 bytes per pass: results stored via r0 (first row) and r7 (second row)
++.macro  edge_32bx2_e3, body_fn, pb
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        push     {r7, lr}
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++        // load a and first 32b of c
++        vldmia   r1, {d8-d12}
++        vldmia   r6, {d24-d28}
++        vext.8   q2, q4, q5, #\pb
++        add      r6, r6, r3, lsl #1
++        vext.8   q3, q5, q6, #\pb
++        add      r1, r1, r3, lsl #1
++        vext.8   q0, q12, q13, #\pb
++        vext.8   q1, q13, q14, #\pb
++1:
++        // load second 32b of c and second 32b of b
++        vldr     d25, [r6, #-8]
++        subs     r12, #2        @ two rows consumed; flags stay live until the ble
++        vldmia   r6, {d12-d15}
++        vldr     d27, [r1, #-8]
++        vldmia   r1, {d20-d23}
++        // first 32b of b is mostly available in second 32b of c
++        vext.8   q8, q12, q6, #16 - \pb
++        vext.8   q9, q6, q7, #16 - \pb
++        vext.8   q11, q10, q11, #16 - \pb
++        vext.8   q10, q13, q10, #16 - \pb
++
++        bl       \body_fn
++
++        vst1.8   {q0-q1}, [r0, :256], r2
++        vst1.8   {q2-q3}, [r7, :256], r2
++        ble      2f
++
++        vldr     d24, [r6, #32]
++        add      r6, r6, r3, lsl #1
++        vldr     d11, [r1, #24] @ last 8 bytes are reloaded, the rest is rebuilt from b
++        vext.8   d10, d22, d23, #\pb
++        vldr     d30, [r1, #32]
++        add      r1, r1, r3, lsl #1
++        // first 32b of a is mostly available in second 32b of c
++        vext.8   q0, q6, q7, #\pb
++        vext.8   q1, q7, q12, #\pb
++        // first 32b of c is mostly available in second 32b of b
++        vext.8   q4, q10, q11, #\pb
++        // second 32b of a is mostly available in first 32b of c
++        vext.8   q3, q5, q15, #\pb
++        vext.8   q2, q4, q5, #\pb
++        b        1b
++
++2:      pop      {r7, pc}
++.endm
++@ eo 3, 16 bytes per row: a = row before c displaced pb bytes forward, b = row after c displaced pb bytes back
++.macro  edge_16b_e3, body_fn, pb
++        push     {lr}
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        vld1.8   {q1}, [r1, :128], r3   @ c; r1 moves on to the row after it
++        vldmia   r6, {d18-d20}  @ a row plus the 8 bytes after it
++        add      r6, r6, r3     @ r6 moves on to the c row
++
++1:      vldr     d5, [r1, #-8]  @ 8 bytes before the b row (top half of q2)
++        vld1.8   {q3}, [r1, :128]
++        subs     r12, #1        @ rows left; flags stay live until the ble
++        vext.8   q0, q9, q10, #\pb      @ a
++        vext.8   q2, q2, q3, #16 - \pb  @ b
++        bl       \body_fn
++        vst1.8   {q0}, [r0, :128], r2
++        ble      2f
++        vmov     q9, q1         @ the old c row is the next a row
++        vldr     d3, [r1, #8]   @ last 8 bytes of the next c are reloaded
++        add      r1, r1, r3
++        vldr     d20, [r6, #16]
++        add      r6, r6, r3
++        vext.8   d2, d4, d5, #\pb       @ rest of the next c is rebuilt from b
++        b        1b
++
++2:      pop      {pc}
++.endm
++@ eo 3, two rows of 8 bytes per pass: results stored via r0 (first row) and r7 (second row)
++.macro  edge_8bx2_e3, body_fn, pb
++        sub      r6, r1, r3     @ r6 = row before the first c row
++        push     {r7, lr}
++        add      r7, r0, r2     @ r7 = second destination row
++        lsl      r2, #1         @ each destination pointer now advances two rows per store
++        vld1.8   {d18-d19}, [r6]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #8]
++        vldr     d2, [r1]       @ c, first row
++        add      r1, r1, r3, lsl #1
++        vldr     d4, [r6, #-8]
++        vldr     d3, [r6]       @ c, second row
++        vldr     d21, [r1, #-8]
++        vldr     d22, [r1]
++
++1:      vext.8   d0, d18, d19, #\pb     @ a, first row
++        vext.8   d4, d4, d3, #8 - \pb   @ b, first row
++        vext.8   d1, d2, d20, #\pb      @ a, second row
++        subs     r12, #2        @ two rows consumed; flags stay live until the ble
++        vext.8   d5, d21, d22, #8 - \pb @ b, second row
++
++        bl       \body_fn
++
++        vst1.8   {d0}, [r0, :64], r2
++        vst1.8   {d1}, [r7, :64], r2
++        ble      2f
++
++        vldr     d19, [r6, #8]
++        add      r6, r6, r3, lsl #1
++        vldr     d20, [r1, #8]
++        vmov     d18, d3        @ the old second c row feeds the next first a
++        vldr     d2, [r1]
++        add      r1, r1, r3, lsl #1
++        vldr     d4, [r6, #-8]
++        vldr     d3, [r6]
++        vldr     d21, [r1, #-8]
++        vldr     d22, [r1]
++        b        1b
++
++2:      pop      {r7, pc}
++.endm
++
++.macro  edge_4bx4_e3, body_fn, pb
++        @ e3 is e2 with the X offset reversed, so reuse the e2 macro with the byte offset pb negated
++        edge_4bx4_e2 \body_fn, (-\pb)
++.endm
++
++@ Jump table entry - in Thumb builds (the T lines) the bottom bit of an absolute entry must be set
++@ ? There is probably a real asm directive to do this but I haven't found it
++.macro jent lab
++.if jent_pic
++@ PIC form: entry = distance from (local label 98 + 4 on T lines, + 8 on A lines); no bottom bit is added.
++@ .short would fit, but A32 has no ldrh [lsl#1] so .word keeps the lookup code simpler and clearer
++T       .word  (0 + \lab) - (4 + 98b)
++A       .word  (0 + \lab) - (8 + 98b)
++.else
++T       .word   1 + \lab
++A       .word   \lab
++.endif
++.endm
++
++.macro edge_64b_bodies, body_fn, pb              @ 4-entry jump table followed by the edge_64b kernels it points at
++        jent    0f                               @ entry 0 -> e0 kernel
++        jent    10f                              @ entry 1 -> e1 kernel
++        jent    20f                              @ entry 2 -> e2 kernel
++        jent    30f                              @ entry 3 -> e3 kernel
++        @ Kernel bodies; e1 is invoked without pb. Table index is presumably the eo argument - TODO confirm in the init macro
++0:      edge_64b_e0     \body_fn, \pb
++10:     edge_64b_e1     \body_fn
++20:     edge_64b_e2     \body_fn, \pb
++30:     edge_64b_e3     \body_fn, \pb
++.endm
++
++.macro edge_32bx2_bodies, body_fn, pb            @ 4-entry jump table followed by the edge_32bx2 kernels it points at
++        jent    0f                               @ entry 0 -> e0 kernel
++        jent    10f                              @ entry 1 -> e1 kernel
++        jent    20f                              @ entry 2 -> e2 kernel
++        jent    30f                              @ entry 3 -> e3 kernel
++        @ Kernel bodies; e1 is invoked without pb. Table index is presumably the eo argument - TODO confirm in the init macro
++0:      edge_32bx2_e0   \body_fn, \pb
++10:     edge_32bx2_e1   \body_fn
++20:     edge_32bx2_e2   \body_fn, \pb
++30:     edge_32bx2_e3   \body_fn, \pb
++.endm
++
++.macro edge_16b_bodies, body_fn, pb              @ 4-entry jump table followed by the edge_16b kernels it points at
++        jent    0f                               @ entry 0 -> e0 kernel
++        jent    10f                              @ entry 1 -> e1 kernel
++        jent    20f                              @ entry 2 -> e2 kernel
++        jent    30f                              @ entry 3 -> e3 kernel
++        @ Kernel bodies; e1 is invoked without pb. Table index is presumably the eo argument - TODO confirm in the init macro
++0:      edge_16b_e0     \body_fn, \pb
++10:     edge_16b_e1     \body_fn
++20:     edge_16b_e2     \body_fn, \pb
++30:     edge_16b_e3     \body_fn, \pb
++.endm
++
++.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb   @ 8-entry jump table: edge_32bx2 kernels first, then edge_16b kernels
++        jent    0f                               @ entry 0 -> 32bx2 e0
++        jent    10f                              @ entry 1 -> 32bx2 e1
++        jent    20f                              @ entry 2 -> 32bx2 e2
++        jent    30f                              @ entry 3 -> 32bx2 e3
++        jent    5f                               @ entry 4 -> 16b e0
++        jent    15f                              @ entry 5 -> 16b e1
++        jent    25f                              @ entry 6 -> 16b e2
++        jent    35f                              @ entry 7 -> 16b e3
++        @ Kernel bodies; the 32bx2 ones call body_fn_64b, the 16b ones call body_fn_16b. How the upper half is selected is in the init macro - not visible here
++0:      edge_32bx2_e0   \body_fn_64b, \pb
++10:     edge_32bx2_e1   \body_fn_64b
++20:     edge_32bx2_e2   \body_fn_64b, \pb
++30:     edge_32bx2_e3   \body_fn_64b, \pb
++5:      edge_16b_e0     \body_fn_16b, \pb
++15:     edge_16b_e1     \body_fn_16b
++25:     edge_16b_e2     \body_fn_16b, \pb
++35:     edge_16b_e3     \body_fn_16b, \pb
++.endm
++
++.macro edge_16b_8bx2_bodies, body_fn, pb         @ 8-entry jump table: edge_16b kernels first, then edge_8bx2 kernels
++        jent    0f                               @ entry 0 -> 16b e0
++        jent    10f                              @ entry 1 -> 16b e1
++        jent    20f                              @ entry 2 -> 16b e2
++        jent    30f                              @ entry 3 -> 16b e3
++        jent    5f                               @ entry 4 -> 8bx2 e0
++        jent    15f                              @ entry 5 -> 8bx2 e1
++        jent    25f                              @ entry 6 -> 8bx2 e2
++        jent    35f                              @ entry 7 -> 8bx2 e3
++        @ Kernel bodies, all sharing one body_fn. How the upper half is selected is in the init macro - not visible here
++0:      edge_16b_e0     \body_fn, \pb
++10:     edge_16b_e1     \body_fn
++20:     edge_16b_e2     \body_fn, \pb
++30:     edge_16b_e3     \body_fn, \pb
++5:      edge_8bx2_e0    \body_fn, \pb
++15:     edge_8bx2_e1    \body_fn
++25:     edge_8bx2_e2    \body_fn, \pb
++35:     edge_8bx2_e3    \body_fn, \pb
++.endm
++
++.macro edge_8bx2_4bx4_bodies, body_fn, pb        @ 8-entry jump table: edge_8bx2 kernels first, then edge_4bx4 kernels
++        jent    0f                               @ entry 0 -> 8bx2 e0
++        jent    10f                              @ entry 1 -> 8bx2 e1
++        jent    20f                              @ entry 2 -> 8bx2 e2
++        jent    30f                              @ entry 3 -> 8bx2 e3
++        jent    5f                               @ entry 4 -> 4bx4 e0
++        jent    15f                              @ entry 5 -> 4bx4 e1
++        jent    25f                              @ entry 6 -> 4bx4 e2
++        jent    35f                              @ entry 7 -> 4bx4 e3
++        @ Kernel bodies, all sharing one body_fn. How the upper half is selected is in the init macro - not visible here
++0:      edge_8bx2_e0    \body_fn, \pb
++10:     edge_8bx2_e1    \body_fn
++20:     edge_8bx2_e2    \body_fn, \pb
++30:     edge_8bx2_e3    \body_fn, \pb
++5:      edge_4bx4_e0    \body_fn, \pb
++15:     edge_4bx4_e1    \body_fn
++25:     edge_4bx4_e2    \body_fn, \pb
++35:     edge_4bx4_e3    \body_fn, \pb
++.endm
++
++@ void ff_hevc_rpi_sao_edge_8_neon_8(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++function ff_hevc_rpi_sao_edge_8_neon_8, export=1
++        edge_16b_init   8, 0, 1, 99f             @ args presumably depth 8, luma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 8-entry jump table emitted by the bodies macro starts here
++        edge_8bx2_4bx4_bodies edge_16b_body_8, 1 @ 8bx2 kernels then 4bx4 kernels, pb = 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_16_neon_8(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++function ff_hevc_rpi_sao_edge_16_neon_8, export=1
++        edge_16b_init   8, 0, 0, 99f             @ args presumably depth 8, luma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 4-entry jump table emitted by the bodies macro starts here
++        edge_16b_bodies edge_16b_body_8, 1       @ 16b kernels, pb = 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_32_neon_8(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++function ff_hevc_rpi_sao_edge_32_neon_8, export=1
++        edge_64b_init   8, 0, 0, 99f             @ args presumably depth 8, luma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 4-entry jump table emitted by the bodies macro starts here
++        edge_32bx2_bodies edge_64b_body_8, 1     @ 32bx2 kernels, pb = 1
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_64_neon_8(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++function ff_hevc_rpi_sao_edge_64_neon_8, export=1
++        edge_64b_init   8, 0, 0, 99f             @ args presumably depth 8, luma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 4-entry jump table emitted by the bodies macro starts here
++        edge_64b_bodies edge_64b_body_8, 1       @ 64b kernels, pb = 1
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_8_neon_8(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]
++@   int eo,                           [sp, #4]
++@   int width,                        [sp, #8]
++@   int height)                       [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1
++        edge_16b_init   8, 1, 1, 99f             @ args presumably depth 8, chroma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 8-entry jump table emitted by the bodies macro starts here
++        edge_16b_8bx2_bodies edge_16b_body_8, 2  @ 16b kernels then 8bx2 kernels, pb = 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_16_neon_8(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]
++@   int eo,                           [sp, #4]
++@   int width,                        [sp, #8]
++@   int height)                       [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1
++        edge_64b_init   8, 1, 0, 99f             @ args presumably depth 8, chroma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 4-entry jump table emitted by the bodies macro starts here
++        edge_32bx2_bodies edge_64b_body_8, 2     @ 32bx2 kernels, pb = 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_32_neon_8(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]
++@   int eo,                           [sp, #4]
++@   int width,                        [sp, #8]
++@   int height)                       [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1
++        edge_64b_init   8, 1, 0, 99f             @ args presumably depth 8, chroma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 4-entry jump table emitted by the bodies macro starts here
++        edge_64b_bodies edge_64b_body_8, 2       @ 64b kernels, pb = 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_8_neon_10(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++function ff_hevc_rpi_sao_edge_8_neon_10, export=1
++        edge_16b_init   10, 0, 1, 99f            @ args presumably depth 10, luma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 8-entry jump table emitted by the bodies macro starts here
++        edge_16b_8bx2_bodies edge_16b_body_16, 2 @ 16b kernels then 8bx2 kernels, pb = 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_16_neon_10(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++function ff_hevc_rpi_sao_edge_16_neon_10, export=1
++        edge_64b_init   10, 0, 0, 99f            @ args presumably depth 10, luma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ the 4-entry jump table emitted by the bodies macro starts here
++        edge_32bx2_bodies edge_64b_body_16, 2    @ 32bx2 kernels, pb = 2
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_64_neon_10(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++@ We simply split the 64 case into 2 vertical stripes
++@ and call the fns for w32
++@
++@ Calling code will always have src != dst so we don't have to worry
++@ about edge effects
++
++function ff_hevc_rpi_sao_edge_64_neon_10, export=1
++        edge_64b_init   10, 0, 1, 99f, xjump=1   @ no 99: label in this function, so 99f binds to the table in the next function (edge_32_neon_10) - keep that function directly after this one
++endfunc
++
++@ void ff_hevc_rpi_sao_edge_32_neon_10(
++@   uint8_t *_dst,            [r0]
++@   uint8_t *_src,            [r1]
++@   int  stride_dst,          [r2]
++@   int16_t *_sao_offset_val, [r3]
++@   int eo,                   [sp, #0]
++@   int width,                [sp, #4]
++@   int height)               [sp, #8]
++
++function ff_hevc_rpi_sao_edge_32_neon_10, export=1
++        edge_64b_init   10, 0, 0, 99f            @ args presumably depth 10, luma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ 4-entry jump table; it is also the first 99: after edge_64_neon_10, whose 99f therefore lands here
++        edge_64b_bodies edge_64b_body_16, 2      @ 64b kernels, pb = 2
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_8_neon_10(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]
++@   int eo,                           [sp, #4]
++@   int width,                        [sp, #8]
++@   int height)                       [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1
++        edge_xxb_init   10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1   @ calls the generic init directly with both the 16b and 64b setups requested; positional args presumably depth 10, chroma, table label - TODO confirm
++99:                                              @ the 8-entry jump table emitted by the bodies macro starts here
++        edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4   @ 32bx2 kernels (64b body) then 16b kernels (16b body), pb = 4
++endfunc
++
++@ ff_hevc_rpi_sao_edge_c_32_neon_10(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]
++@   int eo,                           [sp, #4]
++@   int width,                        [sp, #8]
++@   int height)                       [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1
++        edge_64b_init   10, 1, 1, 99f, xjump=1   @ no 99: label in this function, so 99f binds to the table in the next function (edge_c_16_neon_10) - keep that function directly after this one
++endfunc
++
++
++@ ff_hevc_rpi_sao_edge_c_16_neon_10(
++@   uint8_t *_dst,                    [r0]
++@   const uint8_t *_src,              [r1]
++@   ptrdiff_t stride_dst,             [r2]
++@   const int16_t *_sao_offset_val_u, [r3]
++@   const int16_t *_sao_offset_val_v, [sp, #0]
++@   int eo,                           [sp, #4]
++@   int width,                        [sp, #8]
++@   int height)                       [sp, #12]
++
++function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1
++        edge_64b_init   10, 1, 0, 99f            @ args presumably depth 10, chroma, then a flag; table label 99 - init macro is outside this view, TODO confirm
++99:                                              @ 4-entry jump table; it is also the first 99: after edge_c_32_neon_10, whose 99f therefore lands here
++        edge_64b_bodies edge_64b_body_16, 4      @ 64b kernels, pb = 4
++endfunc
++
+diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h
+new file mode 100644
+index 0000000000..36a23a5bf9
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_arm.h
+@@ -0,0 +1,28 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
++#define AVCODEC_ARM_HEVCPRED_ARM_H
++
++#include "libavcodec/rpi_hevcpred.h"
++
++void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
++
++#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
++
+diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c
+new file mode 100644
+index 0000000000..80724d4cf3
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
+@@ -0,0 +1,35 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/cpu.h"
++#include "libavutil/arm/cpu.h"
++
++#include "libavcodec/rpi_hevcpred.h"
++#include "rpi_hevcpred_arm.h"
++
++av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
++{
++    int cpu_flags = av_get_cpu_flags();
++    /* ARM entry point: hand c to the NEON initialiser only when the CPU flags report NEON; otherwise c is left as it was. */
++    if (have_neon(cpu_flags))
++        ff_hevc_rpi_pred_init_neon(c, bit_depth);
++}
++
+diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c
+new file mode 100644
+index 0000000000..21e7700174
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
+@@ -0,0 +1,210 @@
++/*
++ * Copyright (c) 2018 John Cox (for Raspberry Pi)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcpred_arm.h"
++
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
++
++void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
++
++void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
++
++void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
++{
++    switch (bit_depth)  /* NEON function pointers are installed for 8 and 10 bit only; any other depth leaves c untouched */
++    {
++    case 8:  /* going by the assigned function names, array index n corresponds to block size 4 << n */
++        c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
++        c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8;  /* NOTE(review): intra_filter[2] is not set here, unlike case 10 - confirm intended */
++        c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16;  // Equivalent to c_4_neon_8
++        c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16;
++        c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16;
++        /* Angular prediction: plain sizes 4..32, _c sizes 4..16 */
++        c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
++        c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
++        c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
++        c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
++        c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
++        c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
++        c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
++        /* Horizontal prediction */
++        c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
++        c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
++        c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
++        c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
++        c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
++        c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
++        c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
++        /* Vertical prediction */
++        c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
++        c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
++        c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
++        c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
++        c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
++        c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
++        c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
++        /* Planar prediction */
++        c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
++        c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
++        c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
++        c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
++        c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
++        c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
++        c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
++        /* DC prediction */
++        c->pred_dc[0]   = ff_hevc_rpi_pred_dc_4_neon_8;
++        c->pred_dc[1]   = ff_hevc_rpi_pred_dc_8_neon_8;
++        c->pred_dc[2]   = ff_hevc_rpi_pred_dc_16_neon_8;
++        c->pred_dc[3]   = ff_hevc_rpi_pred_dc_32_neon_8;
++        c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
++        c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
++        c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
++        break;
++    case 10:  /* same layout as case 8, using the _10 predictors and the _16 / _32 intra filters */
++        c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
++        c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
++        c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
++        c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32;
++        c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
++        c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
++        /* Angular prediction: plain sizes 4..32, _c sizes 4..16 */
++        c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
++        c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
++        c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
++        c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
++        c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
++        c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
++        c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
++        /* Horizontal prediction */
++        c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
++        c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
++        c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
++        c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
++        c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
++        c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
++        c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
++        /* Vertical prediction */
++        c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
++        c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
++        c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
++        c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
++        c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
++        c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
++        c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
++        /* Planar prediction */
++        c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
++        c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
++        c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
++        c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
++        c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
++        c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
++        c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
++        /* DC prediction */
++        c->pred_dc[0]   = ff_hevc_rpi_pred_dc_4_neon_10;
++        c->pred_dc[1]   = ff_hevc_rpi_pred_dc_8_neon_10;
++        c->pred_dc[2]   = ff_hevc_rpi_pred_dc_16_neon_10;
++        c->pred_dc[3]   = ff_hevc_rpi_pred_dc_32_neon_10;
++        c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
++        c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
++        c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
++        break;
++    default:
++        break;
++    }
++}
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+new file mode 100644
+index 0000000000..fa8f67cf03
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
+@@ -0,0 +1,2984 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++/*
++ * General angular pred
++ *
++ * Horizontal (10) & Vertical (26) cases have their own file
++ * and are not dealt with properly here (luma filtering is missing)
++ *
++ * The inv_angle calculations are annoying - if it wasn't for the +128
++ * rounding step then the result would simply be the loop counter :-(
++ */
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++.text
++
++@ Horizontal Patch functions
++@ These need a transpose before store so exist as smaller patches
++@ Patches can be called repeatedly without any intermediate setup
++@ to generate a horizontal block
++@
++@ It is almost certainly the case that larger patch fns can be built
++@ and they would be a little faster, but we would still need the small
++@ fns and code size (or at least instruction cache size) is an issue
++@ given how much code we already have here
++
++@ Generate 8x8 luma patch (8-bit samples)
++@ Inputs
++@ r3   Out stride
++@ r4   Angle add
++@ r7   Inv angle (_up only)
++@ r1   Byte ptr indexed by (inv angle accumulator >> 8) (_up only)
++@ In/Out (updated)
++@ r0   Out pointer - on exit points to start of next patch horizontally (i.e. r0 + patch width)
++@ r2   Left ptr - updated
++@ r10  Inv angle accumulator (_up only)
++@ r12  32 - angle frac (_down) or angle frac (_up)
++@ d0   Older reference samples
++@ d1=r8+r9  Newer reference samples
++@ d2   32 - angle frac
++@ d3   Angle frac
++@ q2   Partially computed next result (_up only)
++@
++@ Temps
++@ r5   Loop counter
++@ r6   (read as the initial angle frac by patch_h_down_8x8_8, not by _continue)
++@ r7   (_down only)
++@ r11  (_up only)
++@ q2, q8-q11
++
++patch_h_down_8x8_8:
++        ldrd        r8, r9, [r2]        @ Left: first 8 reference bytes
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        vmov        d0, r8, r9
++        vdup.8      d3, r6
++        lsr         r8, #8
++        vdup.8      d2, r12
++        orr         r8, r8, r9, lsl #24 @ r8 = (r8 >> 8) | (r9 << 24)
++        ldr         r9, [r2, #5]!       @ refill r9 from r2 + 5, r2 written back
++        vmov        d1, r8, r9          @ d1 = reference window one byte on from d0
++        // drop through...
++patch_h_down_8x8_8_continue:
++        mov         r5, #8              @ 8 result vectors per patch
++1:
++          subs        r12, r4           @ flags set here drive every mi-conditional op in this pass
++        vmull.u8    q2, d0, d2          @ q2 = d0 * d2
++          it          mi
++          addmi       r12, #32          @ went negative: wrap back into range
++        vmlal.u8    q2, d1, d3          @ q2 += d1 * d3
++          rsb         r6, r12, #32
++        vext.8      q8, q8, q9, #8      @ d16-d23 form a result queue: d16 <- d17, d17 <- d18
++          itt         mi
++          lsrmi       r7, r8, #8
++          vmovmi      d0, r8, r9        @ (mi) previous d1 window becomes d0
++          vdup.8      d2, r12
++        vext.8      q9, q9, q10, #8     @ d18 <- d19, d19 <- d20
++          it          mi
++          orrmi       r8, r7, r9, lsl #24
++        vext.8      q10, q10, q11, #8   @ d20 <- d21, d21 <- d22
++          it          mi
++          ldrmi       r9, [r2, #1]!     @ (mi) advance r2 by one byte and refill r9
++        vmov        d22, d23
++        vrshrn.u16  d23, q2, #5         @ newest result: rounding shift right by 5, narrowed to bytes
++          it          mi
++          vmovmi      d1, r8, r9
++        subs        r5, #1
++          vdup.8      d3, r6
++        bne         1b
++        // drop through...
++store_tran_8x8_8:
++        vzip.8      d16, d17            @ vzip.8/.16/.32 sequence interleaves d16-d23 (transpose, going by the label name)
++        add         r6, r0, r3          @ r6 = r0 + stride
++        vzip.8      d18, d19
++        lsl         r3, #1              @ stride temporarily doubled
++        vzip.8      d20, d21
++        add         r5, r0, r3          @ r5 = r0 + 2 * stride
++        vzip.8      d22, d23
++        vzip.16     q8, q9
++        vzip.16     q10, q11
++        vzip.32     q8, q10
++        vzip.32     q9, q11
++        vst1.8      {d16}, [r0]!        @ first row, r0 += 8 for the next patch
++        vst1.8      {d17}, [r6], r3     @ r6 and r5 each step by 2 * stride
++        vst1.8      {d20}, [r5], r3
++        vst1.8      {d21}, [r6], r3
++        vst1.8      {d18}, [r5], r3
++        vst1.8      {d19}, [r6], r3
++        vst1.8      {d22}, [r5]
++        asr         r3, #1              @ restore the caller stride
++        vst1.8      {d23}, [r6]
++
++        bx          lr
++
++patch_h_up_8x8_8:                       @ first patch of a row band: loads refs, then falls into _continue
++        ldrd        r8, r9, [r2]        @ 8 reference bytes at the left ptr
++        rsb         r6, r4, #32         @ r6 = 32 - angle add
++        vmov        d0, r8, r9
++        vdup.8      d3, r4
++        lsr         r11, r8, #24
++        vdup.8      d2, r6
++        ldr         r8, [r2, #-1]!      @ r2 -= 1, r8 reloaded from the new r2
++        orr         r9, r11, r9, lsl #8 @ r9 = (old r8 >> 24) | (r9 << 8)
++        vmov        d1, r8, r9          @ d1 = reference window one byte before d0
++        mov         r12, r4             @ angle frac accumulator starts at one angle step
++        vmull.u8    q2, d0, d2
++        vmlal.u8    q2, d1, d3          @ q2 = first result, narrowed inside the loop
++patch_h_up_8x8_8_continue:              @ further patches to the right: state carried in registers
++        mov         r5, #8              @ 8 result vectors per patch
++1:
++          add         r12, r4
++          mov         r11, #0
++          cmp         r12, #33          @ C set when r12 >= 33
++          it          cs
++          addcs       r10, r7           @ (cs) step the inv angle accumulator
++        vext.8      q8, q8, q9, #8      @ d16-d23 form a result queue: d16 <- d17, d17 <- d18
++          itt         cs
++          subcs       r12, #32
++          tstcs       r10, #1<<31       @ (cs) Z = (r10 >= 0). NOTE(review): later cs/hi tests rely on C staying set here - confirm
++          rsb         r6, r12, #32
++          it          eq
++          asreq       r11, r10, #8      @ (eq) r11 = r10 >> 8, used as index from r1
++          it          cs
++          vmovcs      d0, r8, r9        @ (cs) previous d1 window becomes d0
++          vdup.8      d2, r6            @ uses r6 before it is reused as scratch below
++          it          cs
++          lsrcs       r6, r8, #24
++        vext.8      q9, q9, q10, #8
++          itt         cs
++          orrcs       r9, r6, r9, lsl #8
++          ldrbcs      r11, [r1, r11]    @ (cs) byte from r1 + r11 (r11 is 0 unless set by asreq)
++          vdup.8      d3, r12
++        vext.8      q10, q10, q11, #8
++          it          hi
++          ldrbhi      r11, [r2, #-1]!   @ (hi) replaces r11 with the byte before r2, r2 -= 1
++        vmov        d22, d23
++        vrshrn.u16  d23, q2, #5         @ newest result: rounding shift right by 5, narrowed to bytes
++          itt         cs
++          orrcs       r8, r11, r8, lsl #8 @ (cs) new byte enters the bottom of r8
++          vmovcs      d1, r8, r9
++          vmull.u8    q2, d0, d2        @ start the next result (see q2 in the register notes)
++        subs        r5, #1
++          vmlal.u8    q2, d1, d3
++        bne         1b
++
++        b           store_tran_8x8_8    @ tail call: store routine returns to our caller via lr
++
++
++.macro ADRT reg, val
++@ Load the address \val into \reg: adr in T32 has enough range but not in A32
++A       adrl        \reg, \val          @ A32 variant of this line
++T       adr         \reg, \val          @ T32 variant of this line
++.endm
++
++@ ff_hevc_rpi_pred_angular_4_neon_8 - 4x4 angular prediction, byte samples
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_8, export=1
++        ldr         r12, [sp]           @ r12 = mode (first stack argument)
++        push        {r4-r8, lr}
++        ADRT        r4, angle_2 - 2
++        ADRT        r7, inv_angle - 11*2
++        add         r7, r7, r12, lsl #1 @ r7 -> inv_angle halfword for this mode (base biased by 11 entries)
++        ldrsb       r6, [r4, r12]       @ r6 = signed byte from angle_2 for this mode (base biased by 2)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same table entry, kept as the per-step angle add
++        bge         26f                 @ mode >= 26
++        cmp         r12, #18
++        bge         18f                 @ 18 <= mode < 26
++        cmp         r12, #10
++        bge         10f                 @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left (mode < 10)
++        ldr         lr, [r2], #1        @ Left (4 bytes via r2), r2 += 1
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        vmov        s0, lr
++        vdup.8      d3, r6
++        ldr         lr, [r2], #1        @ next 4 bytes, one byte further on
++        vdup.8      d2, r12
++        vmov        s2, lr
++          subs        r12, r4           @ flags drive the mi-conditional ops up to the loop
++        vmull.u8    q2, d0, d2          @ q2 = d0 * d2
++          it          mi
++          addmi       r12, #32
++        vmlal.u8    q2, d1, d3          @ q2 += d1 * d3
++          rsb         r6, r12, #32
++          itt         mi
++          vmovmi      s0, lr
++          ldrmi       lr, [r2], #1
++          vdup.8      d2, r12
++          it          mi
++          vmovmi      s2, lr
++          vdup.8      d3, r6
++        mov         r5, #2              @ 2 loop passes, remaining results come from the code after the loop
++1:
++        vrshrn.u16  d20, q2, #5         @ newest result: rounding shift right by 5, narrowed to bytes
++            subs        r12, r4
++          vmull.u8    q2, d0, d2
++            it          mi
++            addmi       r12, #32
++          vmlal.u8    q2, d1, d3
++            rsb         r6, r12, #32
++        vext.64     q8, q8, q9, #1      @ result queue: d16 <- d17, d17 <- d18
++            it          mi
++            vmovmi      s0, lr
++        vext.64     q9, q9, q10, #1     @ d18 <- d19, d19 <- d20
++            it          mi
++            ldrmi       lr, [r2], #1
++            vdup.8      d2, r12
++            it          mi
++            vmovmi      s2, lr
++        subs        r5, #1
++            vdup.8      d3, r6
++        bne         1b
++
++          vrshrn.u16  d20, q2, #5
++            vmull.u8    q2, d0, d2
++        add         r12, r0,  r3        @ r12 = r0 + stride
++            vmlal.u8    q2, d1, d3
++        lsl         r3,  #1             @ stride doubled, not restored before return
++          vext.64     q8, q8, q9, #1
++          vext.64     q9, q9, q10, #1
++            vrshrn.u16  d20, q2, #5
++
++98:                                     @ shared store: one lane from each of d17-d20 per row
++        vst4.8      {d17[0], d18[0], d19[0], d20[0]}, [r0], r3
++        vst4.8      {d17[1], d18[1], d19[1], d20[1]}, [r12], r3
++        vst4.8      {d17[2], d18[2], d19[2], d20[2]}, [r0]
++        vst4.8      {d17[3], d18[3], d19[3], d20[3]}, [r12]
++        pop        {r4-r8, pc}
++
++@ Up of Horizontal - works down up (10 <= mode < 18)
++10:
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        rsb         r12, r6, #32
++        ldr         lr, [r2]            @ Left
++        ldrb        r2, [r2, #-1]       @ Top-left (byte before left), r2 now holds data not a pointer
++        vmov        s0, lr
++        vdup.8      d2, r12
++        vdup.8      d3, r6
++        orr         lr, r2, lr, lsl #8  @ lr = (lr << 8) | r2
++        vmov        s2, lr
++        sub         r8, r7, #128        @ r8 = inv angle accumulator, starts at inv angle - 128
++        mov         r5, #3              @ 3 loop passes, last result computed after the loop
++2:
++        vmull.u8    q2, d0, d2
++          subs        r12, r4           @ flags drive the mi-conditional ops in this pass
++        vmlal.u8    q2, d1, d3
++T         it          mi
++          addmi       r12, #32
++T         asr         r6, r8, #8        @ T32 path: index computed into r6 first
++T         it          mi
++T         ldrbmi      r2, [r1, r6]      @ T32 path: (mi) r2 = byte at top + (r8 >> 8)
++A         ldrbmi      r2, [r1, r8, asr #8] @ A32 path: same load with the shift in the address
++          rsb         r6, r12, #32
++          vdup.8      d2, r12
++          ittt        mi
++          vmovmi      s0, lr
++          orrmi       lr, r2, lr, lsl #8 @ (mi) new byte enters the bottom of lr
++          vmovmi      s2, lr
++        vrshrn.u16  d20, q2, #5
++          vdup.8      d3, r6
++          it          mi
++          addmi       r8, r7            @ (mi) step the inv angle accumulator
++        subs        r5, #1
++        vext.64     q8, q8, q9, #1
++        vext.64     q9, q9, q10, #1
++        bne         2b
++
++          vmull.u8    q2, d0, d2
++        add         r12, r0,  r3        @ r12 = r0 + stride
++          vmlal.u8    q2, d1, d3
++        lsl         r3,  #1             @ stride doubled for the store at 98
++          vrshrn.u16  d20, q2, #5
++        b           98b
++
++@ Left of vertical - works down left (18 <= mode < 26)
++18:
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        rsb         r12, r6, #32
++        ldr         lr, [r1]            @ Top
++        ldrb        r1, [r2, #-1]       @ Top-left (byte before left), r1 now holds data not a pointer
++        vmov        s0, lr
++        vdup.8      d2, r12
++        vdup.8      d3, r6
++        orr         lr, r1, lr, lsl #8  @ lr = (lr << 8) | r1
++        vmov        s2, lr
++        sub         r8, r7, #128        @ r8 = inv angle accumulator, starts at inv angle - 128
++        mov         r5, #3              @ 3 rows in the loop, 4th after it
++2:
++        vmull.u8    q2, d0, d2
++          subs        r12, r4           @ flags drive the mi-conditional ops in this pass
++        vmlal.u8    q2, d1, d3
++T         it          mi
++          addmi       r12, #32
++T         asr         r6, r8, #8        @ T32 path: index computed into r6 first
++T         it          mi
++T         ldrbmi      r1, [r2, r6]      @ T32 path: (mi) r1 = byte at left + (r8 >> 8)
++A         ldrbmi      r1, [r2, r8, asr #8] @ A32 path: same load with the shift in the address
++          rsb         r6, r12, #32
++          vdup.8      d2, r12
++          ittt        mi
++          vmovmi      s0, lr
++          orrmi       lr, r1, lr, lsl #8
++          vmovmi      s2, lr
++        vrshrn.u16  d4, q2, #5
++          vdup.8      d3, r6
++          it          mi
++          addmi       r8, r7            @ (mi) step the inv angle accumulator
++        subs        r5, #1
++        vst1.32     {d4[0]}, [r0], r3   @ store one 4-byte row, r0 += stride
++        bne         2b
++
++          vmull.u8    q2, d0, d2
++          vmlal.u8    q2, d1, d3
++          vrshrn.u16  d4, q2, #5
++          vst1.32     {d4[0]}, [r0]     @ last row
++
++        pop         {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused (mode >= 26)
++26:
++        ldr         lr, [r1], #1        @ Top
++        rsb         r12, r6, #32
++        vmov        s0, lr
++        vdup.8      d3, r6
++        ldr         lr, [r1], #1        @ next 4 bytes, one byte further on
++        vdup.8      d2, r12
++        vmov        s2, lr
++          subs        r12, r4           @ flags drive the mi-conditional ops up to the loop
++        vmull.u8    q2, d0, d2
++          it          mi
++          addmi       r12, #32
++        vmlal.u8    q2, d1, d3
++          rsb         r6, r12, #32
++          itt         mi
++          vmovmi      s0, lr
++          ldrmi       lr, [r1], #1
++          vdup.8      d2, r12
++          it          mi
++          vmovmi      s2, lr
++          vdup.8      d3, r6
++        mov         r5, #2              @ 2 rows stored in the loop, 2 more after it
++1:
++        vrshrn.u16  d6, q2, #5
++            subs        r12, r4
++          vmull.u8    q2, d0, d2
++            it          mi
++            addmi       r12, #32
++          vmlal.u8    q2, d1, d3
++            rsb         r6, r12, #32
++        vst1.32     {d6[0]}, [r0], r3   @ store one 4-byte row, r0 += stride
++            itt         mi
++            vmovmi      s0, lr
++            ldrmi       lr, [r1], #1
++            vdup.8      d2, r12
++            it          mi
++            vmovmi      s2, lr
++        subs        r5, #1
++            vdup.8      d3, r6
++        bne         1b
++
++          vrshrn.u16  d6, q2, #5
++            vmull.u8    q2, d0, d2
++            vmlal.u8    q2, d1, d3
++          vst1.32     {d6[0]}, [r0], r3
++            vrshrn.u16  d6, q2, #5
++            vst1.32     {d6[0]}, [r0]   @ last row
++
++        pop         {r4-r8, pc}
++
++endfunc
++
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_8 - 8x8 angular prediction, byte samples
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_8, export=1
++        ldr         r12, [sp]           @ r12 = mode (first stack argument)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2
++        ADRT        r7, inv_angle - 11*2
++        add         r7, r7, r12, lsl #1 @ r7 -> inv_angle halfword for this mode (base biased by 11 entries)
++        ldrsb       r6, [r4, r12]       @ r6 = signed byte from angle_2 for this mode (base biased by 2)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same table entry, kept as the per-step angle add
++        bge         26f                 @ mode >= 26
++        cmp         r12, #18
++        bge         18f                 @ 18 <= mode < 26
++        cmp         r12, #10
++        bge         10f                 @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left (mode < 10)
++        bl          patch_h_down_8x8_8  @ one 8x8 patch is the whole block
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up (10 <= mode < 18)
++10:
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        mov         r10, #-128          @ inv angle accumulator start value
++        bl          patch_h_up_8x8_8
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left (18 <= mode < 26)
++18:
++        ldrd        r8, r9, [r1]        @ Top
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        ldrb        lr, [r2, #-1]       @ Top-left
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        vmov        d0, r8, r9
++        lsl         r9, r9, #8
++        vdup.8      d2, r12
++        orr         r9, r9, r8, lsr #24 @ r9 = (r9 << 8) | (r8 >> 24)
++        orr         r8, lr, r8, lsl #8  @ r8 = (r8 << 8) | top-left byte
++        vmov        d1, r8, r9
++        sub         r1, r7, #128        @ r1 reused as inv angle accumulator (top already loaded)
++        mov         r5, #7              @ 7 rows in the loop, 8th after it
++1:
++        vdup.8      d3, r6
++        vmull.u8    q2, d0, d2
++          subs        r12, r12, r4      @ flags drive the mi-conditional ops in this pass
++        vmlal.u8    q2, d1, d3
++          ittt        mi
++          addmi       lr, r2, r1, asr #8 @ (mi) lr = left + (accumulator >> 8)
++          addmi       r12, r12, #32
++          vmovmi      d0, r8, r9        @ (mi) previous d1 window becomes d0
++          rsb         r6, r12, #32
++          itt         mi
++          lslmi       r9, r9, #8
++          ldrbmi      lr, [lr]          @ (mi) fetch the next reference byte
++          vdup.8      d2, r12
++        vrshrn.u16  d4, q2, #5          @ row result: rounding shift right by 5, narrowed to bytes
++          itttt       mi
++          orrmi       r9, r9, r8, lsr #24
++          orrmi       r8, lr, r8, lsl #8
++          vmovmi      d1, r8, r9
++          addmi       r1, r1, r7        @ (mi) step the inv angle accumulator
++        subs        r5, r5, #1
++        vst1.8      {d4}, [r0], r3      @ store one 8-byte row, r0 += stride
++        bne         1b
++
++          vdup.8      d3, r6
++          vmull.u8    q2, d0, d2
++          vmlal.u8    q2, d1, d3
++          vrshrn.u16  d4, q2, #5
++          vst1.8      {d4}, [r0]        @ last row
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused (mode >= 26)
++26:
++        ldrd        r8, r9, [r1]        @ Top
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        vmov        d0, r8, r9
++        vdup.8      d3, r6
++        mov         r5, #7              @ 7 rows in the loop, 8th after it
++        lsr         r8, #8
++        vdup.8      d2, r12
++        orr         r8, r8, r9, lsl #24 @ r8 = (r8 >> 8) | (r9 << 24)
++        ldr         r9, [r1, #5]!       @ refill r9 from r1 + 5, r1 written back
++        vmov        d1, r8, r9
++1:
++        vmull.u8    q2, d0, d2
++          subs        r12, r4           @ flags drive the mi-conditional ops in this pass
++        vmlal.u8    q2, d1, d3
++          it          mi
++          addmi       r12, #32
++          rsb         r6, r12, #32
++          itt         mi
++          vmovmi      d0, r8, r9        @ (mi) previous d1 window becomes d0
++          lsrmi       r8, #8
++          vdup.8      d2, r12
++          itt         mi
++          orrmi       r8, r8, r9, lsl #24
++          ldrmi       r9, [r1, #1]!     @ (mi) advance r1 by one byte and refill r9
++        vrshrn.u16  d6, q2, #5          @ row result: rounding shift right by 5, narrowed to bytes
++          it          mi
++          vmovmi      d1, r8, r9
++          vdup.8      d3, r6
++        subs        r5, #1
++        vst1.8      {d6}, [r0], r3      @ store one 8-byte row, r0 += stride
++        bne         1b
++
++          vmull.u8    q2, d0, d2
++          vmlal.u8    q2, d1, d3
++          vrshrn.u16  d6, q2, #5
++          vst1.8      {d6}, [r0]        @ last row
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_8 - 16x16 angular prediction, byte samples
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_8, export=1
++        ldr         r12, [sp]           @ r12 = mode (first stack argument)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2
++        ADRT        r7, inv_angle - 11*2
++        add         r7, r7, r12, lsl #1 @ r7 -> inv_angle halfword for this mode (base biased by 11 entries)
++        ldrsb       r6, [r4, r12]       @ r6 = signed byte from angle_2 for this mode (base biased by 2)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same table entry, kept as the per-step angle add
++        bge         26f                 @ mode >= 26
++        cmp         r12, #18
++        bge         18f                 @ 18 <= mode < 26
++        cmp         r12, #10
++        bge         10f                 @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left (mode < 10)
++        mov         r1,  r2             @ save r2 - r1 unused by patch_down
++
++        bl          patch_h_down_8x8_8  @ upper band: two 8x8 patches side by side
++        bl          patch_h_down_8x8_8_continue
++
++        add         r2, r1, #8          @ restore r2, but 8 rows further down left
++        sub         r0, #16             @ undo the two 8-byte r0 advances made by the patch stores
++        mov         r6, r4              @ r6 = initial angle frac again for the fresh patch entry
++        add         r0, r0, r3, lsl #3  @ r0 += 8 * stride
++
++        bl          patch_h_down_8x8_8  @ lower band
++        bl          patch_h_down_8x8_8_continue
++
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up (10 <= mode < 18)
++10:
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        mov         r10, #-128          @ inv angle accumulator start value
++
++        push        {r2}                @ patch code moves r2, keep the original left ptr
++        bl          patch_h_up_8x8_8    @ upper band: two 8x8 patches side by side
++        bl          patch_h_up_8x8_8_continue
++        pop         {r2}
++
++        sub         r0, #16             @ undo the two 8-byte r0 advances made by the patch stores
++        mov         r10, #-128
++        add         r2, #8              @ left ptr 8 bytes further on
++        add         r0, r0, r3, lsl #3  @ r0 += 8 * stride
++        sub         r10, r10, r7, lsl #3 @ r10 = -128 - 8 * inv angle for the lower band
++
++        bl          patch_h_up_8x8_8    @ lower band
++        bl          patch_h_up_8x8_8_continue
++
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left (18 <= mode < 26)
++18:
++        vld1.8      {q9}, [r1]          @ q9 = 16 bytes of top
++        sub         r1, r2, #1          @ r1 = left - 1, r1 now walks extra reference bytes
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        vdup.8      d6, r6
++        vext.8      q8, q9, q9, #15     @ q8 = q9 moved up one lane (lane 0 replaced just below)
++        sub         r8, r7, #128        @ r8 = inv angle accumulator, starts at inv angle - 128
++        vld1.8      {d16[0]}, [r1]      @ lane 0 of q8 = byte at left - 1
++        vdup.8      d7, r12
++        mov         r5, #15             @ 15 rows in loops 1 and 2, last row at 3 or 5
++1:                                      @ loop 1: rows from q9 (weight d7) and q8 (weight d6), next pair built in q11 / q10
++        vmull.u8    q0, d18, d7
++        subs        r12, r4             @ C clear (borrow) marks a step to the next reference pair
++        vmlal.u8    q0, d16, d6
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q1, d19, d7
++        it          cc
++        addcc       r1, r2, r8, asr #8  @ (cc) r1 = left + (accumulator >> 8)
++        vmlal.u8    q1, d17, d6
++        rsb         r6, r12, #32
++        vext.8      q10, q8, q8, #15    @ q10 = q8 moved up one lane
++        sub         r5, #1
++        vld1.8      {d20[0]}, [r1]      @ lane 0 of q10 = byte at r1
++        it          cc
++        addcc       r8, r7              @ (cc) step the inv angle accumulator
++        vmov        q11, q8
++        teq         r5, #0              @ Z = (r5 == 0). NOTE(review): relies on teq leaving C from subs intact - confirm
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3      @ store one 16-byte row, r0 += stride
++        bhi         1b                  @ no borrow and rows remain: same reference pair
++        beq         4f                  @ r5 == 0: go emit the last row
++2:                                      @ loop 2: same as loop 1 with the register pairs swapped (q11 / q10 current)
++        vmull.u8    q0, d22, d7
++        subs        r12, r4
++        vmlal.u8    q0, d20, d6
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q1, d23, d7
++        it          cc
++        addcc       r1, r2, r8, asr #8
++        vmlal.u8    q1, d21, d6
++        rsb         r6, r12, #32
++        vext.8      q8, q10, q10, #15
++        sub         r5, #1
++        vld1.8      {d16[0]}, [r1]
++        it          cc
++        addcc       r8, r7
++        vmov        q9, q10
++        teq         r5, #0
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3
++        bhi         2b                  @ no borrow and rows remain: stay in loop 2
++        bne         1b                  @ borrow and rows remain: back to loop 1 registers
++        bcc         5f                  @ last row after a borrow: use q9 / q8
++3:                                      @ last row from q11 / q10
++        vmull.u8    q0, d22, d7
++        vmlal.u8    q0, d20, d6
++        vmull.u8    q1, d23, d7
++        vmlal.u8    q1, d21, d6
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++4:
++        bcc         3b                  @ loop 1 ended on a borrow: last row uses q11 / q10
++5:                                      @ last row from q9 / q8
++        vmull.u8    q0, d18, d7
++        vmlal.u8    q0, d16, d6
++        vmull.u8    q1, d19, d7
++        vmlal.u8    q1, d17, d6
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused (mode >= 26)
++26:
++        vld1.8      {q9}, [r1]!         @ q9 = 16 bytes of top, r1 += 16
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vext.8      q8, q9, q9, #1      @ q8 = q9 moved down one lane (top lane replaced just below)
++        vld1.8      {d17[7]}, [r1]!     @ top lane of q8 = next byte of top, r1 += 1
++        mov         r5, #15             @ 15 rows in loops 1 and 2, last row at 3 or 5
++1:                                      @ loop 1: rows from q8 (weight d6) and q9 (weight d7), next pair built in q10 / q11
++        vmull.u8    q0, d16, d6
++        subs        r12, r4             @ C clear (borrow) marks a step to the next reference pair
++        vmlal.u8    q0, d18, d7
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q1, d17, d6
++        rsb         r6, r12, #32
++        vmlal.u8    q1, d19, d7
++        sub         r5, #1
++        vext.8      q10, q8, q8, #1     @ q10 = q8 moved down one lane
++        teq         r5, #0              @ Z = (r5 == 0). NOTE(review): relies on teq leaving C from subs intact - confirm
++        vld1.8      {d21[7]}, [r1]      @ top lane of q10 = byte at r1
++        it          cc
++        addcc       r1, #1              @ (cc) consume that byte
++        vmov        q11, q8
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3      @ store one 16-byte row, r0 += stride
++        bhi         1b                  @ no borrow and rows remain: same reference pair
++        beq         4f                  @ r5 == 0: go emit the last row
++2:                                      @ loop 2: same as loop 1 with the register pairs swapped (q10 / q11 current)
++        vmull.u8    q0, d20, d6
++        subs        r12, r4
++        vmlal.u8    q0, d22, d7
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q1, d21, d6
++        rsb         r6, r12, #32
++        vmlal.u8    q1, d23, d7
++        sub         r5, #1
++        vext.8      q8, q10, q10, #1
++        teq         r5, #0
++        vld1.8      {d17[7]}, [r1]
++        it          cc
++        addcc       r1, #1
++        vmov        q9, q10
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3
++        bhi         2b                  @ no borrow and rows remain: stay in loop 2
++        bne         1b                  @ borrow and rows remain: back to loop 1 registers
++        bcc         5f                  @ last row after a borrow: use q8 / q9
++3:                                      @ last row from q10 / q11
++        vmull.u8    q0, d20, d6
++        vmlal.u8    q0, d22, d7
++        vmull.u8    q1, d21, d6
++        vmlal.u8    q1, d23, d7
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++4:
++        bcc         3b                  @ loop 1 ended on a borrow: last row uses q10 / q11
++5:                                      @ last row from q8 / q9
++        vmull.u8    q0, d16, d6
++        vmlal.u8    q0, d18, d7
++        vmull.u8    q1, d17, d6
++        vmlal.u8    q1, d19, d7
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_8 - 32x32 angular prediction, byte samples
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_32_neon_8, export=1
++        ldr         r12, [sp]           @ r12 = mode (first stack argument)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2
++        ADRT        r7, inv_angle - 11*2
++        add         r7, r7, r12, lsl #1 @ r7 -> inv_angle halfword for this mode (base biased by 11 entries)
++        ldrsb       r6, [r4, r12]       @ r6 = signed byte from angle_2 for this mode (base biased by 2)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same table entry, kept as the per-step angle add
++        bge         26f                 @ mode >= 26
++        cmp         r12, #18
++        bge         18f                 @ 18 <= mode < 26
++        cmp         r12, #10
++        bge         10f                 @ 10 <= mode < 18
++
++@ Down of Horizontal - works down left (mode < 10)
++        mov         r10, #4             @ 4 bands of 8 rows
++        mov         r1, r2              @ r1 keeps the band start of the left ptr
++1:
++        bl          patch_h_down_8x8_8  @ four 8x8 patches side by side
++        bl          patch_h_down_8x8_8_continue
++        bl          patch_h_down_8x8_8_continue
++        bl          patch_h_down_8x8_8_continue
++
++        add         r2, r1, #8          @ restore r2, but 8 rows further down left
++        add         r1, r1, #8
++        mov         r6, r4              @ r6 = initial angle frac again for the fresh patch entry
++        sub         r0, #32             @ undo the four 8-byte r0 advances made by the patch stores
++        subs        r10, #1
++        add         r0, r0, r3, lsl #3  @ r0 += 8 * stride
++        bne         1b
++
++        pop        {r4-r11, pc}
++
++@ Up of Horizontal - works down up (10 <= mode < 18)
++10:
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        mov         r10, #-128          @ inv angle accumulator start value
++        vmov.i8     d6, #1<<2           @ band counter kept in d6: bytes go 4, 2, 1, 0
++1:
++        push        {r2,r10}            @ patch code moves both, keep the band start values
++        bl          patch_h_up_8x8_8    @ four 8x8 patches side by side
++        bl          patch_h_up_8x8_8_continue
++        bl          patch_h_up_8x8_8_continue
++        bl          patch_h_up_8x8_8_continue
++        pop         {r2,r10}
++
++        vmov        r8, s12             @ r8 = low word of d6 before the shift below
++        sub         r0, #32             @ undo the four 8-byte r0 advances made by the patch stores
++        add         r2, #8              @ left ptr 8 bytes further on
++        add         r0, r0, r3, lsl #3  @ r0 += 8 * stride
++        sub         r10, r10, r7, lsl #3 @ accumulator start for the next band: 8 * inv angle lower
++        vshr.u8     d6, #1
++        teq         r8, #0              @ zero on the 4th pass, so 4 bands in total
++        bne         1b
++
++        pop        {r4-r11, pc}
++
++@ Left of vertical - works down left (18 <= mode < 26)
++18:
++        vld1.8      {q0-q1}, [r1]       @ q0-q1 = 32 bytes of top
++        sub         r9, r2, #1          @ r9 = left - 1
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        ldrh        r7, [r7]            @ r7 = inv angle for this mode
++        mov         r8, #-128           @ r8 = inv angle accumulator
++        vdup.8      d18, r6
++        vdup.8      d19, r12
++        mov         r5, #32             @ all 32 rows are stored inside loop 2
++1:                                      @ step to the next reference pair
++        vld1.8      {d17[7]}, [r9]      @ top lane of q8 = byte at r9
++        add         r8, r7              @ step the inv angle accumulator
++        vmov        q2, q0              @ q2-q3 = previous q0-q1
++        vmov        q3, q1
++        add         r9, r2, r8, asr #8  @ r9 = left + (accumulator >> 8) for the following step
++        vext.8      q1, q0, q1, #15     @ q0-q1 moved up one lane, bottom lane from q8
++        vext.8      q0, q8, q0, #15
++2:                                      @ one row per pass: q2-q3 weighted by d19, q0-q1 by d18
++        vmull.u8    q10, d4, d19
++        subs        r12, r4             @ C clear (borrow) marks a step to the next reference pair
++        vmlal.u8    q10, d0, d18
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q11, d5, d19
++        rsb         r6, r12, #32
++        vmlal.u8    q11, d1, d18
++        sub         r5, #1
++        vmull.u8    q12, d6, d19
++        teq         r5, #0              @ Z = (r5 == 0). NOTE(review): relies on teq leaving C from subs intact - confirm
++        vmlal.u8    q12, d2, d18
++        vmull.u8    q13, d7, d19
++        vmlal.u8    q13, d3, d18
++        vdup.8      d18, r6
++        vdup.8      d19, r12
++        vrshrn.u16  d20, q10, #5
++        vrshrn.u16  d21, q11, #5
++        vrshrn.u16  d22, q12, #5
++        vrshrn.u16  d23, q13, #5
++        vst1.8      {q10-q11}, [r0], r3 @ store one 32-byte row, r0 += stride
++        bhi         2b                  @ no borrow and rows remain: same reference pair
++        bne         1b                  @ borrow and rows remain: step references
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused (mode >= 26)
++26:
++        add         r5, r1, #32         @ r5 = address of the byte after the first 32 of top
++        vld1.8      {q0-q1}, [r1]!      @ q0-q1 = 32 bytes of top, r1 += 32
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac
++        vld1.8      {d16[0]}, [r5]      @ lane 0 of q8 = that following byte
++        mov         r5, #32             @ all 32 rows are stored inside loop 2
++        vdup.8      d18, r6
++        vdup.8      d19, r12
++1:                                      @ step to the next reference pair
++        vmov        q2, q0              @ q2-q3 = previous q0-q1
++        add         r1, #1
++        vmov        q3, q1
++        vext.8      q0, q0, q1, #1      @ q0-q1 moved down one lane, top lane from q8
++        vext.8      q1, q1, q8, #1
++2:                                      @ one row per pass: q0-q1 weighted by d18, q2-q3 by d19
++        vmull.u8    q10, d0, d18
++        subs        r12, r4             @ C clear (borrow) marks a step to the next reference pair
++        vmlal.u8    q10, d4, d19
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q11, d1, d18
++        rsb         r6, r12, #32
++        vmlal.u8    q11, d5, d19
++        sub         r5, #1
++        vmull.u8    q12, d2, d18
++        teq         r5, #0              @ Z = (r5 == 0). NOTE(review): relies on teq leaving C from subs intact - confirm
++        vmlal.u8    q12, d6, d19
++        vmull.u8    q13, d3, d18
++        vmlal.u8    q13, d7, d19
++        vld1.8      {d16[0]}, [r1]      @ reload lane 0 of q8 from r1 on every row
++        vdup.8      d18, r6
++        vdup.8      d19, r12
++        vrshrn.u16  d20, q10, #5
++        vrshrn.u16  d21, q11, #5
++        vrshrn.u16  d22, q12, #5
++        vrshrn.u16  d23, q13, #5
++        vst1.8      {q10-q11}, [r0], r3 @ store one 32-byte row, r0 += stride
++        bhi         2b                  @ no borrow and rows remain: same reference pair
++        bne         1b                  @ borrow and rows remain: step references
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ Chroma 8 bit 4x4 patch fns
++        .text
++
++patch_h_down_c_4x4_8:                   @ 4x4 patch of 16-bit elements (presumably chroma pairs - confirm), down variant
++        ldrd        r8, r9, [r2]        @ Left
++        rsb         r12, r6, #32        @ r12 = 32 - angle frac (r6 read on entry)
++        vmov        d0, r8, r9
++        vdup.8      d3, r6
++        lsr         r8, #16
++        vdup.8      d2, r12
++        orr         r8, r8, r9, lsl #16 @ r8 = (r8 >> 16) | (r9 << 16)
++        ldr         r9, [r2, #6]!       @ refill r9 from r2 + 6, r2 written back
++        vmov        d1, r8, r9          @ d1 = reference window two bytes on from d0
++        // drop through...
++patch_h_down_c_4x4_8_continue:          @ further patches to the right: state carried in registers
++        mov         r5, #4              @ 4 result vectors per patch
++1:
++          subs        r12, r4           @ flags set here drive every mi-conditional op in this pass
++        vmull.u8    q2, d0, d2          @ q2 = d0 * d2
++          it          mi
++          addmi       r12, #32
++        vmlal.u8    q2, d1, d3          @ q2 += d1 * d3
++          rsb         r6, r12, #32
++        vext.8      q8, q8, q9, #8      @ d16-d19 form a result queue: d16 <- d17, d17 <- d18
++          it          mi
++          lsrmi       r7, r8, #16
++        vmov        d18, d19
++          it          mi
++          vmovmi      d0, r8, r9        @ (mi) previous d1 window becomes d0
++          vdup.8      d2, r12
++          it          mi
++          orrmi       r8, r7, r9, lsl #16
++        vrshrn.u16  d19, q2, #5         @ newest result: rounding shift right by 5, narrowed to bytes
++          itt         mi
++          ldrmi       r9, [r2, #2]!     @ (mi) advance r2 by two bytes and refill r9
++          vmovmi      d1, r8, r9
++        subs        r5, #1
++          vdup.8      d3, r6
++        bne         1b
++        // drop through...
++store_tran_c_4x4_8:
++        vzip.16     d16, d17            @ vzip.16/.32 sequence interleaves d16-d19 (transpose, going by the label name)
++        add         r6, r0, r3          @ r6 = r0 + stride
++        vzip.16     d18, d19
++        lsl         r3, #1              @ stride temporarily doubled
++        vzip.32     q8, q9
++        add         r5, r0, r3          @ r5 = r0 + 2 * stride
++        vst1.16     {d16}, [r0]!        @ first row, r0 += 8 for the next patch
++        vst1.16     {d17}, [r6], r3
++        vst1.16     {d18}, [r5]
++        asr         r3, #1              @ restore the caller stride
++        vst1.16     {d19}, [r6]
++
++        bx          lr
++
++patch_h_up_c_4x4_8:                     @ 4x4 patch of 16-bit elements (presumably chroma pairs - confirm), up variant
++        ldrd        r8, r9, [r2]        @ 8 reference bytes at the left ptr
++        rsb         r6, r4, #32         @ r6 = 32 - angle add
++        vmov        d0, r8, r9
++        vdup.8      d3, r4
++        lsr         r11, r8, #16
++        vdup.8      d2, r6
++        ldr         r8, [r2, #-2]!      @ r2 -= 2, r8 reloaded from the new r2
++        orr         r9, r11, r9, lsl #16 @ r9 = (old r8 >> 16) | (r9 << 16)
++        vmov        d1, r8, r9          @ d1 = reference window two bytes before d0
++        mov         r12, r4             @ angle frac accumulator starts at one angle step
++        vmull.u8    q2, d0, d2
++        vmlal.u8    q2, d1, d3          @ q2 = first result, narrowed inside the loop
++patch_h_up_c_4x4_8_continue:            @ further patches to the right: state carried in registers
++        mov         r5, #4              @ 4 result vectors per patch
++1:
++          add         r12, r4
++          cmp         r12, #33          @ C set when r12 >= 33
++          it          cs
++          addcs       r10, r7           @ (cs) step the inv angle accumulator
++          mov         r11, #0
++          itt         cs
++          subcs       r12, #32
++          tstcs       r10, #1<<31       @ (cs) Z = (r10 >= 0). NOTE(review): later cs/hi tests rely on C staying set here - confirm
++          rsb         r6, r12, #32
++          it          eq
++          asreq       r11, r10, #7      @ (eq) r11 = r10 >> 7 ...
++          it          cs
++          vmovcs      d0, r8, r9        @ (cs) previous d1 window becomes d0
++          it          eq
++          biceq       r11, #1           @ ... with bit 0 cleared: an even byte offset from r1
++          vdup.8      d2, r6            @ uses r6 before it is reused as scratch below
++          it          cs
++          lsrcs       r6, r8, #16
++          vdup.8      d3, r12
++        vext.8      q8, q8, q9, #8      @ d16-d19 form a result queue: d16 <- d17, d17 <- d18
++          itt         cs
++          orrcs       r9, r6, r9, lsl #16
++          ldrhcs      r11, [r1, r11]    @ (cs) halfword from r1 + r11 (r11 is 0 unless set above)
++        vmov        d18, d19
++          it          hi
++          ldrhhi      r11, [r2, #-2]!   @ (hi) replaces r11 with the halfword before r2, r2 -= 2
++        vrshrn.u16  d19, q2, #5         @ newest result: rounding shift right by 5, narrowed to bytes
++          itt         cs
++          orrcs       r8, r11, r8, lsl #16 @ (cs) new halfword enters the bottom of r8
++          vmovcs      d1, r8, r9
++          vmull.u8    q2, d0, d2        @ start the next result
++        subs        r5, #1
++          vmlal.u8    q2, d1, d3
++        bne         1b
++
++        b           store_tran_c_4x4_8  @ tail call: store routine returns to our caller via lr
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_8 - angular intra prediction, 4 rows of 4 two-byte pels (presumably interleaved chroma)
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]      doubled on entry
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
++        ldr         r12, [sp]           @ r12 = mode (fifth argument, on the stack)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2     @ table base biased so that it is indexed directly by mode
++        ADRT        r7, inv_angle - 11*2 @ biased so that mode 11 selects the first entry
++        add         r7, r7, r12, lsl #1 @ r7 = address of the 16-bit inv_angle entry for this mode
++        lsl         r3, #1              @ stride *= 2
++        ldrsb       r6, [r4, r12]       @ r6 = angle for this mode (running weight)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same angle (per-row step); the load leaves the cmp flags intact
++        bge         26f                 @ modes 26..34
++        cmp         r12, #18
++        bge         18f                 @ modes 18..25
++        cmp         r12, #10
++        bge         10f                 @ modes 10..17
++
++@ Down of Horizontal - works down left
++        bl          patch_h_down_c_4x4_8 @ modes 2..9; helper is defined outside this section
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r10, #-128          @ initial value of the inverse-angle accumulator used by the helper
++        bl          patch_h_up_c_4x4_8
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++        ldrd        r8, r9, [r1]        @ Top
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        ldrh        lr, [r2, #-2]       @ Top-left
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        vmov        d0, r8, r9          @ d0 = top pels 0..3
++        lsl         r9, r9, #16
++        vdup.8      d2, r12             @ d2 = weight applied to d0
++        orr         r9, r9, r8, lsr #16
++        orr         r8, lr, r8, lsl #16 @ r8:r9 = top-left followed by top pels 0..2
++        vmov        d1, r8, r9          @ d1 = reference moved along by one pel
++        sub         r1, r7, #128        @ r1 reused as the inverse-angle accumulator
++        mov         r5, #3              @ three rows in the loop, the fourth after it
++1:
++        vdup.8      d3, r6              @ d3 = weight applied to d1
++        vmull.u8    q2, d0, d2
++          subs        r12, r12, r4      @ advance the weight; negative means move on by one reference pel
++        vmlal.u8    q2, d1, d3          @ q2 = d0 * d2 + d1 * d3
++          itttt       mi
++          addmi       lr, r2, r1, asr #7
++          bicmi       lr, #1            @ lr = address of the 16-bit left pel selected by the accumulator
++          addmi       r12, r12, #32     @ wrap the weight back into range
++          vmovmi      d0, r8, r9        @ the old shifted reference becomes the current one
++          rsb         r6, r12, #32      @ keep r6 + r12 == 32
++          itt         mi
++          lslmi       r9, r9, #16
++          ldrhmi      lr, [lr]          @ fetch the new left pel
++          vdup.8      d2, r12
++        vrshrn.u16  d4, q2, #5          @ rounding shift right by 5 and narrow to bytes
++          itttt       mi
++          orrmi       r9, r9, r8, lsr #16
++          orrmi       r8, lr, r8, lsl #16 @ shift the reference by one pel, inserting the new left pel
++          vmovmi      d1, r8, r9
++          addmi       r1, r1, r7        @ accumulator += inverse angle
++        subs        r5, r5, #1
++        vst1.16     {d4}, [r0], r3      @ store one row of 8 bytes and step to the next row
++        bne         1b
++
++          vdup.8      d3, r6
++          vmull.u8    q2, d0, d2        @ final row, no further reference update needed
++          vmlal.u8    q2, d1, d3
++          vrshrn.u16  d4, q2, #5
++          vst1.16     {d4}, [r0]
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        ldrd        r8, r9, [r1]        @ Top
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        vmov        d0, r8, r9          @ d0 = top pels 0..3
++        vdup.8      d3, r6              @ d3 = weight applied to d1
++        mov         r5, #3              @ three rows in the loop, the fourth after it
++        lsr         r8, #16
++        vdup.8      d2, r12             @ d2 = weight applied to d0
++        orr         r8, r8, r9, lsl #16
++        ldr         r9, [r1, #6]!       @ r9 = top pels 3..4; r1 is advanced by 6 bytes
++        vmov        d1, r8, r9          @ d1 = top pels 1..4
++1:
++        vmull.u8    q2, d0, d2
++          subs        r12, r4           @ advance the weight; negative means move on by one reference pel
++        vmlal.u8    q2, d1, d3          @ q2 = d0 * d2 + d1 * d3
++          it          mi
++          addmi       r12, #32          @ wrap the weight back into range
++          rsb         r6, r12, #32      @ keep r6 + r12 == 32
++          itt         mi
++          vmovmi      d0, r8, r9        @ the old shifted reference becomes the current one
++          lsrmi       r8, #16
++          vdup.8      d2, r12
++          itt         mi
++          orrmi       r8, r8, r9, lsl #16
++          ldrmi       r9, [r1, #2]!     @ step one pel further along top
++        vrshrn.u16  d6, q2, #5          @ rounding shift right by 5 and narrow to bytes
++          it          mi
++          vmovmi      d1, r8, r9
++          vdup.8      d3, r6
++        subs        r5, #1
++        vst1.16     {d6}, [r0], r3      @ store one row of 8 bytes and step to the next row
++        bne         1b
++
++          vmull.u8    q2, d0, d2        @ final row, no further reference update needed
++          vmlal.u8    q2, d1, d3
++          vrshrn.u16  d6, q2, #5
++          vst1.16     {d6}, [r0]
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_8 - angular intra prediction, 8 rows of 8 two-byte pels (presumably interleaved chroma)
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]      doubled on entry
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
++        ldr         r12, [sp]           @ r12 = mode (fifth argument, on the stack)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2     @ table base biased so that it is indexed directly by mode
++        ADRT        r7, inv_angle - 11*2 @ biased so that mode 11 selects the first entry
++        add         r7, r7, r12, lsl #1 @ r7 = address of the 16-bit inv_angle entry for this mode
++        lsl         r3, #1              @ stride *= 2
++        ldrsb       r6, [r4, r12]       @ r6 = angle for this mode (running weight)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same angle (per-row step); the load leaves the cmp flags intact
++        bge         26f                 @ modes 26..34
++        cmp         r12, #18
++        bge         18f                 @ modes 18..25
++        cmp         r12, #10
++        bge         10f                 @ modes 10..17
++
++@ Down of Horizontal - works down left
++        mov         r1,  r2             @ save r2 - r1 unused by patch_down
++
++        bl          patch_h_down_c_4x4_8 @ upper-left 4x4 patch (helper defined outside this section)
++        bl          patch_h_down_c_4x4_8_continue @ upper-right 4x4 patch
++
++        add         r2, r1, #4*2        @ restore r2, but 4 rows further down left
++        sub         r0, #16             @ presumably undoes the r0 advance made by the two patch stores
++        mov         r6, r4              @ restart the weight from the angle
++        add         r0, r0, r3, lsl #2  @ move down 4 rows
++
++        bl          patch_h_down_c_4x4_8
++        bl          patch_h_down_c_4x4_8_continue
++
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r10, #-128          @ initial value of the inverse-angle accumulator used by the helper
++
++        push        {r2}                @ the helper moves r2, so keep the original left pointer
++        bl          patch_h_up_c_4x4_8
++        bl          patch_h_up_c_4x4_8_continue
++        pop         {r2}
++
++        sub         r0, #16             @ presumably undoes the r0 advance made by the two patch stores
++        mov         r10, #-128
++        add         r2, #8              @ left pointer 4 pels further down
++        add         r0, r0, r3, lsl #2  @ move down 4 rows
++        sub         r10, r10, r7, lsl #2 @ r10 = -128 - 4 * inverse angle for the lower patches
++
++        bl          patch_h_up_c_4x4_8
++        bl          patch_h_up_c_4x4_8_continue
++
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++        vld1.8      {q9}, [r1]          @ q9 = 8 top pels (current reference)
++        sub         r1, r2, #2          @ r1 = address of the pel just before left[0] (top-left)
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        vdup.8      d6, r6              @ d6 = weight applied to the shifted reference
++        vext.8      q8, q9, q9, #14     @ q8 = q9 moved up by one pel; pel 0 is replaced below
++        sub         r8, r7, #128        @ r8 = inverse-angle accumulator
++        vld1.16     {d16[0]}, [r1]      @ insert top-left as pel 0 of the shifted reference
++        vdup.8      d7, r12             @ d7 = weight applied to the current reference
++        mov         r5, #7              @ seven rows in the loops, the eighth in a tail
++1:                                      @ loop A: current = q9, shifted = q8; prepares q11/q10 for loop B
++        subs        r12, r4             @ C clear (borrow) means move on by one reference pel
++        vmull.u8    q0, d18, d7
++        it          cc
++        asrcc       r1, r8, #8
++        vmlal.u8    q0, d16, d6
++        it          cc
++        addcc       r12, #32            @ wrap the weight back into range
++        vmull.u8    q1, d19, d7
++        it          cc
++        addcc       r1, r2, r1, lsl #1  @ r1 = address of the left pel selected by the accumulator
++        vmlal.u8    q1, d17, d6
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vext.8      q10, q8, q8, #14    @ next shifted reference, pel 0 filled by the load below
++        sub         r5, #1
++        vld1.16     {d20[0]}, [r1]
++        it          cc
++        addcc       r8, r7              @ accumulator += inverse angle
++        vmov        q11, q8             @ next current reference
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3      @ store one row of 16 bytes and step to the next row
++        bhi         1b                  @ rows remain and no reference step: stay in loop A
++        beq         4f                  @ no rows left in the loops: go to the tail
++2:                                      @ loop B: current = q11, shifted = q10; prepares q9/q8 for loop A
++        subs        r12, r4
++        vmull.u8    q0, d22, d7
++        it          cc
++        asrcc       r1, r8, #8
++        vmlal.u8    q0, d20, d6
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q1, d23, d7
++        it          cc
++        addcc       r1, r2, r1, lsl #1
++        vmlal.u8    q1, d21, d6
++        rsb         r6, r12, #32
++        vext.8      q8, q10, q10, #14
++        sub         r5, #1
++        vld1.16     {d16[0]}, [r1]
++        it          cc
++        addcc       r8, r7
++        vmov        q9, q10
++        teq         r5, #0
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3
++        bhi         2b                  @ rows remain and no reference step: stay in loop B
++        bne         1b                  @ rows remain and the reference stepped: switch to loop A
++        bcc         5f                  @ last row, reference stepped: tail on q9/q8
++3:                                      @ last row from the loop B register set (q11/q10)
++        vmull.u8    q0, d22, d7
++        vmlal.u8    q0, d20, d6
++        vmull.u8    q1, d23, d7
++        vmlal.u8    q1, d21, d6
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++4:
++        bcc         3b                  @ reference stepped on the last loop A row: tail on q11/q10
++5:                                      @ last row from the loop A register set (q9/q8)
++        vmull.u8    q0, d18, d7
++        vmlal.u8    q0, d16, d6
++        vmull.u8    q1, d19, d7
++        vmlal.u8    q1, d17, d6
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        vld1.8      {q9}, [r1]!         @ q9 = top pels 0..7 (current reference); r1 advanced past them
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        vdup.8      d6, r6              @ d6 = weight applied to the shifted reference
++        vdup.8      d7, r12             @ d7 = weight applied to the current reference
++        vext.8      q8, q9, q9, #2      @ q8 = q9 moved down by one pel; last pel replaced below
++        vld1.16     {d17[3]}, [r1]!     @ last pel of q8 = top pel 8
++        mov         r5, #7              @ seven rows in the loops, the eighth in a tail
++1:                                      @ loop A: shifted = q8, current = q9; prepares q10/q11 for loop B
++        vmull.u8    q0, d16, d6
++        subs        r12, r4             @ C clear (borrow) means move on by one reference pel
++        vmlal.u8    q0, d18, d7
++        it          cc
++        addcc       r12, #32            @ wrap the weight back into range
++        vmull.u8    q1, d17, d6
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vmlal.u8    q1, d19, d7
++        sub         r5, #1
++        vext.8      q10, q8, q8, #2     @ next shifted reference, last pel filled by the load below
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vld1.16     {d21[3]}, [r1]
++        it          cc
++        addcc       r1, #2              @ consume that top pel only when the reference steps
++        vmov        q11, q8             @ next current reference
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3      @ store one row of 16 bytes and step to the next row
++        bhi         1b                  @ rows remain and no reference step: stay in loop A
++        beq         4f                  @ no rows left in the loops: go to the tail
++2:                                      @ loop B: shifted = q10, current = q11; prepares q8/q9 for loop A
++        vmull.u8    q0, d20, d6
++        subs        r12, r4
++        vmlal.u8    q0, d22, d7
++        it          cc
++        addcc       r12, #32
++        vmull.u8    q1, d21, d6
++        rsb         r6, r12, #32
++        vmlal.u8    q1, d23, d7
++        sub         r5, #1
++        vext.8      q8, q10, q10, #2
++        teq         r5, #0
++        vld1.16     {d17[3]}, [r1]
++        it          cc
++        addcc       r1, #2
++        vmov        q9, q10
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vdup.8      d6, r6
++        vdup.8      d7, r12
++        vst1.8      {q0}, [r0], r3
++        bhi         2b                  @ rows remain and no reference step: stay in loop B
++        bne         1b                  @ rows remain and the reference stepped: switch to loop A
++        bcc         5f                  @ last row, reference stepped: tail on q8/q9
++3:                                      @ last row from the loop B register set (q10/q11)
++        vmull.u8    q0, d20, d6
++        vmlal.u8    q0, d22, d7
++        vmull.u8    q1, d21, d6
++        vmlal.u8    q1, d23, d7
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++4:
++        bcc         3b                  @ reference stepped on the last loop A row: tail on q10/q11
++5:                                      @ last row from the loop A register set (q8/q9)
++        vmull.u8    q0, d16, d6
++        vmlal.u8    q0, d18, d7
++        vmull.u8    q1, d17, d6
++        vmlal.u8    q1, d19, d7
++        vrshrn.u16  d0, q0, #5
++        vrshrn.u16  d1, q1, #5
++        vst1.8      {q0}, [r0]
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_8 - angular intra prediction, 16 rows of 16 two-byte pels (presumably interleaved chroma)
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]      doubled on entry
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
++        ldr         r12, [sp]           @ r12 = mode (fifth argument, on the stack)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2     @ table base biased so that it is indexed directly by mode
++        ADRT        r7, inv_angle - 11*2 @ biased so that mode 11 selects the first entry
++        add         r7, r7, r12, lsl #1 @ r7 = address of the 16-bit inv_angle entry for this mode
++        lsl         r3, #1              @ stride *= 2
++        ldrsb       r6, [r4, r12]       @ r6 = angle for this mode (running weight)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same angle (per-row step); the load leaves the cmp flags intact
++        bge         26f                 @ modes 26..34
++        cmp         r12, #18
++        bge         18f                 @ modes 18..25
++        cmp         r12, #10
++        bge         10f                 @ modes 10..17
++
++@ Down of Horizontal - works down left
++        mov         r10, #4             @ four bands of 4 rows
++        mov         r1, r2              @ r1 keeps the left pointer for the current band
++1:
++        bl          patch_h_down_c_4x4_8 @ four 4x4 patches across the band (helpers defined outside this section)
++        bl          patch_h_down_c_4x4_8_continue
++        bl          patch_h_down_c_4x4_8_continue
++        bl          patch_h_down_c_4x4_8_continue
++
++        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
++        add         r1, r1, #4*2         @ and remember it for the next band
++        mov         r6, r4              @ restart the weight from the angle
++        sub         r0, #32             @ presumably undoes the r0 advance made by the four patch stores
++        subs        r10, #1
++        add         r0, r0, r3, lsl #2  @ move down 4 rows (flags from subs are untouched)
++        bne         1b
++
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r10, #-128          @ initial value of the inverse-angle accumulator used by the helper
++        vmov.i8     d6, #1<<2           @ band counter kept in d6: halved each pass, giving four passes
++1:
++        push        {r2, r10}           @ the helpers modify both, keep the band start values
++        bl          patch_h_up_c_4x4_8
++        bl          patch_h_up_c_4x4_8_continue
++        bl          patch_h_up_c_4x4_8_continue
++        bl          patch_h_up_c_4x4_8_continue
++        pop         {r2, r10}
++
++        vmov        r8, s12             @ r8 = low word of d6, read before it is halved
++        sub         r0, #32             @ presumably undoes the r0 advance made by the four patch stores
++        add         r2, #8              @ left pointer 4 pels further down
++        add         r0, r0, r3, lsl #2  @ move down 4 rows
++        sub         r10, r10, r7, lsl #2 @ accumulator start for the next band: minus 4 * inverse angle
++        vshr.u8     d6, #1
++        teq         r8, #0              @ stop once the counter had already reached zero
++        bne         1b
++
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++        vld1.8      {q0-q1}, [r1]       @ q0-q1 = 16 top pels
++        sub         r9, r2, #2          @ r9 = address of the pel just before left[0] (top-left)
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r8, #-128           @ r8 = inverse-angle accumulator
++        vdup.8      d18, r6             @ d18 = weight applied to the shifted reference
++        vdup.8      d19, r12            @ d19 = weight applied to the current reference
++        mov         r5, #16             @ row counter
++1:                                      @ reference step: shift in one more pel from the left side
++        vld1.16     {d17[3]}, [r9]      @ new pel goes into the top lane of q8
++        add         r8, r7              @ accumulator += inverse angle
++        vmov        q2, q0              @ q2-q3 = current reference
++        vmov        q3, q1
++        asr         r9, r8, #8
++        vext.8      q1, q0, q1, #14     @ q0-q1 = reference moved up by one pel
++        add         r9, r2, r9, lsl #1  @ r9 = address of the left pel for the next step
++        vext.8      q0, q8, q0, #14     @ new pel becomes pel 0
++2:                                      @ one output row
++        vmull.u8    q10, d4, d19
++        subs        r12, r4             @ C clear (borrow) means a reference step is due
++        vmlal.u8    q10, d0, d18
++        it          cc
++        addcc       r12, #32            @ wrap the weight back into range
++        vmull.u8    q11, d5, d19
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vmlal.u8    q11, d1, d18
++        sub         r5, #1
++        vmull.u8    q12, d6, d19
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vmlal.u8    q12, d2, d18
++        vmull.u8    q13, d7, d19
++        vmlal.u8    q13, d3, d18
++        vdup.8      d18, r6
++        vdup.8      d19, r12
++        vrshrn.u16  d20, q10, #5
++        vrshrn.u16  d21, q11, #5
++        vrshrn.u16  d22, q12, #5
++        vrshrn.u16  d23, q13, #5
++        vst1.8      {q10-q11}, [r0], r3 @ store one row of 32 bytes and step to the next row
++        bhi         2b                  @ rows remain and no reference step
++        bne         1b                  @ rows remain and a reference step is due
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        add         r5, r1, #32         @ r5 = address of top pel 16
++        vld1.8      {q0-q1}, [r1]!      @ q0-q1 = top pels 0..15; r1 advanced past them
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        vld1.16     {d16[0]}, [r5]      @ d16 lane 0 = next pel to shift in
++        mov         r5, #16             @ row counter
++        vdup.8      d18, r6             @ d18 = weight applied to the shifted reference
++        vdup.8      d19, r12            @ d19 = weight applied to the current reference
++1:                                      @ reference step: shift in one more pel from top
++        vmov        q2, q0              @ q2-q3 = current reference
++        add         r1, #2              @ r1 = address of the pel for the following step
++        vmov        q3, q1
++        vext.8      q0, q0, q1, #2      @ q0-q1 = reference moved down by one pel
++        vext.8      q1, q1, q8, #2      @ new pel becomes the last pel
++2:                                      @ one output row
++        vmull.u8    q10, d0, d18
++        subs        r12, r4             @ C clear (borrow) means a reference step is due
++        vmlal.u8    q10, d4, d19
++        it          cc
++        addcc       r12, #32            @ wrap the weight back into range
++        vmull.u8    q11, d1, d18
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vmlal.u8    q11, d5, d19
++        sub         r5, #1
++        vmull.u8    q12, d2, d18
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vmlal.u8    q12, d6, d19
++        vmull.u8    q13, d3, d18
++        vmlal.u8    q13, d7, d19
++        vld1.16     {d16[0]}, [r1]      @ reload the pel that the next step would shift in
++        vdup.8      d18, r6
++        vdup.8      d19, r12
++        vrshrn.u16  d20, q10, #5
++        vrshrn.u16  d21, q11, #5
++        vrshrn.u16  d22, q12, #5
++        vrshrn.u16  d23, q13, #5
++        vst1.8      {q10-q11}, [r0], r3 @ store one row of 32 bytes and step to the next row
++        bhi         2b                  @ rows remain and no reference step
++        bne         1b                  @ rows remain and a reference step is due
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++@------------------------------------------------------------------------------
++@ Data - lookup tables indexed by prediction mode
++
++        .text
++        .balign  64
++angle_2:                                @ one byte per mode, first entry is mode 2 (callers use angle_2 - 2 + mode)
++        .byte    32                                       @ mode 2
++        .byte    26,  21,  17,  13,   9,   5,   2,   0    @ modes 3..10
++        @ Sign inverted from standards table
++        .byte     2,   5,   9,  13,  17,  21,  26,  32    @ modes 11..18
++        .byte    26,  21,  17,  13,   9,   5,   2,   0    @ modes 19..26
++        @ Standard sign
++        .byte     2,   5,   9,  13,  17,  21,  26,  32    @ modes 27..34
++
++        .balign   2
++
++        @ Sign inverted from standards table
++inv_angle:                              @ one 16-bit entry per mode, first entry is mode 11 (callers use inv_angle - 11*2 + mode*2)
++        .short   4096, 1638,  910,  630,  482,  390,  315 @ modes 11..17
++        .short    256                                     @ mode 18
++        .short    315,  390,  482,  630,  910, 1638, 4096 @ modes 19..25
++
++@------------------------------------------------------------------------------
++@
++@ 10 bit functions
++@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
++@ but the 16-bit multiply-accumulate lanes would overflow for 12+ bit
++
++        .text
++        .balign 64
++
++patch_h_down_4x4_10:                    @ in: r0 dest, r2 left, r3 stride, r4 angle step, r6 weight; clobbers r5-r9, r12
++        ldrd        r8, r9, [r2]        @ Left
++        rsb         r12, r6, #32        @ r12 = 32 - r6
++        vmov        d0, r8, r9          @ d0 = left pels 0..3
++        vdup.16     d3, r6              @ d3 = weight applied to d1
++        lsr         r8, #16
++        vdup.16     d2, r12             @ d2 = weight applied to d0
++        orr         r8, r8, r9, lsl #16
++        ldr         r9, [r2, #6]!       @ r9 = left pels 3..4; r2 is advanced by 6 bytes
++        vmov        d1, r8, r9          @ d1 = left pels 1..4
++        // drop through...
++patch_h_down_4x4_10_continue:           @ entry for the next patch to the right, reusing the register state
++        mov         r5, #4              @ four columns per patch
++1:
++          subs        r12, r4           @ advance the weight; negative means move on by one reference pel
++        vmul.u16    d4, d0, d2
++          it          mi
++          addmi       r12, #32          @ wrap the weight back into range
++        vmla.u16    d4, d1, d3          @ d4 = d0 * d2 + d1 * d3
++          rsb         r6, r12, #32      @ keep r6 + r12 == 32
++        vext.16     q8, q8, q9, #4      @ q8-q9 hold the last four results: drop the oldest
++          it          mi
++          lsrmi       r7, r8, #16
++        vmov        d18, d19
++          it          mi
++          vmovmi      d0, r8, r9        @ the old shifted reference becomes the current one
++          vdup.16     d2, r12
++          it          mi
++          orrmi       r8, r7, r9, lsl #16
++        vrshr.u16   d19, d4, #5         @ newest result, rounding shift right by 5
++          itt         mi
++          ldrmi       r9, [r2, #2]!     @ step one pel further down left
++          vmovmi      d1, r8, r9
++        subs        r5, #1
++          vdup.16     d3, r6
++        bne         1b
++        // drop through...
++store_tran_4x4_10:                      @ transpose d16-d19 and store as 4 rows; r0 advances 8 bytes, r3 is preserved
++        vzip.16     d16, d17
++        add         r6, r0, r3          @ r6 = row 1
++        vzip.16     d18, d19
++        lsl         r3, #1              @ temporarily 2 * stride
++        vzip.32     q8, q9              @ d16..d19 now hold rows 0..3
++        add         r5, r0, r3          @ r5 = row 2
++        vst1.16     {d16}, [r0]!        @ row 0; r0 moves on to the next patch to the right
++        vst1.16     {d17}, [r6], r3     @ row 1; r6 moves on to row 3
++        vst1.16     {d18}, [r5]         @ row 2
++        asr         r3, #1              @ restore stride
++        vst1.16     {d19}, [r6]         @ row 3
++
++        bx          lr
++
++patch_h_up_4x4_10:                     @ in: r0 dest, r1 top, r2 left, r3 stride, r4 angle, r7 inverse angle, r10 accumulator
++        ldrd        r8, r9, [r2]        @ r8:r9 = left pels 0..3
++        rsb         r6, r4, #32         @ r6 = 32 - angle
++        vmov        d0, r8, r9          @ d0 = left pels 0..3
++        vdup.16     d3, r4              @ d3 = weight applied to d1
++        lsr         r11, r8, #16
++        vdup.16     d2, r6              @ d2 = weight applied to d0
++        ldr         r8, [r2, #-2]!      @ r8 = the pel before left[0] and left[0]; r2 moves back one pel
++        orr         r9, r11, r9, lsl #16
++        vmov        d1, r8, r9          @ d1 = reference moved up by one pel
++        mov         r12, r4             @ r12 = running weight
++        vmul.u16    d4, d0, d2          @ first column is computed ahead of the loop
++        vmla.u16    d4, d1, d3
++patch_h_up_4x4_10_continue:             @ entry for the next patch to the right, reusing the register state
++        mov         r5, #4              @ four columns per patch
++1:
++          add         r12, r4
++          cmp         r12, #33          @ C set when the weight has passed 32, so the reference must step
++          it          cs
++          addcs       r10, r7           @ accumulator += inverse angle
++          mov         r11, #0
++          itt         cs
++          subcs       r12, #32          @ wrap the weight back into range
++          tstcs       r10, #1<<31       @ Z set when the accumulator is non-negative
++          rsb         r6, r12, #32      @ keep r6 + r12 == 32
++          it          eq
++          asreq       r11, r10, #7
++          it          cs
++          vmovcs      d0, r8, r9        @ the old shifted reference becomes the current one
++          it          eq
++          biceq       r11, #1           @ r11 = byte offset of the selected pel in top
++          vdup.16     d2, r6
++          it          cs
++          lsrcs       r6, r8, #16
++          vdup.16     d3, r12
++        vext.16     q8, q8, q9, #4      @ q8-q9 hold the last four results: drop the oldest
++          itt         cs
++          orrcs       r9, r6, r9, lsl #16
++          ldrhcs      r11, [r1, r11]    @ new pel taken from top (offset 0 unless computed above)
++        vmov        d18, d19
++          it          hi
++          ldrhhi      r11, [r2, #-2]!   @ accumulator still negative: take the next pel up the left side instead
++        vrshr.u16   d19, d4, #5         @ newest result, rounding shift right by 5
++          itt         cs
++          orrcs       r8, r11, r8, lsl #16 @ shift the reference by one pel, inserting the new pel
++          vmovcs      d1, r8, r9
++          vmul.u16    d4, d0, d2        @ start the next column
++        subs        r5, #1
++          vmla.u16    d4, d1, d3
++        bne         1b
++
++        b           store_tran_4x4_10   @ tail call: transpose and store, returning to the caller
++
++
++@ ff_hevc_rpi_pred_angular_4_neon_10 - angular intra prediction, 4 rows of 4 16-bit pels
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]      doubled on entry
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_4_neon_10, export=1
++        ldr         r12, [sp]           @ r12 = mode (fifth argument, on the stack)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2     @ table base biased so that it is indexed directly by mode
++        ADRT        r7, inv_angle - 11*2 @ biased so that mode 11 selects the first entry
++        add         r7, r7, r12, lsl #1 @ r7 = address of the 16-bit inv_angle entry for this mode
++        lsl         r3, #1              @ stride *= 2
++        ldrsb       r6, [r4, r12]       @ r6 = angle for this mode (running weight)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same angle (per-row step); the load leaves the cmp flags intact
++        bge         26f                 @ modes 26..34
++        cmp         r12, #18
++        bge         18f                 @ modes 18..25
++        cmp         r12, #10
++        bge         10f                 @ modes 10..17
++
++@ Down of Horizontal - works down left
++        bl          patch_h_down_4x4_10 @ modes 2..9: the helper computes and stores the whole 4x4 block
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r10, #-128          @ initial value of the inverse-angle accumulator used by the helper
++        bl          patch_h_up_4x4_10
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++        ldrd        r8, r9, [r1]        @ Top
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        ldrh        lr, [r2, #-2]       @ Top-left
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        vmov        d0, r8, r9          @ d0 = top pels 0..3
++        lsl         r9, r9, #16
++        vdup.16     d2, r12             @ d2 = weight applied to d0
++        orr         r9, r9, r8, lsr #16
++        orr         r8, lr, r8, lsl #16 @ r8:r9 = top-left followed by top pels 0..2
++        vmov        d1, r8, r9          @ d1 = reference moved along by one pel
++        sub         r1, r7, #128        @ r1 reused as the inverse-angle accumulator
++        mov         r5, #3              @ three rows in the loop, the fourth after it
++1:
++        sel         lr, lr, lr          @ force pipeline 0 on Cortex-A53
++        vdup.16     d3, r6              @ d3 = weight applied to d1
++        vmul.u16    d4, d0, d2
++          subs        r12, r12, r4      @ advance the weight; negative means move on by one reference pel
++        vmla.u16    d4, d1, d3          @ d4 = d0 * d2 + d1 * d3
++          itttt       mi
++          addmi       lr, r2, r1, asr #7
++          bicmi       lr, #1            @ lr = address of the 16-bit left pel selected by the accumulator
++          addmi       r12, r12, #32     @ wrap the weight back into range
++          vmovmi      d0, r8, r9        @ the old shifted reference becomes the current one
++          rsb         r6, r12, #32      @ keep r6 + r12 == 32
++          itt         mi
++          lslmi       r9, r9, #16
++          ldrhmi      lr, [lr]          @ fetch the new left pel
++          vdup.16     d2, r12
++        vrshr.u16   d4, d4, #5          @ rounding shift right by 5
++          itttt       mi
++          orrmi       r9, r9, r8, lsr #16
++          orrmi       r8, lr, r8, lsl #16 @ shift the reference by one pel, inserting the new left pel
++          vmovmi      d1, r8, r9
++          addmi       r1, r1, r7        @ accumulator += inverse angle
++        subs        r5, r5, #1
++        vst1.16     {d4}, [r0], r3      @ store one row of 8 bytes and step to the next row
++        bne         1b
++
++          vdup.16     d3, r6
++          nop                           @ force next insn into pipeline 0 to enable
++          vmul.u16    d4, d0, d2        @ vmla to execute back-to-back on Cortex-A53
++          vmla.u16    d4, d1, d3
++          vrshr.u16   d4, d4, #5
++          vst1.16     {d4}, [r0]        @ final row
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        ldrd        r8, r9, [r1]        @ Top
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        vmov        d0, r8, r9          @ d0 = top pels 0..3
++        vdup.16     d3, r6              @ d3 = weight applied to d1
++        lsr         r8, #16
++        vdup.16     d2, r12             @ d2 = weight applied to d0
++        orr         r8, r8, r9, lsl #16
++        ldr         r9, [r1, #6]!       @ r9 = top pels 3..4; r1 is advanced by 6 bytes
++        vmov        d1, r8, r9          @ d1 = top pels 1..4
++        mov         r5, #3              @ three rows in the loop, the fourth after it
++1:
++        vmul.u16    d4, d0, d2
++          subs        r12, r4           @ advance the weight; negative means move on by one reference pel
++        vmla.u16    d4, d1, d3          @ d4 = d0 * d2 + d1 * d3
++          it          mi
++          addmi       r12, #32          @ wrap the weight back into range
++          rsb         r6, r12, #32      @ keep r6 + r12 == 32
++          itt         mi
++          vmovmi      d0, r8, r9        @ the old shifted reference becomes the current one
++          lsrmi       r8, #16
++          vdup.16     d2, r12
++          itt         mi
++          orrmi       r8, r8, r9, lsl #16
++          ldrmi       r9, [r1, #2]!     @ step one pel further along top
++        vrshr.u16   d4, d4, #5          @ rounding shift right by 5
++          it          mi
++          vmovmi      d1, r8, r9
++          vdup.16     d3, r6
++        subs        r5, #1
++        vst1.16     {d4}, [r0], r3      @ store one row of 8 bytes and step to the next row
++        bne         1b
++
++          vmul.u16    d4, d0, d2        @ final row, no further reference update needed
++          vmla.u16    d4, d1, d3
++          vrshr.u16   d4, d4, #5
++          vst1.16     {d4}, [r0]
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_8_neon_10 - angular intra prediction, 8 rows of 8 16-bit pels
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]      doubled on entry
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_8_neon_10, export=1
++        ldr         r12, [sp]           @ r12 = mode (fifth argument, on the stack)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2     @ table base biased so that it is indexed directly by mode
++        ADRT        r7, inv_angle - 11*2 @ biased so that mode 11 selects the first entry
++        add         r7, r7, r12, lsl #1 @ r7 = address of the 16-bit inv_angle entry for this mode
++        lsl         r3, #1              @ stride *= 2
++        ldrsb       r6, [r4, r12]       @ r6 = angle for this mode (running weight)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same angle (per-row step); the load leaves the cmp flags intact
++        bge         26f                 @ modes 26..34
++        cmp         r12, #18
++        bge         18f                 @ modes 18..25
++        cmp         r12, #10
++        bge         10f                 @ modes 10..17
++
++@ Down of Horizontal - works down left
++        mov         r1,  r2             @ save r2 - r1 unused by patch_down
++
++        bl          patch_h_down_4x4_10 @ upper-left 4x4 patch
++        bl          patch_h_down_4x4_10_continue @ upper-right 4x4 patch
++
++        add         r2, r1, #4*2        @ restore r2, but 4 rows further down left
++        sub         r0, #16             @ undo the two 8-byte advances made by the patch stores
++        mov         r6, r4              @ restart the weight from the angle
++        add         r0, r0, r3, lsl #2  @ move down 4 rows
++
++        bl          patch_h_down_4x4_10
++        bl          patch_h_down_4x4_10_continue
++
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r10, #-128          @ initial value of the inverse-angle accumulator used by the helper
++
++        push        {r2}                @ the helper moves r2, so keep the original left pointer
++        bl          patch_h_up_4x4_10
++        bl          patch_h_up_4x4_10_continue
++        pop         {r2}
++
++        sub         r0, #16             @ undo the two 8-byte advances made by the patch stores
++        mov         r10, #-128
++        add         r2, #8              @ left pointer 4 pels further down
++        add         r0, r0, r3, lsl #2  @ move down 4 rows
++        sub         r10, r10, r7, lsl #2 @ r10 = -128 - 4 * inverse angle for the lower patches
++
++        bl          patch_h_up_4x4_10
++        bl          patch_h_up_4x4_10_continue
++
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++        vld1.16     {q9}, [r1]          @ q9 = 8 top pels (current reference)
++        sub         r1, r2, #2          @ r1 = address of the pel just before left[0] (top-left)
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        vdup.16     q2, r6              @ q2 = weight applied to the shifted reference
++        vext.16     q8, q9, q9, #7      @ q8 = q9 moved up by one pel; pel 0 is replaced below
++        sub         r8, r7, #128        @ r8 = inverse-angle accumulator
++        vld1.16     {d16[0]}, [r1]      @ insert top-left as pel 0 of the shifted reference
++        vdup.16     q3, r12             @ q3 = weight applied to the current reference
++        mov         r5, #7              @ seven rows in the loops, the eighth in a tail
++1:                                      @ loop A: current = q9, shifted = q8; prepares q11/q10 for loop B
++        vmul.u16    q0, q9, q3
++        subs        r12, r4             @ C clear (borrow) means move on by one reference pel
++        vmla.u16    q0, q8, q2
++        ittt        cc
++        asrcc       r1, r8, #8
++        addcc       r12, #32            @ wrap the weight back into range
++        addcc       r1, r2, r1, lsl #1  @ r1 = address of the left pel selected by the accumulator
++        vext.16     q10, q8, q8, #7     @ next shifted reference, pel 0 filled by the load below
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vmov        q11, q8             @ next current reference
++        sub         r5, #1
++        vrshr.u16   q0, q0, #5
++        it          cc
++        addcc       r8, r7              @ accumulator += inverse angle
++        vld1.16     {d20[0]}, [r1]
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3      @ store one row of 16 bytes and step to the next row
++        bhi         1b                  @ rows remain and no reference step: stay in loop A
++        beq         4f                  @ no rows left in the loops: go to the tail
++2:                                      @ loop B: current = q11, shifted = q10; prepares q9/q8 for loop A
++        vmul.u16    q0, q11, q3
++        subs        r12, r4
++        vmla.u16    q0, q10, q2
++        ittt        cc
++        asrcc       r1, r8, #8
++        addcc       r12, #32
++        addcc       r1, r2, r1, lsl #1
++        vext.16     q8, q10, q10, #7
++        rsb         r6, r12, #32
++        vmov        q9, q10
++        sub         r5, #1
++        vrshr.u16   q0, q0, #5
++        it          cc
++        addcc       r8, r7
++        vld1.16     {d16[0]}, [r1]
++        teq         r5, #0
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3
++        bhi         2b                  @ rows remain and no reference step: stay in loop B
++        bne         1b                  @ rows remain and the reference stepped: switch to loop A
++        bcc         5f                  @ last row, reference stepped: tail on q9/q8
++3:                                      @ last row from the loop B register set (q11/q10)
++        vmul.u16    q0, q11, q3
++        vmla.u16    q0, q10, q2
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r11, pc}
++4:
++        bcc         3b                  @ reference stepped on the last loop A row: tail on q11/q10
++5:                                      @ last row from the loop A register set (q9/q8)
++        vmul.u16    q0, q9, q3
++        vmla.u16    q0, q8, q2
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        vld1.16     {q9}, [r1]!         @ q9 = top pels 0..7 (current reference); r1 advanced past them
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        vdup.16     q2, r6              @ q2 = weight applied to the shifted reference
++        vdup.16     q3, r12             @ q3 = weight applied to the current reference
++        vext.16     q8, q9, q9, #1      @ q8 = q9 moved down by one pel; last pel replaced below
++        vld1.16     {d17[3]}, [r1]!     @ last pel of q8 = top pel 8
++        mov         r5, #7              @ seven rows in the loops, the eighth in a tail
++1:                                      @ loop A: shifted = q8, current = q9; prepares q10/q11 for loop B
++        vmul.u16    q0, q8, q2
++        subs        r12, r4             @ C clear (borrow) means move on by one reference pel
++        vmla.u16    q0, q9, q3
++        it          cc
++        addcc       r12, #32            @ wrap the weight back into range
++        vext.16     q10, q8, q8, #1     @ next shifted reference, last pel filled by the load below
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vld1.16     {d21[3]}, [r1]
++        sub         r5, #1
++        vmov        q11, q8             @ next current reference
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vrshr.u16   q0, q0, #5
++        it          cc
++        addcc       r1, #2              @ consume that top pel only when the reference steps
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3      @ store one row of 16 bytes and step to the next row
++        bhi         1b                  @ rows remain and no reference step: stay in loop A
++        beq         4f                  @ no rows left in the loops: go to the tail
++2:                                      @ loop B: shifted = q10, current = q11; prepares q8/q9 for loop A
++        vmul.u16    q0, q10, q2
++        subs        r12, r4
++        vmla.u16    q0, q11, q3
++        it          cc
++        addcc       r12, #32
++        vext.16     q8, q10, q10, #1
++        rsb         r6, r12, #32
++        vld1.16     {d17[3]}, [r1]
++        sub         r5, #1
++        vmov        q9, q10
++        teq         r5, #0
++        vrshr.u16   q0, q0, #5
++        it          cc
++        addcc       r1, #2
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3
++        bhi         2b                  @ rows remain and no reference step: stay in loop B
++        bne         1b                  @ rows remain and the reference stepped: switch to loop A
++        bcc         5f                  @ last row, reference stepped: tail on q8/q9
++3:                                      @ last row from the loop B register set (q10/q11)
++        vmul.u16    q0, q10, q2
++        vmla.u16    q0, q11, q3
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r11, pc}
++4:
++        bcc         3b                  @ reference stepped on the last loop A row: tail on q10/q11
++5:                                      @ last row from the loop A register set (q8/q9)
++        vmul.u16    q0, q8, q2
++        vmla.u16    q0, q9, q3
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_16_neon_10 - angular intra prediction, 16 rows of 16 16-bit pels
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]      doubled on entry
++@       unsigned int mode       [sp, #0]  2..34
++
++function ff_hevc_rpi_pred_angular_16_neon_10, export=1
++        ldr         r12, [sp]           @ r12 = mode (fifth argument, on the stack)
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2     @ table base biased so that it is indexed directly by mode
++        ADRT        r7, inv_angle - 11*2 @ biased so that mode 11 selects the first entry
++        add         r7, r7, r12, lsl #1 @ r7 = address of the 16-bit inv_angle entry for this mode
++        lsl         r3, #1              @ stride *= 2
++        ldrsb       r6, [r4, r12]       @ r6 = angle for this mode (running weight)
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]       @ r4 = same angle (per-row step); the load leaves the cmp flags intact
++        bge         26f                 @ modes 26..34
++        cmp         r12, #18
++        bge         18f                 @ modes 18..25
++        cmp         r12, #10
++        bge         10f                 @ modes 10..17
++
++@ Down of Horizontal - works down left
++        mov         r10, #4             @ four bands of 4 rows
++        mov         r1, r2              @ r1 keeps the left pointer for the current band
++1:
++        bl          patch_h_down_4x4_10 @ four 4x4 patches across the band
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++
++        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
++        add         r1, r1, #4*2         @ and remember it for the next band
++        mov         r6, r4              @ restart the weight from the angle
++        sub         r0, #32             @ undo the four 8-byte advances made by the patch stores
++        subs        r10, #1
++        add         r0, r0, r3, lsl #2  @ move down 4 rows (flags from subs are untouched)
++        bne         1b
++
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r10, #-128          @ initial value of the inverse-angle accumulator used by the helper
++        vmov.i8     d6, #1<<2           @ band counter kept in d6: halved each pass, giving four passes
++1:
++        push        {r2, r10}           @ the helpers modify both, keep the band start values
++        bl          patch_h_up_4x4_10
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        pop         {r2, r10}
++
++        vmov        r8, s12             @ r8 = low word of d6, read before it is halved
++        sub         r0, #32             @ undo the four 8-byte advances made by the patch stores
++        add         r2, #8              @ left pointer 4 pels further down
++        add         r0, r0, r3, lsl #2  @ move down 4 rows
++        sub         r10, r10, r7, lsl #2 @ accumulator start for the next band: minus 4 * inverse angle
++        vshr.u8     d6, #1
++        teq         r8, #0              @ stop once the counter had already reached zero
++        bne         1b
++
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++        vld1.16     {q0-q1}, [r1]       @ q0-q1 = 16 top pels
++        sub         r9, r2, #2          @ r9 = address of the pel just before left[0] (top-left)
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        ldrh        r7, [r7]            @ r7 = inverse angle for this mode
++        mov         r8, #-128           @ r8 = inverse-angle accumulator
++        vdup.16     q9, r6              @ q9 = weight applied to the shifted reference
++        vdup.16     q10, r12            @ q10 = weight applied to the current reference
++        mov         r5, #16             @ row counter
++1:                                      @ reference step: shift in one more pel from the left side
++        vld1.16     {d17[3]}, [r9]      @ new pel goes into the top lane of q8
++        add         r8, r7              @ accumulator += inverse angle
++        vmov        q2, q0              @ q2-q3 = current reference
++        vmov        q3, q1
++        asr         r9, r8, #8
++        vext.16     q1, q0, q1, #7      @ q0-q1 = reference moved up by one pel
++        add         r9, r2, r9, lsl #1  @ r9 = address of the left pel for the next step
++        vext.16     q0, q8, q0, #7      @ new pel becomes pel 0
++2:                                      @ one output row
++        vmul.u16    q11, q2, q10
++        subs        r12, r4             @ C clear (borrow) means a reference step is due
++        vmla.u16    q11, q0, q9
++        it          cc
++        addcc       r12, #32            @ wrap the weight back into range
++        vmul.u16    q12, q3, q10
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vmla.u16    q12, q1, q9
++        sub         r5, #1
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vdup.16     q9, r6
++        vdup.16     q10, r12
++        vrshr.u16   q11, q11, #5
++        vrshr.u16   q12, q12, #5
++        vst1.16     {q11-q12}, [r0], r3 @ store one row of 32 bytes and step to the next row
++        bhi         2b                  @ rows remain and no reference step
++        bne         1b                  @ rows remain and a reference step is due
++
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        add         r5, r1, #32         @ r5 = address of top pel 16
++        vld1.16     {q0-q1}, [r1]!      @ q0-q1 = top pels 0..15; r1 advanced past them
++        rsb         r12, r6, #32        @ r12 = 32 - angle
++        vld1.16     {d16[0]}, [r5]      @ d16 lane 0 = next pel to shift in
++        mov         r5, #16             @ row counter
++        vdup.16     q9, r6              @ q9 = weight applied to the shifted reference
++        vdup.16     q10, r12            @ q10 = weight applied to the current reference
++1:                                      @ reference step: shift in one more pel from top
++        vmov        q2, q0              @ q2-q3 = current reference
++        add         r1, #2              @ r1 = address of the pel for the following step
++        vmov        q3, q1
++        vext.16     q0, q0, q1, #1      @ q0-q1 = reference moved down by one pel
++        vext.16     q1, q1, q8, #1      @ new pel becomes the last pel
++2:                                      @ one output row
++        vmul.u16    q11, q0, q9
++        subs        r12, r4             @ C clear (borrow) means a reference step is due
++        vmla.u16    q11, q2, q10
++        it          cc
++        addcc       r12, #32            @ wrap the weight back into range
++        vmul.u16    q12, q1, q9
++        rsb         r6, r12, #32        @ keep r6 + r12 == 32
++        vmla.u16    q12, q3, q10
++        sub         r5, #1
++        vld1.16     {d16[0]}, [r1]      @ reload the pel that the next step would shift in
++        teq         r5, #0              @ sets Z on the row count; C from subs is preserved
++        vdup.16     q9, r6
++        vdup.16     q10, r12
++        vrshr.u16   q11, q11, #5
++        vrshr.u16   q12, q12, #5
++        vst1.16     {q11-q12}, [r0], r3 @ store one row of 32 bytes and step to the next row
++        bhi         2b                  @ rows remain and no reference step
++        bne         1b                  @ rows remain and a reference step is due
++
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_32_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++@ Writes 32 rows of 32 16-bit samples to _src - mode selects one of four code paths below
++function ff_hevc_rpi_pred_angular_32_neon_10, export=1
++        ldr         r12, [sp]                  @ r12 = mode, fetched before the pushes move sp
++        push        {r4-r11, lr}
++        ADRT        r4, angle_2 - 2            @ biased by 2 so that mode (2..34) indexes the byte table directly
++        ADRT        r7, inv_angle - 11*2       @ halfword table biased by 11 entries
++        add         r7, r7, r12, lsl #1        @ r7 -> inv_angle entry for mode - only dereferenced on paths 10 and 18
++        lsl         r3, #1                     @ row pitch in bytes = stride * 2
++        vpush       {d8}                       @ paths 18 and 26 use q4 - d9 is stashed separately there
++        ldrsb       r6, [r4, r12]              @ r6 = angle table value: starting fraction
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]              @ r4 = same table value: per-row fraction step
++        bge         26f
++        cmp         r12, #18
++        bge         18f
++        cmp         r12, #10
++        bge         10f
++
++@ Down of Horizontal - works down left
++        add         sp, #8                     @ drop the saved d8 unrestored - NOTE(review): assumes the patch helpers leave q4 alone, confirm
++        mov         r10, #8                    @ 8 bands of 4 rows
++        mov         r1, r2                     @ r1 = left ptr for the current band
++1:
++        bl          patch_h_down_4x4_10        @ helper defined outside this view - 8 calls per band
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++        bl          patch_h_down_4x4_10_continue
++
++        add         r2, r1, #4*2         @ restore r2, but 4 rows further down left
++        add         r1, r1, #4*2
++        mov         r6, r4                     @ restart the fraction for the next band
++        sub         r0, #64                    @ presumably undoes the 8 helper advances of r0 (one row is 64 bytes) - confirm
++        subs        r10, #1
++        add         r0, r0, r3, lsl #2         @ 4 rows down
++        bne         1b
++
++        pop         {r4-r11, pc}
++
++@ Up of Horizontal - works down up
++10:
++        add         sp, #8                     @ drop the saved d8 unrestored, as on the path above
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        mov         r10, #-128                 @ starting value restored into r10 after each band
++        vmov.i8     d6, #1<<6                  @ band counter kept in d6 - NOTE(review): assumes the helpers leave d6 alone
++1:
++        push        {r2, r10}                  @ r2 and r10 get their pre-call values back after the band
++        bl          patch_h_up_4x4_10          @ helper defined outside this view - 8 calls per band
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        bl          patch_h_up_4x4_10_continue
++        pop         {r2, r10}
++
++        vmov        r8, s12                    @ low word of d6 sampled before the shift below
++        sub         r0, #64                    @ presumably undoes the 8 helper advances of r0 - confirm
++        add         r2, #8                     @ left ptr 4 samples on
++        add         r0, r0, r3, lsl #2         @ 4 rows down
++        sub         r10, r10, r7, lsl #2       @ starting value moves back by 4 inverse angle steps
++        vshr.u8     d6, #1                     @ 0x40 reaches zero after 7 shifts
++        teq         r8, #0                     @ sampled value is zero on the 8th pass, ending the loop
++        bne         1b
++
++        pop         {r4-r11, pc}
++
++@ Left of vertical - works down left
++18:
++        add         r5, r1, #32
++        vld1.16     {q1-q2}, [r1]              @ q1-q4 = 32 samples from top
++        rsb         r12, r6, r6, lsl #16       @ r12 = (angle << 16) - angle
++        vld1.16     {q3-q4}, [r5]
++        sub         r9, r2, #2                 @ r9 -> sample just before left[0]
++        rsb         r4, r12, #0                @ r4 = packed step: low half += angle, high half -= angle
++        rsb         r12, r12, #32 << 16        @ r12 = packed weights: low half frac, high half 32 - frac
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        mov         r8, #-128                  @ inverse angle accumulator, used as r8 >> 8
++        vmov        d0, d9                     @ stash d9 in d0 - q4 is about to be used and only d8 was pushed
++        vmov        s2, r12                    @ d1[0] = frac, d1[1] = 32 - frac
++        add         r10, r0, #32               @ r10 -> right half of the output row
++        mov         r5, #32                    @ row counter
++1:
++        vld1.16     {d1[3]}, [r9]              @ next reference sample into the top lane of q0
++        add         r8, r7
++        vmov        q11, q4                    @ q8-q11 = window before the shift
++        vmov        q10, q3
++        asr         r9, r8, #8
++        vmov        q9, q2
++        add         r9, r2, r9, lsl #1         @ r9 -> left[r8 >> 8] for the next shift
++        vmov        q8, q1
++        vext.16     q4, q3, q4, #7             @ shift q1-q4 along by one sample ...
++        vext.16     q3, q2, q3, #7
++        vext.16     q2, q1, q2, #7
++        vext.16     q1, q0, q1, #7             @ ... bringing d1[3] in as the first sample
++2:
++        vmul.u16    q12, q8, d1[1]             @ unshifted * (32 - frac)
++        adds        r12, r4                    @ step both packed weights - C clear when the high half wraps
++        vmla.u16    q12, q1, d1[0]             @ + shifted * frac
++        it          cc
++        addcc       r12, #32 << 16             @ on wrap put 32 back into the high half ...
++        vmul.u16    q13, q9, d1[1]
++        it          cc
++        subcc       r12, #32                   @ ... and take 32 off the low half
++        vmla.u16    q13, q2, d1[0]
++        sub         r5, #1
++        vmul.u16    q14, q10, d1[1]
++        teq         r5, #0                     @ relies on teq #0 leaving C from the adds intact
++        vmla.u16    q14, q3, d1[0]
++        vmul.u16    q15, q11, d1[1]
++        vmla.u16    q15, q4, d1[0]
++        vmov        s2, r12                    @ refresh the weights for the next row
++        vrshr.u16   q12, q12, #5               @ (x + 16) >> 5
++        vrshr.u16   q13, q13, #5
++        vrshr.u16   q14, q14, #5
++        vrshr.u16   q15, q15, #5
++        vst1.16     {q12-q13}, [r0], r3
++        vst1.16     {q14-q15}, [r10], r3
++        bhi         2b                         @ rows left and no wrap: same window
++        bne         1b                         @ rows left and wrapped: shift the window
++
++        vpop        {d8}
++        vmov        d9, d0                     @ restore d9 stashed on entry to this path
++        pop         {r4-r11, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        add         r5, r1, #32
++        vld1.16     {q1-q2}, [r1]              @ q1-q4 = 32 samples from top
++        rsb         r12, r6, r6, lsl #16       @ r12 = (angle << 16) - angle
++        vld1.16     {q3-q4}, [r5]
++        add         r1, r1, #64                @ r1 -> sample 32 of top
++        rsb         r4, r12, #0                @ r4 = packed step: low half += angle, high half -= angle
++        rsb         r12, r12, #32 << 16        @ r12 = packed weights: low half frac, high half 32 - frac
++        vmov        d1, d9                     @ stash d9 in d1 - q4 is about to be used and only d8 was pushed
++        vmov        s1, r12                    @ d0[2] = frac, d0[3] = 32 - frac
++        add         r10, r0, #32               @ r10 -> right half of the output row
++        mov         r5, #32                    @ row counter
++1:
++        vld1.16     {d0[0]}, [r1]!             @ next top sample into the low lane of q0
++        vmov        q8, q1                     @ q8-q11 = window before the shift
++        vmov        q9, q2
++        vmov        q10, q3
++        vmov        q11, q4
++        vext.16     q1, q1, q2, #1             @ shift q1-q4 along by one sample ...
++        vext.16     q2, q2, q3, #1
++        vext.16     q3, q3, q4, #1
++        vext.16     q4, q4, q0, #1             @ ... bringing d0[0] in as the last sample
++2:
++        vmul.u16    q12, q1, d0[2]             @ shifted * frac
++        adds        r12, r4                    @ step both packed weights - C clear when the high half wraps
++        vmla.u16    q12, q8, d0[3]             @ + unshifted * (32 - frac)
++        it          cc
++        addcc       r12, #32 << 16             @ on wrap put 32 back into the high half ...
++        vmul.u16    q13, q2, d0[2]
++        it          cc
++        subcc       r12, #32                   @ ... and take 32 off the low half
++        vmla.u16    q13, q9, d0[3]
++        sub         r5, #1
++        vmul.u16    q14, q3, d0[2]
++        teq         r5, #0                     @ relies on teq #0 leaving C from the adds intact
++        vmla.u16    q14, q10, d0[3]
++        vmul.u16    q15, q4, d0[2]
++        vmla.u16    q15, q11, d0[3]
++        vmov        s1, r12                    @ refresh the weights for the next row
++        vrshr.u16   q12, q12, #5               @ (x + 16) >> 5
++        vrshr.u16   q13, q13, #5
++        vrshr.u16   q14, q14, #5
++        vrshr.u16   q15, q15, #5
++        vst1.16     {q12-q13}, [r0], r3
++        vst1.16     {q14-q15}, [r10], r3
++        bhi         2b                         @ rows left and no wrap: same window
++        bne         1b                         @ rows left and wrapped: shift the window
++
++        vpop        {d8}
++        vmov        d9, d1                     @ restore d9 stashed on entry to this path
++        pop         {r4-r11, pc}
++
++endfunc
++
++
++
++@ Generate 4x4 chroma patch
++@
++@ In (const)
++@ r1   Up ptr (_up only)
++@ r3   Out stride
++@ r4   Angle add
++@ r7   Inv angle (_up only)
++@
++@ In/Out (updated)
++@ r0   Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
++@ r2   Left ptr - updated
++@ r6   Angle frac on primary entry (visible callers pass the same value as r4) - clobbered by the store
++@ r8   Inv angle accumulator (_up only)
++@ q2, q3    Weights frac and 32 - frac - set on primary entry, carried into _continue
++@ q12, q13  Reference window for down (q1, q12 for _up) - carried into _continue
++@
++@ Temps
++@ r5   Loop counter - left at 4 by the store, ready for _continue
++@ r12  32 - frac - carried into _continue
++@ q0, q8-q11 (q8-q11 queue the four output columns until the store transposes them)
++
++patch_h_down_c_4x4_10:
++        vld1.16     {q12}, [r2]!          @ q12 = 4 left pairs, r2 advanced past them
++        rsb         r12, r6, #32          @ r12 = 32 - frac
++        vdup.16     q2, r6                @ q2 = frac weight
++        vdup.16     q3, r12               @ q3 = 32 - frac weight
++        mov         r5, #4                @ 4 columns per patch
++1:
++        vmov        q13, q12              @ q13 = window before the shift
++        vext.16     q12, q12, q12, #2     @ rotate down by one pair ...
++        vld1.32     {d25[1]}, [r2]!       @ ... and overwrite the wrapped top pair with the next left pair
++patch_h_down_c_4x4_10_continue:           @ entry for further patches in the same band - uses r5, r12, q2, q3, q12, q13 as left by the previous call
++2:
++        vmov        q8, q9                @ q8-q11 form a queue of the 4 newest columns
++        subs        r12, r4               @ step the weight - C clear when it wraps below zero
++        vmul.u16    q0, q13, q3           @ unshifted * (32 - frac)
++        it          cc
++        addcc       r12, #32
++        vmla.u16    q0, q12, q2           @ + shifted * frac
++        rsb         r6, r12, #32
++        vmov        q9, q10
++        sub         r5, #1
++        vmov        q10, q11
++        teq         r5, #0                @ relies on teq #0 leaving C from the subs intact
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vrshr.u16   q11, q0, #5           @ newest column = (x + 16) >> 5
++        bhi         2b                    @ columns left and no wrap: same window
++        bne         1b                    @ columns left and wrapped: shift the window
++
++        bcs         3f                    @ patch complete - if the last step wrapped, shift the window now so _continue can start at 2
++        vmov        q13, q12
++        vext.16     q12, q12, q12, #2
++        vld1.32     {d25[1]}, [r2]!
++3:                                        @ falls through into store_tran_c_4x4_10
++
++store_tran_c_4x4_10:                      @ Transpose the columns in q8-q11 and store them as 4 rows of 16 bytes at r0, then return via lr
++T       add         r6, r0, r3
++        vzip.32     q8, q10               @ the two vzip plus the interleaving vst2 stores perform the 4x4 transpose of 32-bit pairs
++A       add         r6, r0, r3
++T       lsl         r3, #1
++        vzip.32     q9, q11               @ A and T prefixed lines: presumably ARM-only and Thumb-only variants (macros not in view) - either way r6 = row 1, r5 = row 2, r3 doubled
++A       add         r5, r0, r3, lsl #1
++T       add         r5, r0, r3
++        vst2.32     {d16,d18}, [r0]!      @ row 0 - r0 moves on 16 bytes to the next patch
++A       lsl         r3, #1
++        vst2.32     {d17,d19}, [r6], r3   @ row 1 - r6 then steps down 2 rows
++        asr         r3, #1                @ undo the doubling of r3
++        vst2.32     {d20,d22}, [r5]       @ row 2
++        mov         r5, #4                @ reload the column counter for a following _continue call
++        vst2.32     {d21,d23}, [r6]       @ row 3
++        bx          lr
++
++patch_h_up_c_4x4_10:                      @ 4x4 pair patch where new reference pairs come from left (walking backwards) or from top, chosen by the sign of r8
++        vld1.16     {q1}, [r2]            @ q1 = 4 left pairs at r2
++        rsb         r12, r6, #32          @ r12 = 32 - frac
++        vdup.16     q2, r6                @ q2 = frac weight
++        vdup.16     q3, r12               @ q3 = 32 - frac weight
++        mov         r5, #4                @ 4 columns per patch
++1:
++        adds        r8, r7                @ step the inverse angle accumulator - its sign picks the source below
++        vmov        q12, q1               @ q12 = window before the shift
++        it          mi
++        ldrmi       r6, [r2, #-4]!        @ negative: take the pair before r2 and move r2 back
++        vext.16     q1, q1, q1, #6        @ rotate the window up by one pair
++        itt         pl
++        asrpl       r6, r8, #8
++        ldrpl       r6, [r1, r6, lsl #2]  @ non-negative: take pair number r8 >> 8 from the up ptr
++        vmov        s4, r6                @ new pair overwrites the wrapped low pair of q1
++patch_h_up_c_4x4_10_continue:             @ entry for further patches in the same band - uses r5, r12, q1-q3, q12 as left by the previous call
++2:
++        vmov        q8, q9                @ q8-q11 form a queue of the 4 newest columns
++        subs        r12, r4               @ step the weight - C clear when it wraps below zero
++        vmul.u16    q0, q12, q3           @ unshifted * (32 - frac)
++        it          cc
++        addcc       r12, #32
++        vmla.u16    q0, q1, q2            @ + shifted * frac
++        rsb         r6, r12, #32
++        vmov        q9, q10
++        sub         r5, #1
++        vmov        q10, q11
++        teq         r5, #0                @ relies on teq #0 leaving C from the subs intact
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vrshr.u16   q11, q0, #5           @ newest column = (x + 16) >> 5
++        bhi         2b                    @ columns left and no wrap: same window
++        bne         1b                    @ columns left and wrapped: shift the window
++
++        bcs         store_tran_c_4x4_10   @ patch complete and no wrap pending: store and return from there
++        adds        r8, r7                @ otherwise shift the window now so _continue can start at 2
++        vmov        q12, q1
++        it          mi
++        ldrmi       r6, [r2, #-4]!
++        vext.16     q1, q1, q1, #6
++        itt         pl
++        asrpl       r6, r8, #8
++        ldrpl       r6, [r1, r6, lsl #2]
++        vmov        s4, r6
++        b           store_tran_c_4x4_10   @ tail call - the store returns to our caller
++
++
++@ ff_hevc_rpi_pred_angular_c_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++@ Writes 4 rows of 4 sample pairs (16 bytes per row) to _src - mode selects one of four code paths below
++function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
++        ldr         r12, [sp]                  @ r12 = mode, fetched before the push moves sp
++        push        {r4-r8, lr}
++        ADRT        r4, angle_2 - 2            @ biased by 2 so that mode (2..34) indexes the byte table directly
++        ADRT        r7, inv_angle - 11*2       @ halfword table biased by 11 entries
++        add         r7, r7, r12, lsl #1        @ r7 -> inv_angle entry for mode - only dereferenced on paths 10 and 18
++        lsl         r3, #2                     @ row pitch in bytes = stride * 4
++        ldrsb       r6, [r4, r12]              @ r6 = angle table value: starting fraction
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]              @ r4 = same table value: per-step fraction increment
++        bge         26f
++        cmp         r12, #18
++        bge         18f
++        cmp         r12, #10
++        bge         10f
++
++@ Down of Horizontal - works down left
++        bl          patch_h_down_c_4x4_10      @ a single 4x4 patch is the whole block
++        pop         {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        mov         r8, #-128
++        sub         r8, r7                     @ r8 = -128 - inv angle: the helper adds r7 before first use
++        bl          patch_h_up_c_4x4_10
++        pop         {r4-r8, pc}
++
++@ Left of vertical - works down left
++18:
++        vld1.16     {q9}, [r1]                 @ q9 = 4 pairs from top
++        sub         r1, r2, #4                 @ r1 reused as scratch: -> pair just before left[0]
++        rsb         r12, r6, #32               @ r12 = 32 - frac
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        vdup.16     q2, r6                     @ q2 = frac weight
++        vext.16     q8, q9, q9, #6             @ q8 = q9 moved along one pair ...
++        sub         r8, r7, #128               @ inverse angle accumulator, used as r8 >> 8
++        vld1.32     {d16[0]}, [r1]             @ ... with the pair at r1 in the low position
++        vdup.16     q3, r12                    @ q3 = 32 - frac weight
++        mov         r5, #3                     @ 3 rows come from the loops, the 4th from one of the tails
++1:
++        vmul.u16    q0, q9, q3                 @ loop 1: q9 unshifted, q8 shifted
++        subs        r12, r4                    @ step the weight - C clear when it wraps below zero
++        vmla.u16    q0, q8, q2
++        ittt        cc
++        asrcc       r1, r8, #8
++        addcc       r12, #32
++        addcc       r1, r2, r1, lsl #2         @ on wrap: r1 -> pair number r8 >> 8 of left
++        vext.16     q10, q8, q8, #6            @ prepare the next window pair in q10 and q11
++        rsb         r6, r12, #32
++        vmov        q11, q8
++        sub         r5, #1
++        vrshr.u16   q0, q0, #5                 @ (x + 16) >> 5
++        it          cc
++        addcc       r8, r7                     @ on wrap: step the inverse angle accumulator
++        vld1.32     {d20[0]}, [r1]
++        teq         r5, #0                     @ relies on teq #0 leaving C from the subs intact
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3
++        bhi         1b                         @ rows left and no wrap: keep q8 and q9
++        beq         4f                         @ loop rows done
++2:
++        vmul.u16    q0, q11, q3                @ loop 2: q11 unshifted, q10 shifted
++        subs        r12, r4
++        vmla.u16    q0, q10, q2
++        ittt        cc
++        asrcc       r1, r8, #8
++        addcc       r12, #32
++        addcc       r1, r2, r1, lsl #2
++        vext.16     q8, q10, q10, #6           @ prepare the next window pair in q8 and q9
++        rsb         r6, r12, #32
++        vmov        q9, q10
++        sub         r5, #1
++        vrshr.u16   q0, q0, #5
++        it          cc
++        addcc       r8, r7
++        vld1.32     {d16[0]}, [r1]
++        teq         r5, #0
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3
++        bhi         2b                         @ rows left and no wrap: keep q10 and q11
++        bne         1b                         @ rows left and wrapped: switch to q8 and q9
++        bcc         5f                         @ loop rows done and wrapped: last row from q8 and q9
++3:
++        vmul.u16    q0, q11, q3                @ last row from q10 and q11
++        vmla.u16    q0, q10, q2
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r8, pc}
++4:
++        bcc         3b                         @ left loop 1 with a wrap: last row from q10 and q11
++5:
++        vmul.u16    q0, q9, q3                 @ last row from q8 and q9
++        vmla.u16    q0, q8, q2
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        vld1.16     {q9}, [r1]!                @ q9 = 4 pairs from top
++        rsb         r12, r6, #32               @ r12 = 32 - frac
++        vdup.16     q2, r6                     @ q2 = frac weight
++        vdup.16     q3, r12                    @ q3 = 32 - frac weight
++        vext.16     q8, q9, q9, #2             @ q8 = q9 moved back one pair ...
++        vld1.32     {d17[1]}, [r1]!            @ ... with the next top pair in the high position
++        mov         r5, #3                     @ 3 rows come from the loops, the 4th from one of the tails
++1:
++        vmul.u16    q0, q8, q2                 @ loop 1: q8 shifted, q9 unshifted
++        subs        r12, r4                    @ step the weight - C clear when it wraps below zero
++        vmla.u16    q0, q9, q3
++        it          cc
++        addcc       r12, #32
++        vext.16     q10, q8, q8, #2            @ prepare the next window pair in q10 and q11
++        rsb         r6, r12, #32
++        vld1.32     {d21[1]}, [r1]
++        sub         r5, #1
++        vmov        q11, q8
++        teq         r5, #0                     @ relies on teq #0 leaving C from the subs intact
++        vrshr.u16   q0, q0, #5                 @ (x + 16) >> 5
++        it          cc
++        addcc       r1, #4                     @ on wrap: the pair just loaded is consumed, move r1 on
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3
++        bhi         1b                         @ rows left and no wrap: keep q8 and q9
++        beq         4f                         @ loop rows done
++2:
++        vmul.u16    q0, q10, q2                @ loop 2: q10 shifted, q11 unshifted
++        subs        r12, r4
++        vmla.u16    q0, q11, q3
++        it          cc
++        addcc       r12, #32
++        vext.16     q8, q10, q10, #2           @ prepare the next window pair in q8 and q9
++        rsb         r6, r12, #32
++        vld1.32     {d17[1]}, [r1]
++        sub         r5, #1
++        vmov        q9, q10
++        teq         r5, #0
++        vrshr.u16   q0, q0, #5
++        it          cc
++        addcc       r1, #4
++        vdup.16     q2, r6
++        vdup.16     q3, r12
++        vst1.16     {q0}, [r0], r3
++        bhi         2b                         @ rows left and no wrap: keep q10 and q11
++        bne         1b                         @ rows left and wrapped: switch to q8 and q9
++        bcc         5f                         @ loop rows done and wrapped: last row from q8 and q9
++3:
++        vmul.u16    q0, q10, q2                @ last row from q10 and q11
++        vmla.u16    q0, q11, q3
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r8, pc}
++4:
++        bcc         3b                         @ left loop 1 with a wrap: last row from q10 and q11
++5:
++        vmul.u16    q0, q8, q2                 @ last row from q8 and q9
++        vmla.u16    q0, q9, q3
++        vrshr.u16   q0, q0, #5
++        vst1.16     {q0}, [r0]
++
++        pop         {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++@ Writes 8 rows of 8 sample pairs (32 bytes per row) to _src - only r4-r8 and lr are saved, so no other callee-saved register may be used
++function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
++        ldr         r12, [sp]                  @ r12 = mode, fetched before the push moves sp
++        push        {r4-r8, lr}
++        ADRT        r4, angle_2 - 2            @ biased by 2 so that mode (2..34) indexes the byte table directly
++        ADRT        r7, inv_angle - 11*2       @ halfword table biased by 11 entries
++        add         r7, r7, r12, lsl #1        @ r7 -> inv_angle entry for mode - only dereferenced on paths 10 and 18
++        lsl         r3, #2                     @ row pitch in bytes = stride * 4
++        ldrsb       r6, [r4, r12]              @ r6 = angle table value: starting fraction
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]              @ r4 = same table value: per-step fraction increment
++        bge         26f
++        cmp         r12, #18
++        bge         18f
++        cmp         r12, #10
++        bge         10f
++
++@ Down of Horizontal - works down left
++        mov         r1,  r2             @ save r2 - r1 unused by patch_down
++
++        bl          patch_h_down_c_4x4_10           @ top band: 2 patches side by side
++        bl          patch_h_down_c_4x4_10_continue
++
++        add         r2, r1, #4*4        @ restore r2, but 4 rows further down left
++        sub         r0, #32                    @ back to the left edge: 2 patches of 16 bytes
++        mov         r6, r4                     @ restart the fraction for the second band
++        add         r0, r0, r3, lsl #2         @ 4 rows down
++
++        bl          patch_h_down_c_4x4_10           @ bottom band
++        bl          patch_h_down_c_4x4_10_continue
++
++        pop         {r4-r8, pc}
++
++@ Up of Horizontal - works down up
++10:
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        mov         r8, #-128
++        sub         r8, r7                     @ r8 = -128 - inv angle: the helper adds r7 before first use
++
++        push        {r2, r8}                   @ the helper updates both - keep the band start values
++        bl          patch_h_up_c_4x4_10             @ top band: 2 patches side by side
++        bl          patch_h_up_c_4x4_10_continue
++        pop         {r2, r8}
++
++        sub         r0, #32                    @ back to the left edge: 2 patches of 16 bytes
++        mov         r6, r4                     @ restart the fraction for the second band
++        add         r2, #16                    @ left ptr 4 pairs on
++        sub         r8, r8, r7, lsl #2         @ accumulator start moves back by 4 inverse angle steps
++        add         r0, r0, r3, lsl #2         @ 4 rows down
++
++        bl          patch_h_up_c_4x4_10             @ bottom band
++        bl          patch_h_up_c_4x4_10_continue
++
++        pop         {r4-r8, pc}
++
++@ Left of vertical - works down left
++18:
++        vld1.16     {q0-q1}, [r1]              @ q0-q1 = 8 pairs from top - r1 is dead after this load
++        sub         r1, r2, #4                 @ r1 reused as the left cursor (was r9, which is callee-saved and not in the push list)
++        rsb         r12, r6, #32               @ r12 = 32 - frac
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        mov         r8, #-128                  @ inverse angle accumulator, used as r8 >> 8
++        vdup.16     q9, r6                     @ q9 = frac weight
++        vdup.16     q10, r12                   @ q10 = 32 - frac weight
++        mov         r5, #8                     @ row counter
++1:
++        vld1.32     {d17[1]}, [r1]             @ next reference pair into the top of q8
++        add         r8, r7
++        vmov        q2, q0                     @ q2-q3 = window before the shift
++        vmov        q3, q1
++        asr         r1, r8, #8
++        vext.16     q1, q0, q1, #6             @ shift q0-q1 along by one pair ...
++        add         r1, r2, r1, lsl #2         @ r1 -> pair number r8 >> 8 of left for the next shift
++        vext.16     q0, q8, q0, #6             @ ... bringing d17[1] in as the first pair
++2:
++        vmul.u16    q11, q2, q10               @ unshifted * (32 - frac)
++        subs        r12, r4                    @ step the weight - C clear when it wraps below zero
++        vmla.u16    q11, q0, q9                @ + shifted * frac
++        it          cc
++        addcc       r12, #32
++        vmul.u16    q12, q3, q10
++        rsb         r6, r12, #32
++        vmla.u16    q12, q1, q9
++        sub         r5, #1
++        teq         r5, #0                     @ relies on teq #0 leaving C from the subs intact
++        vdup.16     q9, r6
++        vdup.16     q10, r12
++        vrshr.u16   q11, q11, #5               @ (x + 16) >> 5
++        vrshr.u16   q12, q12, #5
++        vst1.16     {q11-q12}, [r0], r3
++        bhi         2b                         @ rows left and no wrap: same window
++        bne         1b                         @ rows left and wrapped: shift the window
++
++        pop         {r4-r8, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        add         r5, r1, #32
++        vld1.16     {q0-q1}, [r1]!             @ q0-q1 = 8 pairs from top
++        rsb         r12, r6, #32               @ r12 = 32 - frac
++        vld1.32     {d16[0]}, [r5]             @ pair 8 of top into the low pair of q8
++        mov         r5, #8                     @ row counter
++        vdup.16     q9, r6                     @ q9 = frac weight
++        vdup.16     q10, r12                   @ q10 = 32 - frac weight
++1:
++        vmov        q2, q0                     @ q2-q3 = window before the shift
++        add         r1, #4                     @ r1 -> the pair to fetch for the following shift
++        vmov        q3, q1
++        vext.16     q0, q0, q1, #2             @ shift q0-q1 along by one pair ...
++        vext.16     q1, q1, q8, #2             @ ... bringing d16[0] in as the last pair
++2:
++        vmul.u16    q11, q0, q9                @ shifted * frac
++        subs        r12, r4                    @ step the weight - C clear when it wraps below zero
++        vmla.u16    q11, q2, q10               @ + unshifted * (32 - frac)
++        it          cc
++        addcc       r12, #32
++        vmul.u16    q12, q1, q9
++        rsb         r6, r12, #32
++        vmla.u16    q12, q3, q10
++        sub         r5, #1
++        vld1.32     {d16[0]}, [r1]             @ reload the pending pair every row - r1 only moves at label 1
++        teq         r5, #0                     @ relies on teq #0 leaving C from the subs intact
++        vdup.16     q9, r6
++        vdup.16     q10, r12
++        vrshr.u16   q11, q11, #5               @ (x + 16) >> 5
++        vrshr.u16   q12, q12, #5
++        vst1.16     {q11-q12}, [r0], r3
++        bhi         2b                         @ rows left and no wrap: same window
++        bne         1b                         @ rows left and wrapped: shift the window
++
++        pop         {r4-r8, pc}
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_angular_c_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride        [r3]
++@       unsigned int mode       [sp, #0]  2..34
++@ Writes 16 rows of 16 sample pairs (64 bytes per row) to _src - mode selects one of four code paths below
++function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
++        ldr         r12, [sp]                  @ r12 = mode, fetched before the pushes move sp
++        push        {r4-r10, lr}
++        ADRT        r4, angle_2 - 2            @ biased by 2 so that mode (2..34) indexes the byte table directly
++        ADRT        r7, inv_angle - 11*2       @ halfword table biased by 11 entries
++        add         r7, r7, r12, lsl #1        @ r7 -> inv_angle entry for mode - only dereferenced on paths 10 and 18
++        lsl         r3, #2                     @ row pitch in bytes = stride * 4
++        vpush       {d8}                       @ paths 18 and 26 use q4 - d9 is stashed separately there
++        ldrsb       r6, [r4, r12]              @ r6 = angle table value: starting fraction
++        cmp         r12, #26
++        ldrsb       r4, [r4, r12]              @ r4 = same table value: per-step fraction increment
++        bge         26f
++        cmp         r12, #18
++        bge         18f
++        cmp         r12, #10
++        bge         10f
++
++@ Down of Horizontal - works down left
++        add         sp, #8                     @ drop the saved d8 unrestored - the patch helpers in this file do not write q4
++        mov         r10, #4                    @ 4 bands of 4 rows
++        mov         r1, r2                     @ r1 = left ptr for the current band - the helper advances r2
++1:
++        bl          patch_h_down_c_4x4_10           @ 4 patches side by side per band
++        bl          patch_h_down_c_4x4_10_continue
++        bl          patch_h_down_c_4x4_10_continue
++        bl          patch_h_down_c_4x4_10_continue
++
++        add         r2, r1, #4*4         @ restore r2, but 4 rows further down left
++        add         r1, r1, #4*4
++        mov         r6, r4                     @ restart the fraction for the next band
++        sub         r0, #64                    @ back to the left edge: 4 patches of 16 bytes
++        subs        r10, #1
++        add         r0, r0, r3, lsl #2         @ 4 rows down
++        bne         1b
++
++        pop         {r4-r10, pc}
++
++@ Up of Horizontal - works down up
++10:
++        add         sp, #8                     @ drop the saved d8 unrestored, as on the path above
++        mov         r10, #4                    @ 4 bands of 4 rows
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        mov         r8, #-128
++        sub         r8, r7                     @ r8 = -128 - inv angle: the helper adds r7 before first use
++2:
++        push        {r2, r8}                   @ the helper updates both - keep the band start values
++        bl          patch_h_up_c_4x4_10             @ 4 patches side by side per band
++        bl          patch_h_up_c_4x4_10_continue
++        bl          patch_h_up_c_4x4_10_continue
++        bl          patch_h_up_c_4x4_10_continue
++        pop         {r2, r8}
++
++        sub         r0, #64                    @ back to the left edge: 4 patches of 16 bytes
++        mov         r6, r4                     @ restart the fraction for the next band
++        add         r2, #16                    @ left ptr 4 pairs on
++        sub         r8, r8, r7, lsl #2         @ accumulator start moves back by 4 inverse angle steps
++        add         r0, r0, r3, lsl #2         @ 4 rows down
++        subs        r10, #1
++        bne         2b
++
++        pop         {r4-r10, pc}
++
++@ Left of vertical - works down left
++18:
++        add         r5, r1, #32
++        vld1.16     {q1-q2}, [r1]              @ q1-q4 = 16 pairs from top
++        rsb         r12, r6, r6, lsl #16       @ r12 = (angle << 16) - angle
++        vld1.16     {q3-q4}, [r5]
++        sub         r9, r2, #4                 @ r9 -> pair just before left[0]
++        rsb         r4, r12, #0                @ r4 = packed step: low half += angle, high half -= angle
++        rsb         r12, r12, #32 << 16        @ r12 = packed weights: low half frac, high half 32 - frac
++        ldrh        r7, [r7]                   @ r7 = inverse angle for this mode
++        mov         r8, #-128                  @ inverse angle accumulator, used as r8 >> 8
++        vmov        d0, d9                     @ stash d9 in d0 - q4 is about to be used and only d8 was pushed
++        vmov        s2, r12                    @ d1[0] = frac, d1[1] = 32 - frac
++        add         r10, r0, #32               @ r10 -> right half of the output row
++        mov         r5, #16                    @ row counter
++1:
++        vld1.32     {d1[1]}, [r9]              @ next reference pair into the top of q0
++        add         r8, r7
++        vmov        q11, q4                    @ q8-q11 = window before the shift
++        vmov        q10, q3
++        asr         r9, r8, #8
++        vmov        q9, q2
++        add         r9, r2, r9, lsl #2         @ r9 -> pair number r8 >> 8 of left for the next shift
++        vmov        q8, q1
++        vext.16     q4, q3, q4, #6             @ shift q1-q4 along by one pair ...
++        vext.16     q3, q2, q3, #6
++        vext.16     q2, q1, q2, #6
++        vext.16     q1, q0, q1, #6             @ ... bringing d1[1] in as the first pair
++2:
++        vmul.u16    q12, q8, d1[1]             @ NOTE(review): d1[1] indexes 16-bit lane 1 (32 - frac) while the load above wrote the 32-bit lane d1[1] - notation clash, not a register overlap
++        adds        r12, r4                    @ step both packed weights - C clear when the high half wraps
++        vmla.u16    q12, q1, d1[0]             @ + shifted * frac
++        it          cc
++        addcc       r12, #32 << 16             @ on wrap put 32 back into the high half ...
++        vmul.u16    q13, q9, d1[1]
++        it          cc
++        subcc       r12, #32                   @ ... and take 32 off the low half
++        vmla.u16    q13, q2, d1[0]
++        sub         r5, #1
++        vmul.u16    q14, q10, d1[1]
++        teq         r5, #0                     @ relies on teq #0 leaving C from the adds intact
++        vmla.u16    q14, q3, d1[0]
++        vmul.u16    q15, q11, d1[1]
++        vmla.u16    q15, q4, d1[0]
++        vmov        s2, r12                    @ refresh the weights for the next row
++        vrshr.u16   q12, q12, #5               @ (x + 16) >> 5
++        vrshr.u16   q13, q13, #5
++        vrshr.u16   q14, q14, #5
++        vrshr.u16   q15, q15, #5
++        vst1.16     {q12-q13}, [r0], r3
++        vst1.16     {q14-q15}, [r10], r3
++        bhi         2b                         @ rows left and no wrap: same window
++        bne         1b                         @ rows left and wrapped: shift the window
++
++        vpop        {d8}
++        vmov        d9, d0                     @ restore d9 stashed on entry to this path
++        pop         {r4-r10, pc}
++
++@ Right of vertical - works along top - left unused
++26:
++        add         r5, r1, #32
++        vld1.16     {q1-q2}, [r1]              @ q1-q4 = 16 pairs from top
++        rsb         r12, r6, r6, lsl #16       @ r12 = (angle << 16) - angle
++        vld1.16     {q3-q4}, [r5]
++        add         r1, r1, #64                @ r1 -> pair 16 of top
++        rsb         r4, r12, #0                @ r4 = packed step: low half += angle, high half -= angle
++        rsb         r12, r12, #32 << 16        @ r12 = packed weights: low half frac, high half 32 - frac
++        vmov        d1, d9                     @ stash d9 in d1 - q4 is about to be used and only d8 was pushed
++        vmov        s1, r12                    @ d0[2] = frac, d0[3] = 32 - frac
++        add         r10, r0, #32               @ r10 -> right half of the output row
++        mov         r5, #16                    @ row counter
++1:
++        vld1.32     {d0[0]}, [r1]!             @ next top pair into the low pair of q0
++        vmov        q8, q1                     @ q8-q11 = window before the shift
++        vmov        q9, q2
++        vmov        q10, q3
++        vmov        q11, q4
++        vext.16     q1, q1, q2, #2             @ shift q1-q4 along by one pair ...
++        vext.16     q2, q2, q3, #2
++        vext.16     q3, q3, q4, #2
++        vext.16     q4, q4, q0, #2             @ ... bringing d0[0] in as the last pair
++2:
++        vmul.u16    q12, q1, d0[2]             @ shifted * frac
++        adds        r12, r4                    @ step both packed weights - C clear when the high half wraps
++        vmla.u16    q12, q8, d0[3]             @ + unshifted * (32 - frac)
++        it          cc
++        addcc       r12, #32 << 16             @ on wrap put 32 back into the high half ...
++        vmul.u16    q13, q2, d0[2]
++        it          cc
++        subcc       r12, #32                   @ ... and take 32 off the low half
++        vmla.u16    q13, q9, d0[3]
++        sub         r5, #1
++        vmul.u16    q14, q3, d0[2]
++        teq         r5, #0                     @ relies on teq #0 leaving C from the adds intact
++        vmla.u16    q14, q10, d0[3]
++        vmul.u16    q15, q4, d0[2]
++        vmla.u16    q15, q11, d0[3]
++        vmov        s1, r12                    @ refresh the weights for the next row
++        vrshr.u16   q12, q12, #5               @ (x + 16) >> 5
++        vrshr.u16   q13, q13, #5
++        vrshr.u16   q14, q14, #5
++        vrshr.u16   q15, q15, #5
++        vst1.16     {q12-q13}, [r0], r3
++        vst1.16     {q14-q15}, [r10], r3
++        bhi         2b                         @ rows left and no wrap: same window
++        bne         1b                         @ rows left and wrapped: shift the window
++
++        vpop        {d8}
++        vmov        d9, d1                     @ restore d9 stashed on entry to this path
++        pop         {r4-r10, pc}
++
++endfunc
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+new file mode 100644
+index 0000000000..df8c1c25b9
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
+@@ -0,0 +1,705 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ Fills a 4x4 byte block with the mean of 4 top and 4 left bytes, with the first row and first column blended towards top and left
++function ff_hevc_rpi_pred_dc_4_neon_8, export=1
++
++        @ Average the els of top & left
++        ldr         r2, [r2]      @ r2 = the 4 left bytes
++        vld1.32     {d0[0]}, [r1] @ low word of d0 = the 4 top bytes
++        mov         r1, #2
++        vmov        s1, r2        @ d0 = top[0..3], left[0..3]
++        vmov        s2, r2        @ low word of d1 = left[0..3]
++        vmov.i16    q2, #3
++        add         r2, r0, r3    @ r2 -> row 1
++        vaddl.u8    q1, d0, d1    @ d2[0] = top[0] + left[0]
++        lsl         r3, #1        @ r0 and r2 each step 2 rows from here on
++        vmovl.u8    q0, d0        @ q0 = top and left widened to 16 bits
++        vmov.i64    d7, #0xffff   @ mask selecting 16-bit lane 0
++        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
++        vpadd.i16   d6, d2, d2    @ 2 (top & bottom of vector the same)
++        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++        @ top line gets some smoothing
++        @ (top[i] + 3*dc + 2) >> 2
++        @ as does left
++        @ top_line[0] is extra special
++        @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++        vmov.i64    d7, #0xff     @ mask selecting byte 0
++        vpadd.i16   d6, d6        @ 1 (all the same)
++        vrshr.u16   d6, #3        @ dc = (sum of 8 + 4) >> 3
++        vmla.i16    q0, q2, d6[0] @ add 2*dc to lane 0 and 3*dc to the rest
++        vdup.8      d6, d6[0]     @ d6 = dc in every byte
++        vrshrn.i16  d0, q0, #2    @ d0 bytes = smoothed top[0..3], smoothed left[0..3]
++
++        @ Store top line
++        vst1.32     {d0[0]}, [r0], r3
++
++        @ Store the rest
++        vshr.u64    d1, d0, #5*8  @ byte 0 = smoothed left[1]
++        vshr.u64    d2, d0, #6*8  @ byte 0 = smoothed left[2]
++        vshr.u64    d3, d0, #7*8  @ byte 0 = smoothed left[3]
++        vbif        d1, d6, d7    @ keep byte 0, fill the other bytes with dc
++        vbif        d2, d6, d7
++        vst1.32     {d1[0]}, [r2], r3   @ row 1
++        vbif        d3, d6, d7
++        vst1.32     {d2[0]}, [r0]       @ row 2
++        vst1.32     {d3[0]}, [r2]       @ row 3
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ Fills 4 rows of 8 bytes with the rounded mean of top and left, even and odd bytes averaged as two separate channels
++function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
++
++        @ Average the els of top & left
++        vld1.8      {d0}, [r1]   @ 8 bytes of top
++        vld1.8      {d1}, [r2]   @ 8 bytes of left
++A       add         r2, r0, r3, lsl #1
++A       lsl         r3, #2
++T       lsl         r3, #1
++T       add         r2, r0, r3
++T       lsl         r3, #1
++        vaddl.u8    q0, d0, d1   @ A or T lines above (presumably ARM-only or Thumb-only) both give r2 = r0 + 2 * stride and r3 = 4 * stride
++        vadd.i16    d0, d1       @ d0 has 2 val pairs
++        vpadd.i32   d2, d0, d0   @ This adds U & V separately
++        vpadd.i32   d3, d0, d0   @ q1 = the two channel sums repeated in alternate 16-bit lanes
++        vrshrn.u16  d0, q1, #3   @ (sum of 8 + 4) >> 3, narrowed to bytes
++
++        @ Store
++        vst1.8      {d0}, [r0], r3   @ r0 takes rows 0 and 2
++        vst1.8      {d0}, [r2], r3   @ r2 takes rows 1 and 3
++        vst1.8      {d0}, [r0]
++        vst1.8      {d0}, [r2]
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ Fills an 8x8 byte block with the mean of 8 top and 8 left bytes, with the first row and first column blended towards top and left
++function ff_hevc_rpi_pred_dc_8_neon_8, export=1
++
++        @ Average the els of top & left
++        vld1.8      {d0}, [r1]    @ 8 bytes of top
++        mov         r1, #2
++        vld1.8      {d16}, [r2]   @ 8 bytes of left
++        vmov.i16    q2, #3
++        vmov.i64    d7, #0xffff   @ mask selecting 16-bit lane 0
++        vaddl.u8    q1, d0, d16   @ d2[0] = top[0] + left[0]
++        vmovl.u8    q0, d0        @ q0 = top widened to 16 bits
++        vadd.i16    d6, d2, d3    @ d6 has 4 vals
++        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
++        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..7]
++
++        @ top line gets some smoothing
++        @ (top[i] + 3*dc + 2) >> 2
++        @ as does left
++        @ top_line[0] is extra special
++        @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++        vmov.i64    d7, #0xff     @ mask selecting byte 0
++        vmovl.u8    q1, d16       @ q1 = left widened to 16 bits
++        vpadd.i16   d6, d6        @ 2 (top & bottom of vector the same)
++        vpadd.i16   d6, d6        @ 1 (all the same)
++        vrshr.u16   d6, #4        @ dc = (sum of 16 + 8) >> 4
++        vmla.i16    q1, q2, d6[0] @ left + 3*dc (lane 0 gets 2*dc but is shifted out below)
++        vmla.i16    q0, q2, d6[0] @ top + 3*dc, lane 0 = top[0] + left[0] + 2*dc
++        vdup.8      d6, d6[0]     @ d6 = dc in every byte
++        vrshrn.i16  d2, q1, #2    @ d2 = smoothed left bytes
++        vrshrn.i16  d0, q0, #2    @ d0 = smoothed top row
++
++        @ Store top line
++        vst1.8      {d0}, [r0], r3
++
++        @ Store the rest
++        vshr.u64    d2, #8        @ discard left[0] - byte 0 is now smoothed left[1]
++        vbit        d6, d2, d7    @ row = dc everywhere except byte 0 taken from d2
++        vshr.u64    d2, #8
++        vst1.8      {d6}, [r0], r3    @ row 1
++        mov         r1, #6        @ 6 rows remain, 2 per loop pass
++1:
++        vbit        d6, d2, d7
++        vshr.u64    d2, #8
++        vst1.8      {d6}, [r0], r3
++        subs        r1, #2
++        vbit        d6, d2, d7
++        vshr.u64    d2, #8
++        vst1.8      {d6}, [r0], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ Fills 8 rows of 16 bytes with the rounded mean of top and left, even and odd bytes averaged as two separate channels
++function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
++
++        @ Average the els of top & left
++        vld1.8      {q0}, [r1]    @ 16 bytes of top
++        mov         r1, #8        @ row counter for the store loop
++        vld1.8      {q1}, [r2]    @ 16 bytes of left
++T       lsl         r3, #1
++        vaddl.u8    q0, d0, d1    @ fold top in half - even lanes stay one channel, odd lanes the other
++A       add         r2, r0, r3, lsl #1
++A       lsl         r3, #2
++T       add         r2, r0, r3
++T       lsl         r3, #1
++        vaddl.u8    q1, d2, d3    @ A or T lines above (presumably ARM-only or Thumb-only) both give r2 = r0 + 2 * stride and r3 = 4 * stride
++        vadd.i16    q1, q0
++        vadd.i16    d3, d2        @ d3 has 2 val pairs
++        vpadd.i32   d2, d3, d3    @ This add U & V separately
++        vpadd.i32   d3, d3, d3    @ q1 = the two channel sums repeated in alternate 16-bit lanes
++        vrshrn.u16  d0, q1, #4    @ (sum of 16 + 8) >> 4, narrowed to bytes
++        vrshrn.u16  d1, q1, #4
++
++        @ Store
++1:
++        vst1.8      {q0}, [r0], r3    @ r0 takes the even rows
++        subs        r1, #4            @ 4 rows per pass
++        vst1.8      {q0}, [r2], r3    @ r2 takes the odd rows
++        vst1.8      {q0}, [r0], r3
++        vst1.8      {q0}, [r2], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_8, export=1
++        @ DC prediction: 16 rows of 16 x 8-bit pels, with top/left edge smoothing
++        @ Average the els of top & left
++        vld1.8      {q8}, [r1]    @ q8 = top[0..15]
++        mov         r1, #2
++        vld1.8      {q9}, [r2]    @ q9 = left[0..15]
++        vaddl.u8    q10, d16, d17
++        vaddl.u8    q11, d16, d18 @ d22[0] = top[0] + left[0]
++        vaddl.u8    q0, d18, d19
++        vmov.i16    q1, #3
++        vadd.i16    q10, q0
++        vmovl.u8    q0, d18       @ q0 = left[0..7] widened
++        vadd.i16    d20, d21
++        vmov.i16    d2[0], r1     @ 2, 3, 3, 3...
++
++        @ top line gets some smoothing
++        @ (top[i] + 3*dc + 2) >> 2
++        @ as does left
++        @ top_line[0] is extra special
++        @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++        vmovl.u8    q2, d16       @ q2 = top[0..7] widened
++        vmovl.u8    q9, d19       @ q9 = left[8..15] widened
++        vpadd.i16   d20, d20      @ 2 (top & bottom of vector the same)
++        vmov.i64    d7, #0xffff   @ mask: 16-bit lane 0 only
++        vmovl.u8    q8, d17       @ q8 = top[8..15] widened
++        vbit        d4, d22, d7   @ q2 = top[0]+left[0], top[1..7]
++        vmov.i64    d7, #0xff     @ mask: byte lane 0 only
++        vpadd.i16   d20, d20      @ 1 (all the same)
++        vrshr.u16   d21, d20, #5  @ q10 = dc = (sum of 32 els + 16) >> 5
++        vrshr.u16   d20, d20, #5
++        vmla.i16    q0, q10, d2[1]   @ left + 3*dc
++        vmla.i16    q9, q10, d2[1]
++        vmla.i16    q2, q10, q1      @ top + (2, 3, 3...)*dc
++        vmla.i16    q8, q10, d2[1]
++        vdup.8      q1, d20[0]    @ q1 = dc in every byte
++        vrshrn.i16  d0, q0, #2    @ q0 = smoothed left
++        vrshrn.i16  d1, q9, #2
++        vrshrn.i16  d4, q2, #2    @ q2 = smoothed top line
++        vrshrn.i16  d5, q8, #2
++        vext.8      q0, q0, q0, #1   @ rotate: smoothed left[1] now in byte 0
++
++        @ Store top line
++        vst1.8      {q2}, [r0], r3
++
++        @ Store the rest
++        mov         r1, #15       @ 15 rows follow the top line
++1:
++        vbit        d2, d0, d7    @ column 0 = next smoothed left pel
++        vext.8      q0, q0, q0, #1
++        subs        r1, #1
++        vst1.8      {q1}, [r0], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
++        @ Chroma DC prediction: 16 rows of 32 bytes, U & V averaged separately, no smoothing
++        @ Average the els of top & left
++        vld1.8      {q0-q1}, [r1] @ 32 bytes of top
++        mov         r1, #16       @ r1 = rows still to store
++        vld1.8      {q2-q3}, [r2] @ 32 bytes of left
++T       lsl         r3, #1
++        vaddl.u8    q0, d0, d1
++A       add         r2, r0, r3, lsl #1
++T       add         r2, r0, r3
++        vaddl.u8    q1, d2, d3
++A       lsl         r3, #2
++T       lsl         r3, #1
++        vaddl.u8    q2, d4, d5
++        vaddl.u8    q3, d6, d7
++        vadd.i16    q0, q1
++        vadd.i16    q2, q3
++        vadd.i16    q0, q2
++        vadd.i16    d0, d1        @ d0 has 2 val pairs
++        vpadd.i32   d4, d0, d0    @ This adds U & V separately
++        vpadd.i32   d5, d0, d0
++        vrshrn.u16  d0, q2, #5    @ (sum + 16) >> 5, narrowed to bytes
++        vrshrn.u16  d1, q2, #5
++        vrshrn.u16  d2, q2, #5
++        vrshrn.u16  d3, q2, #5
++
++        @ Store (after the A/T lines: r2 = r0 + 2*stride arg, r3 = 4*stride arg)
++1:
++        vst1.8      {q0-q1}, [r0], r3
++        subs        r1, #2        @ 2 rows per pass
++        vst1.8      {q0-q1}, [r2], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_dc_32_neon_8, export=1
++        @ DC prediction: 32 rows of 32 x 8-bit pels, no edge smoothing
++        @ Average the els of top & left
++        vld1.8      {q0-q1}, [r1] @ top[0..31]
++        mov         r1, #32       @ r1 = rows still to store
++        vld1.8      {q2-q3}, [r2] @ left[0..31]
++        add         r2, r0, r3    @ r2 = r0 + stride
++        vaddl.u8    q0, d0, d1
++        lsl         r3, #1        @ r3 = 2 * stride
++        vaddl.u8    q1, d2, d3
++        vaddl.u8    q2, d4, d5
++        vaddl.u8    q3, d6, d7
++        vadd.i16    q0, q1
++        vadd.i16    q2, q3
++        vadd.i16    q0, q2
++        vadd.i16    d0, d1        @ d0 has 4 vals
++        vpadd.i16   d0, d0        @ 2 (top & bottom the same)
++        vpadd.i16   d4, d0, d0    @ 1 (all the same)
++        vpadd.i16   d5, d0, d0
++        vrshrn.u16  d0, q2, #6    @ (sum of 64 els + 32) >> 6, narrowed to bytes
++        vrshrn.u16  d1, q2, #6
++        vrshrn.u16  d2, q2, #6
++        vrshrn.u16  d3, q2, #6
++
++        @ Store
++1:
++        vst1.8      {q0-q1}, [r0], r3
++        subs        r1, #2        @ 2 rows per pass
++        vst1.8      {q0-q1}, [r2], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ -----------------------------------------------------------------------------
++@
++@ 10 Bit versions
++@
++@ There is no actual bit depth dependency in this code except that our
++@ intermediate results will overflow the 16 bits they are stored in
++@ All these functions are good to 10 bits - with the worst case being
++@ in dc_32 where we use all 16 bits.
++
++
++@ ff_hevc_rpi_pred_dc_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_dc_4_neon_10, export=1
++        @ DC prediction: 4 rows of 4 x 16-bit pels, with top/left edge smoothing
++        @ Average the els of top & left
++        vld1.16     {d0}, [r1]    @ d0 = top[0..3]
++        mov         r1, #2
++        vld1.16     {d1}, [r2]    @ d1 = left[0..3]
++T       lsl         r3, #1
++        vmov.i16    q2, #3
++A       add         r2, r0, r3, lsl #1
++T       add         r2, r0, r3
++        vadd.u16    d2, d0, d1    @ d2[0] = top[0] + left[0]
++A       lsl         r3, #2
++T       lsl         r3, #1
++        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
++        vmov.i64    d7, #0xffff   @ mask: 16-bit lane 0 only
++        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..3], left[0..3]
++
++        @ top line gets some smoothing
++        @ (top[i] + 3*dc + 2) >> 2
++        @ as does left
++        @ top_line[0] is extra special
++        @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++        vpadd.i16   d6, d2, d2    @ 2 (top & bottom of vector the same)
++        vpadd.i16   d6, d6        @ 1 (all the same)
++        vrshr.u16   d6, #3        @ dc = (sum of 8 els + 4) >> 3
++        vmla.i16    q0, q2, d6[0] @ add 3*dc (2*dc in lane 0)
++        vrshr.u16   q0, #2
++
++        @ Store top line (after the A/T lines: r2 = r0 + 2*stride arg, r3 = 4*stride arg)
++        vst1.16     {d0}, [r0], r3
++
++        @ Store the rest
++        vshr.u64    d3, d1, #1*16 @ lane 0 = smoothed left[1]
++        vshr.u64    d4, d1, #2*16 @ lane 0 = smoothed left[2]
++        vshr.u64    d5, d1, #3*16 @ lane 0 = smoothed left[3]
++        vbif        d3, d6, d7    @ lanes 1..3 = dc, lane 0 kept
++        vbif        d4, d6, d7
++        vst1.16     {d3}, [r2], r3
++        vbif        d5, d6, d7
++        vst1.16     {d4}, [r0]
++        vst1.16     {d5}, [r2]
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
++        @ Chroma DC prediction: 4 rows of 16 bytes (16-bit U,V pairs), no smoothing
++        @ Average the els of top & left
++        vld1.8      {q0}, [r1]    @ q0 = 16 bytes of top
++        vld1.8      {q1}, [r2]    @ q1 = 16 bytes of left
++A       add         r2, r0, r3, lsl #2
++A       lsl         r3, #3
++T       lsl         r3, #2
++T       add         r2, r0, r3
++T       lsl         r3, #1
++        vadd.i16    q0, q1
++        vadd.i16    d0, d1       @ d0 has 2 val pairs
++        vpadd.i32   d2, d0, d0   @ This adds U & V separately
++        vpadd.i32   d3, d0, d0
++        vrshr.u16   q0, q1, #3   @ (sum + 4) >> 3
++        @ After the A/T lines: r2 = r0 + 4*stride arg, r3 = 8*stride arg
++        vst1.16     {q0}, [r0], r3
++        vst1.16     {q0}, [r2], r3
++        vst1.16     {q0}, [r0]
++        vst1.16     {q0}, [r2]
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_dc_8_neon_10, export=1
++        @ DC prediction: 8 rows of 8 x 16-bit pels, with top/left edge smoothing
++        @ Average the els of top & left
++        vld1.16     {q0}, [r1]    @ q0 = top[0..7]
++        mov         r1, #2
++        vld1.16     {q8}, [r2]    @ q8 = left[0..7]
++T       lsl         r3, #1
++        vmov.i16    q2, #3
++A       add         r2, r0, r3, lsl #1
++T       add         r2, r0, r3
++        vadd.i16    q1, q0, q8    @ q1[0] = top[0] + left[0]
++A       lsl         r3, #2
++T       lsl         r3, #1
++        vmov.i64    d7, #0xffff   @ mask: 16-bit lane 0 only
++        vmov.16     d4[0], r1     @ 2, 3, 3, 3...
++        vadd.i16    d6, d2, d3    @ d6 has 4 vals
++        vbit        d0, d2, d7    @ q0 = top[0]+left[0], top[1..7]
++
++        @ top line gets some smoothing
++        @ (top[i] + 3*dc + 2) >> 2
++        @ as does left
++        @ top_line[0] is extra special
++        @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++        vpadd.i16   d6, d6        @ 2 (top & bottom of vector the same)
++        vpadd.i16   d6, d6        @ 1 (all the same)
++        vrshr.u16   d6, #4        @ dc = (sum of 16 els + 8) >> 4
++        vmla.i16    q8, q2, d6[0] @ left + 3*dc (lane 0 is never stored)
++        vmla.i16    q0, q2, d6[0] @ top + 3*dc, lane 0 + 2*dc
++        vdup.16     q2, d6[0]     @ q2 = dc row, used for rows stored via r0
++        vdup.16     q9, d6[0]     @ q9 = dc row, used for rows stored via r2
++        vrshr.u16   q8, q8, #2
++        vrshr.u16   q0, q0, #2
++        vext.16     q1, q8, q8, #1   @ q1 lane 0 = smoothed left[1]
++
++        @ Store top line (after the A/T lines: r2 = r0 + 2*stride arg, r3 = 4*stride arg)
++        vst1.16     {q0}, [r0], r3
++
++        @ Store the rest
++        vbit        d18, d2, d7   @ column 0 = smoothed left[1]
++        vst1.16     {q9}, [r2], r3
++        mov         r1, #6        @ 6 rows remain, 2 per loop pass
++1:
++        vext.16     q8, q8, q8, #2   @ next left pel for the r0 row into lane 0
++        subs        r1, #2
++        vext.16     q1, q1, q1, #2   @ next left pel for the r2 row into lane 0
++        vbit        d4, d16, d7
++        vst1.16     {q2}, [r0], r3
++        vbit        d18, d2, d7
++        vst1.16     {q9}, [r2], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
++        @ Chroma DC prediction: 8 rows of 32 bytes (16-bit U,V pairs), no smoothing
++        @ Average the els of top & left
++        vld1.16     {q0-q1}, [r1] @ 32 bytes of top
++        mov         r1, #8        @ r1 = rows still to store
++        vld1.16     {q2-q3}, [r2] @ 32 bytes of left
++T       lsl         r3, #2
++        vadd.i16    q1, q0
++A       add         r2, r0, r3, lsl #2
++A       lsl         r3, #3
++T       add         r2, r0, r3
++T       lsl         r3, #1
++        vadd.i16    q2, q3
++        vadd.i16    q1, q2
++        vadd.i16    d3, d2        @ d3 has 2 val pairs
++        vpadd.i32   d2, d3, d3    @ This adds U & V separately
++        vpadd.i32   d3, d3, d3
++        vrshr.u16   q0, q1, #4    @ (sum + 8) >> 4
++        vrshr.u16   q1, q1, #4
++
++        @ Store (after the A/T lines: r2 = r0 + 4*stride arg, r3 = 8*stride arg)
++1:
++        vst1.8      {q0-q1}, [r0], r3
++        subs        r1, #2        @ 2 rows per pass
++        vst1.8      {q0-q1}, [r2], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_dc_16_neon_10, export=1
++        @ DC prediction: 16 rows of 16 x 16-bit pels, with top/left edge smoothing
++        @ Average the els of top & left
++        vld1.16     {q8-q9}, [r1]     @ top[0..15]
++        mov         r1, #2
++        vld1.16     {q10-q11}, [r2]   @ left[0..15]
++        lsl         r3, #1        @ stride given in pels
++        vadd.i16    q0, q8, q9
++        vadd.i16    q1, q10, q11
++        vmov.i16    q3, #3
++        vadd.i16    q1, q0
++        vadd.i16    d0, d16, d20  @ d0[0] = top[0] + left[0]
++        vmov.i64    d31, #0xffff  @ mask: 16-bit lane 0 only
++        vadd.i16    d3, d2
++        vmov.16     d6[0], r1     @ 2, 3, 3, 3...
++
++        @ top line gets some smoothing
++        @ (top[i] + 3*dc + 2) >> 2
++        @ as does left
++        @ topline[0] is extra special
++        @ (top[0] + left[0] + 2*dc + 2) >> 2
++
++        vbit        d16, d0, d31  @ q8 = top[0]+left[0], top[1..7]
++        vpadd.i16   d3, d3        @ 2 (top & bottom of vector the same)
++        vpadd.i16   d3, d3        @ 1 (all the same)
++        vrshr.u16   d2, d3, #5    @ q1 = dc = (sum of 32 els + 16) >> 5
++        vrshr.u16   d3, d3, #5
++        vmov        q0, q1        @ q0-q1 = one row of dc
++        vmla.i16    q10, q1, d6[1]   @ left + 3*dc
++        vmla.i16    q11, q1, d6[1]
++        vmla.i16    q8, q1, q3       @ top + (2, 3, 3...)*dc
++        vmla.i16    q9, q1, d6[1]
++        vrshr.u16   q2, q10, #2   @ q2 = smoothed left[0..7]
++        vrshr.u16   q3, q11, #2   @ q3 = smoothed left[8..15]
++        vrshr.u16   q8, #2
++        vrshr.u16   q9, #2
++        vext.16     q2, q2, q2, #1   @ lane 0 = smoothed left[1]
++        mov         r1, #7<<29    @ counter in top 3 bits: 7 passes, then wraps for 8 more
++
++        @ Store top line
++        vst1.16     {q8-q9}, [r0], r3
++
++        @ Store the rest
++1:
++        vbit        d0, d4, d31   @ column 0 = next pel from q2
++        vext.16     q2, q2, q2, #1
++        subs        r1, #1<<29
++        vst1.16     {q0-q1}, [r0], r3
++        bne         1b
++1:
++        vbit        d0, d6, d31   @ column 0 = next pel from q3
++        vext.16     q3, q3, q3, #1
++        subs        r1, #1<<29    @ r1 enters as 0, so 8 passes before it is 0 again
++        vst1.16     {q0-q1}, [r0], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_c_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]  (In pels - needs * 4)
++
++function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
++        @ Chroma DC prediction: 16 rows of 64 bytes (16-bit U,V pairs), no smoothing
++        @ Average the els of top & left
++        vldm        r1, {q0-q3}   @ 64 bytes of top
++        vldm        r2, {q8-q11}  @ 64 bytes of left
++        vadd.i16    q0, q1
++        mov         r1, #16       @ r1 = rows still to store
++        vadd.i16    q2, q3
++        add         r2, r0, #32   @ r2 = second 32 bytes of the same row
++        vadd.i16    q8, q9
++        lsl         r3, #2        @ r3 = stride arg * 4
++        vadd.i16    q10, q11
++        vadd.u16    q0, q2
++        vadd.u16    q8, q10
++        vadd.i16    q0, q8
++        vadd.i16    d0, d1        @ d0 has 2 val pairs
++        vpadd.i32   d4, d0, d0    @ This adds U & V separately
++        vpadd.i32   d5, d0, d0
++        vrshr.u16   q0, q2, #5    @ (sum + 16) >> 5
++        vrshr.u16   q1, q2, #5
++
++        @ Store
++1:
++        vst1.16     {q0-q1}, [r0], r3
++        subs        r1, #1        @ 1 row (two 32-byte halves) per pass
++        vst1.16     {q0-q1}, [r2], r3
++        bne         1b
++
++        bx           lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_dc_32_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]  (In pels)
++
++function ff_hevc_rpi_pred_dc_32_neon_10, export=1
++        @ DC prediction: 32 rows of 32 x 16-bit pels, no edge smoothing
++        @ Average the els of top & left
++        @ With 10 bits we are (just) safe from overflow in i16
++        vldm        r1, {q0-q3}   @ top[0..31]
++        vldm        r2, {q8-q11}  @ left[0..31]
++        vadd.i16    q0, q1
++        mov         r1, #32       @ r1 = rows still to store
++        vadd.i16    q2, q3
++        add         r2, r0, #32   @ r2 = second 32 bytes of the same row
++        vadd.i16    q8, q9
++        lsl         r3, #1        @ r3 = stride arg * 2
++        vadd.i16    q10, q11
++        vadd.u16    q0, q2
++        vadd.u16    q8, q10
++        vadd.i16    q0, q8
++        vadd.i16    d0, d1        @ d0 has 4 vals
++        vpadd.i16   d0, d0        @ 2 (top & bottom the same)
++        vpadd.i16   d4, d0, d0    @ 1 (all the same)
++        vpadd.i16   d5, d0, d0
++        vrshr.u16   q0, q2, #6    @ (sum of 64 els + 32) >> 6
++        vrshr.u16   q1, q2, #6
++
++        @ Store
++1:
++        vst1.16     {q0-q1}, [r0], r3
++        subs        r1, #1        @ 1 row (two 32-byte halves) per pass
++        vst1.16     {q0-q1}, [r2], r3
++        bne         1b
++
++        bx           lr
++endfunc
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
+new file mode 100644
+index 0000000000..f6969d3591
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
+@@ -0,0 +1,881 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ All functions have the call
++@
++@ int ff_hevc_rpi_intra_filter_N_neon_PW(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++@
++@ Assumptions:
++@ (that wouldn't apply to all frame layouts but do apply to sand, so beware
++@  if reusing this code)
++@
++@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
++@ N==4, but do for chroma N>=8.  As we share Y/C fns that means we can ignore
++@ N==8,PW=8 (chroma always PW>8) but have to cope for larger
++@
++@ We always have at least 64 pixel H frame width rounding - this lets us
++@ load UR without having to worry about exactly how many pixels are actually
++@ within the frame.  As partial loads will only occur very occasionally this
++@ should be a win in nearly all cases.
++@
++@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
++@ so we do no maths on the contents
++@
++@ No filtering in 32bit fns as they are chroma only
++
++
++.equ    AVAIL_UR, 1            @ bit 0 of req/avail
++.equ    AVAIL_U,  2            @ bit 1
++.equ    AVAIL_UL, 4            @ bit 2
++.equ    AVAIL_L,  8            @ bit 3
++.equ    AVAIL_DL, 16           @ bit 4
++
++.equ    FILTER_LIGHT, 0x40     @ req bit tested before the luma light filter
++.equ    FILTER_STRONG, 0x80
++
++.equ    AVAIL_S_UR_N_U_C, 32 - 1   @ lsls by this: N = UR bit, C = U bit
++.equ    AVAIL_S_U_N_UL_C, 32 - 2   @ lsls by this: N = U bit,  C = UL bit
++.equ    AVAIL_S_UL_N_L_C, 32 - 3   @ lsls by this: N = UL bit, C = L bit
++.equ    AVAIL_S_L_N_DL_C, 32 - 4   @ lsls by this: N = L bit,  C = DL bit
++
++.equ    AVAIL_S_U_DL_CPSR, 31 - 4  @ Shift for u..dl to go into flags via cpsr
++
++@ On entry
++@  r2   req
++@  r3   avail
++@ [sp, #sp_offset...]  args
++@
++@ On Exit:
++@
++@ Extend values:
++@  d_l  scalar contains value for L & DL
++@       if DL avail then this is DL[0] so we don't need to load that
++@  d_ul scalar containing value for UL
++@  d_u  scalar containing value for U
++@  d_ur scalar containing value for UR
++@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
++@ This means that L-light-filter works even if nreq DL (we never filter
++@ req-DL without req-L, but we do filter req-L without req-DL)
++@ If UR avail then d_ur == a_ur so U-filter good too
++@
++@ Data load pointers (only load if req & avail):
++@  r4   DL + stride
++@  r10  L
++@  r6   U
++@  r5   UR
++@
++@ Others:
++@  r2   req
++@  r7   req & avail
++@  r3   L + stride
++@  r8   DL + stride * 2
++@  r9   stride * 2
++@  cs   Load U
++@  mi   Load UR
++@
++@ Clobbered:
++@  r12
++
++.macro  load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
++        @ Picks a load address per neighbour from the avail bits, loads the extend values
++.equ    src_l\@,   \sp_offset + 0
++.equ    src_u\@,   \sp_offset + 4
++.equ    src_ur\@,  \sp_offset + 8
++.equ    stride\@,  \sp_offset + 12
++.equ    pw\@,      (1 << \pw_s)                 @ pel width in bytes
++.equ    b_size\@,  (1 << (\pw_s + \log2_s))     @ size in bytes
++
++@ r9    stride
++@                       r7 = ab_ul, r6 = a_u, r5 = a_ur
++@ r4 = b_dl, r10 = b_l,             r8 = b_u
++
++        ldr        r5,  [sp, #src_ur\@]
++        lsl        r12, r3,  #AVAIL_S_U_DL_CPSR   @ avail bits 4..0 -> bits 31..27
++        ldr        r10, [sp, #src_l\@]
++        ldr        r9,  [sp, #stride\@]
++        ldr        r6,  [sp, #src_u\@]
++
++        @ This is quite a slow instruction but it replaces
++        @ a decent number of tests that yield a max of 2 flags/op
++        @ It is annoying we can't branch on Q!
++        @ If L navail (ne) then DL must be navail (pl)
++        msr        APSR_nzcvq, r12      @ n=dl, z=l, c=ul, v=u, q=ur
++
++        mov        r4,  r5              @ r4 = src_ur
++        sub        r7,  r10, r9         @ r7 = src_l - stride
++        it vs
++        movvs      r4,  r6              @ U avail: r4 = src_u
++        add        r8,  r6,  #b_size\@ - pw\@   @ r8 = src_u + block bytes - one pel
++        it cs
++        movcs      r4,  r7              @ UL avail: r4 = src_l - stride
++        ite ne
++        movne      r10, r4              @ L not avail: r10 = r4
++        addeq      r4,  r7,  r9,  lsl #\log2_s   @ L avail: r4 = r7 + (stride << log2_s)
++        it cc
++        movcc      r7,  r10             @ UL not avail: r7 = r10
++        it mi
++        addmi      r4,  r10, r9,  lsl #\log2_s   @ DL avail: r4 = r10 + (stride << log2_s)
++        vld1.\d_type {\d_ul}, [r7]
++        itt vc
++        movvc      r8,  r7              @ U not avail: r8 = r6 = r7
++        movvc      r6,  r7
++        vld1.\d_type {\d_l }, [r4], r9  @ r4 += stride after the load
++        tst        r3,  #AVAIL_UR
++        vld1.\d_type {\d_u }, [r6]
++        it eq
++        moveq      r5,  r8              @ UR not avail: r5 = r8
++        and        r7,  r2,  r3         @ r7 = req & avail
++        add        r8,  r4,  r9
++        vld1.\d_type {\d_ur}, [r5]
++        lsls       r12, r7,  #AVAIL_S_UR_N_U_C   @ flags out: mi = load UR, cs = load U
++        add        r3,  r10, r9         @ r3 = r10 + stride
++        lsl        r9,  #1              @ r9 = stride * 2
++.endm
++
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_8(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++
++.set    sp_base, 8*4           @ bytes pushed on entry (8 regs): offset of the stack args
++.set    pw_s,    0             @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)
++.set    log2_s,  2             @ log2 of block size in pels
++        @ Gathers neighbour pels for a 4-pel block (1-byte pels) into left[] and top[]
++function ff_hevc_rpi_intra_filter_4_neon_8, export=1
++        push       {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
++
++        it cs
++        vldrcs     s2,  [r6]            @ cs: load 4 bytes of U into d1 low half
++        ite pl
++        vmovpl     s3,  s4              @ pl: d1 high half = replicated UR extend value
++        vldrmi     s3,  [r5]            @ mi: load 4 bytes of UR into d1 high half
++
++        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N = L, C = DL of (req & avail)
++        add        r12, r0,  #-pw       @ r12 = left - 1 pel
++        bpl        1f
++
++        vld1.8    {d0[0]}, [r10], r9    @ L rows alternate between r10 and r3
++        vld1.8    {d0[1]}, [r3],  r9
++        vld1.8    {d0[2]}, [r10]
++        vld1.8    {d0[3]}, [r3]
++1:
++        bcc        1f
++        vld1.8    {d0[5]}, [r4],  r9    @ d0[4] already holds the replicated d_l value
++        vld1.8    {d0[6]}, [r8]
++        vld1.8    {d0[7]}, [r4]
++1:
++        vstr       d1,  [r1]            @ Up
++        vst1.8    {d31[7]}, [r12]
++        vstr       d0,  [r0]            @ Left
++        pop       {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_4_neon_16(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++
++.set    sp_base, 8*4           @ bytes pushed on entry (8 regs): offset of the stack args
++.set    pw_s,    1             @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)
++.set    log2_s,  2             @ log2 of block size in pels
++        @ Gathers neighbour pels for a 4-pel block (2-byte pels) into left[] and top[]
++function ff_hevc_rpi_intra_filter_4_neon_16, export=1
++        push       {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
++
++        it cs
++        vldrcs     d2,  [r6]            @ cs: load 4 pels of U
++        it mi
++        vldrmi     d3,  [r5]            @ mi: load 4 pels of UR
++        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N = L, C = DL of (req & avail)
++        add        r12, r0, #-pw        @ r12 = left - 1 pel
++        bpl        1f
++        vld1.16   {d0[0]}, [r10], r9    @ L rows alternate between r10 and r3
++        vld1.16   {d0[1]}, [r3],  r9
++        vld1.16   {d0[2]}, [r10]
++        vld1.16   {d0[3]}, [r3]
++1:
++        bcc        1f
++        vld1.16   {d1[1]}, [r4],  r9    @ d1[0] already holds the replicated d_l value
++        vld1.16   {d1[2]}, [r8]
++        vld1.16   {d1[3]}, [r4]
++1:
++        vst1.16   {q1}, [r1]           @ Up
++        vst1.16   {d31[3]}, [r12]
++        vst1.16   {q0}, [r0]           @ Left
++        pop       {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_8(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++
++.set    sp_base, 8*4           @ bytes pushed on entry (8 regs): offset of the stack args
++.set    pw_s,    0             @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)
++.set    log2_s,  3             @ log2 of block size in pels
++        @ Gathers neighbour pels for an 8-pel block (1-byte pels), optional light filter
++function ff_hevc_rpi_intra_filter_8_neon_8, export=1
++        push      {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
++
++        it cs
++        vldrcs     d4,  [r6]            @ cs: load 8 pels of U
++        it mi
++        vldrmi     d5,  [r5]            @ mi: load 8 pels of UR
++
++        lsls       r7,  #AVAIL_S_L_N_DL_C   @ N = L, C = DL of (req & avail)
++        bpl        1f
++        vld1.8    {d0[0]}, [r10], r9    @ L rows alternate between r10 and r3
++        vld1.8    {d0[1]}, [r3],  r9
++        vld1.8    {d0[2]}, [r10], r9
++        vld1.8    {d0[3]}, [r3],  r9
++        vld1.8    {d0[4]}, [r10], r9
++        vld1.8    {d0[5]}, [r3],  r9
++        vld1.8    {d0[6]}, [r10]
++        vld1.8    {d0[7]}, [r3]
++1:
++        bcc        1f
++        vld1.8    {d1[1]}, [r4],  r9    @ d1[0] already holds the replicated d_l value
++        vld1.8    {d1[2]}, [r8],  r9
++        vld1.8    {d1[3]}, [r4],  r9
++        vld1.8    {d1[4]}, [r8],  r9
++        vld1.8    {d1[5]}, [r4],  r9
++        vld1.8    {d1[6]}, [r8]
++        vld1.8    {d1[7]}, [r4]
++1:
++        tst        r2,  #FILTER_LIGHT
++        add        r12, r0,  #-pw       @ r12 = left - 1 pel
++        beq        10f
++
++        @ Luma light filter
++        vext.8     q8,  q15, q2,  #15   @ q8  = ul, then q2 bytes 0..14 (up shifted by a pel)
++        vext.8     q12, q15, q0,  #15   @ q12 = ul, then q0 bytes 0..14 (left shifted by a pel)
++        vaddl.u8   q9,  d17, d5         @ q8,q9 = p[i-1] + p[i] along up
++        vaddl.u8   q8,  d16, d4
++        vaddl.u8   q13, d25, d1         @ q12,q13 = p[i-1] + p[i] along left
++        vaddl.u8   q12, d24, d0
++        vmov.u8    r3,  d5[7]           @ Save final pel
++        vmov.u8    r2,  d1[7]           @ Save final pel
++
++        vext.16    q2,  q8,  q9,  #1    @ pair sums moved down a lane: p[i] + p[i+1]
++        vext.16    q3,  q9,  q9,  #1    @ (top lane wraps round; final pel restored below)
++        vext.16    q0,  q12, q13, #1
++        vext.16    q1,  q13, q13, #1
++        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
++        vadd.u16   q2,  q8              @ p[i-1] + 2*p[i] + p[i+1]
++        vadd.u16   q3,  q9
++        vadd.u16   q0,  q12
++        vadd.u16   q1,  q13
++
++        vrshrn.u16 d4,  q2,  #2         @ (sum + 2) >> 2, narrowed to bytes
++        vrshrn.u16 d5,  q3,  #2
++        vrshrn.u16 d0,  q0,  #2
++        vrshrn.u16 d1,  q1,  #2
++        vrshr.u16  d30, #2
++        vmov.u8    d5[7], r3            @ Restore final pel
++        vmov.u8    d1[7], r2            @ Restore final pel
++        vdup.u8    d31, d30[0]          @ d31[7] = d30[0] (duplicated to every byte)
++
++10:
++        vst1.8    {q2 }, [r1]           @ Up
++        vst1.8    {d31[7]}, [r12]       @ Up-left
++        vst1.8    {q0 }, [r0]           @ Left
++        pop       {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_16(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++
++.set    sp_base, 8*4           @ bytes pushed on entry (8 regs): offset of the stack args
++.set    ur_size, sp_base + 16  @ stack offset of top_right_size
++.set    dl_size, sp_base + 20  @ stack offset of down_left_size
++.set    pw_s,    1             @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)
++.set    log2_s,  3             @ log2 of block size in pels
++.set    p_size,  (1 << log2_s)          @ size in pels
++        @ Gathers neighbour pels for an 8-pel block (2-byte pels), optional light filter
++function ff_hevc_rpi_intra_filter_8_neon_16, export=1
++        push      {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
++
++        it cs
++        vldmcs     r6,  {d4, d5}        @ cs: load 8 pels of U
++        ldr        r12, [sp, #ur_size]
++        bpl        1f                   @ pl: no UR load, keep the replicated value
++        cmp        r12, #4
++        vldm       r5,  {d6, d7}        @ load 8 pels of UR
++        bgt        1f
++        vdup.16    d7,  d6[3]           @ ur_size <= 4: repeat pel 3 over pels 4..7
++1:
++        lsls       r12, r7,  #AVAIL_S_L_N_DL_C   @ N = L, C = DL of (req & avail)
++        vdup.16    q1,  d0[0]           @ q1 (DL) defaults to the replicated d_l value
++        bpl        1f
++        vld1.16   {d0[0]}, [r10], r9    @ L rows alternate between r10 and r3
++        vld1.16   {d0[1]}, [r3],  r9
++        vld1.16   {d0[2]}, [r10], r9
++        vld1.16   {d0[3]}, [r3],  r9
++        vld1.16   {d1[0]}, [r10], r9
++        vld1.16   {d1[1]}, [r3],  r9
++        vld1.16   {d1[2]}, [r10]
++        vld1.16   {d1[3]}, [r3]
++1:
++        bcc        1f
++        ldr        r12, [sp, #dl_size]
++        vld1.16   {d2[1]}, [r4],  r9    @ d2[0] already holds the replicated d_l value
++        cmp        r12, #p_size
++        vld1.16   {d2[2]}, [r8],  r9
++        vld1.16   {d2[3]}, [r4],  r9
++        blt        2f                   @ dl_size < p_size: stop after DL pel 3
++        vld1.16   {d3[0]}, [r8],  r9
++        vld1.16   {d3[1]}, [r4],  r9
++        vld1.16   {d3[2]}, [r8]
++        vld1.16   {d3[3]}, [r4]
++        b          1f
++2:
++        vdup.16    d3,  d2[3]           @ repeat DL pel 3 over pels 4..7
++1:
++        tst        r2,  #FILTER_LIGHT
++        add        r12, r0,  #-pw       @ r12 = left - 1 pel
++        beq        10f
++
++        @ Luma light filter
++        vext.16    q9,  q2,  q3,  #7    @ q9  = q2 lane 7, then q3 lanes 0..6
++        vext.16    q8,  q15, q2,  #7    @ q8  = ul, then q2 lanes 0..6
++        vext.16    q13, q0,  q1,  #7    @ q13 = q0 lane 7, then q1 lanes 0..6
++        vext.16    q12, q15, q0,  #7    @ q12 = ul, then q0 lanes 0..6
++        vadd.u16   q9,  q3              @ q8,q9 = p[i-1] + p[i] along up
++        vadd.u16   q8,  q2
++        vadd.u16   q13, q1              @ q12,q13 = p[i-1] + p[i] along left
++        vadd.u16   q12, q0
++        vmov.u16   r3,  d7[3]           @ Save final pel
++        vmov.u16   r2,  d3[3]           @ Save final pel
++
++        vext.16    q2,  q8,  q9,  #1    @ pair sums moved down a lane: p[i] + p[i+1]
++        vext.16    q3,  q9,  q9,  #1    @ (top lane wraps round; final pel restored below)
++        vext.16    q0,  q12, q13, #1
++        vext.16    q1,  q13, q13, #1
++        vadd.u16   d30, d16, d24        @ d30[0] = l[0] + 2ul + u[0]
++        vadd.u16   q2,  q8              @ p[i-1] + 2*p[i] + p[i+1]
++        vadd.u16   q3,  q9
++        vadd.u16   q0,  q12
++        vadd.u16   q1,  q13
++
++        vrshr.u16  q2,  #2              @ (sum + 2) >> 2
++        vrshr.u16  q3,  #2
++        vrshr.u16  q0,  #2
++        vrshr.u16  q1,  #2
++        vrshr.u16  d30, #2
++        vmov.u16   d7[3], r3            @ Restore final pel
++        vmov.u16   d3[3], r2            @ Restore final pel
++        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
++
++10:
++        vst1.16   {q2,  q3}, [r1]       @ Up
++        vst1.16   {d31[3]}, [r12]       @ Up-left
++        vst1.16   {q0,  q1}, [r0]       @ Left
++        pop       {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_16_neon_16(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++@ 16-bit pels: writes 32 pels to top, 32 pels to left and the up-left pel to left[-1]; smoothed if req has FILTER_LIGHT
++.set    sp_base, 8*4                    @ bytes pushed by push {r4-r10, lr}
++.set    ur_size, sp_base + 16           @ stack offset of top_right_size after the push
++.set    dl_size, sp_base + 20           @ stack offset of down_left_size after the push
++.set    pw_s,    1                      @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)            @ pel width in bytes
++.set    log2_s,  4                      @ log2 of block size in pels
++.set    p_size,  (1 << log2_s)          @ size in pels
++
++function ff_hevc_rpi_intra_filter_16_neon_16, export=1
++        push      {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"  @ macro not visible here; presumably sets r3-r10 and the C/N flags tested below - confirm
++
++        vdup.16    q9,  d16[0]          @ second half of Up starts as a replicated pel
++        vdup.16    q11, d20[0]          @ second half of Up-right likewise
++
++        it cs
++        vldmcs     r6,  {d16-d19}       @ C set: load 16 Up pels from r6
++        ldr        r12, [sp, #ur_size]
++        bpl        1f                   @ N clear: no Up-right load
++        cmp        r12, #12
++        @ Given chroma frame layout, if UR exists then it is always legit to
++        @ load all of it even if most of it is outside the frame.
++        vldm       r5,  {d20-d23}
++        bgt        1f                   @ more than 12 pels valid: keep everything loaded
++        bge        4f                   @ exactly 12 valid: replicate into d23 only
++        cmp        r12,  #8
++        bge        3f                   @ 8 valid: replicate into d22, d23
++        vdup.16    d21, d20[3]          @ otherwise only d20 valid: replicate its last pel onwards
++3:      vdup.16    d22, d21[3]
++4:      vdup.16    d23, d22[3]
++
++1:
++        lsls       r7,  #AVAIL_S_L_N_DL_C       @ new flags: N gates the Left load, C the Down-left load
++        ldr        r12, [sp, #dl_size]
++        vdup.16    q1,  d0[0]           @ replicate d0[0] through q1-q3 before any lanes are loaded
++        vdup.16    q2,  d0[0]
++        vdup.16    q3,  d0[0]
++        bpl        1f                   @ N clear: no Left load
++        vld1.16   {d0[0]}, [r10], r9    @ 16 Left pels, one lane per row, alternating r10 / r3
++        vld1.16   {d0[1]}, [r3],  r9
++        vld1.16   {d0[2]}, [r10], r9
++        vld1.16   {d0[3]}, [r3],  r9
++        vld1.16   {d1[0]}, [r10], r9
++        vld1.16   {d1[1]}, [r3],  r9
++        vld1.16   {d1[2]}, [r10], r9
++        vld1.16   {d1[3]}, [r3],  r9
++        vld1.16   {d2[0]}, [r10], r9
++        vld1.16   {d2[1]}, [r3],  r9
++        vld1.16   {d2[2]}, [r10], r9
++        vld1.16   {d2[3]}, [r3],  r9
++        vld1.16   {d3[0]}, [r10], r9
++        vld1.16   {d3[1]}, [r3],  r9
++        vld1.16   {d3[2]}, [r10]
++        vld1.16   {d3[3]}, [r3]
++1:
++        bcc        1f                   @ C clear: no Down-left load
++        vld1.16   {d4[1]}, [r4],  r9    @ lane 0 of d4 keeps the value replicated from d0[0] above
++        cmp        r12, #4
++        vld1.16   {d4[2]}, [r8],  r9
++        vld1.16   {d4[3]}, [r4],  r9
++        ble        2f                   @ down_left_size <= 4: replicate the rest
++        vld1.16   {d5[0]}, [r8],  r9
++        vld1.16   {d5[1]}, [r4],  r9
++        cmp        r12, #12
++        vld1.16   {d5[2]}, [r8],  r9
++        vld1.16   {d5[3]}, [r4],  r9
++        blt        3f                   @ fewer than 12 valid: replicate d6, d7
++        vld1.16   {d6[0]}, [r8],  r9
++        vld1.16   {d6[1]}, [r4],  r9
++        vld1.16   {d6[2]}, [r8],  r9
++        vld1.16   {d6[3]}, [r4],  r9
++        ble        4f                   @ exactly 12 valid (flags still from cmp #12): replicate d7
++        vld1.16   {d7[0]}, [r8],  r9
++        vld1.16   {d7[1]}, [r4],  r9
++        vld1.16   {d7[2]}, [r8]
++        vld1.16   {d7[3]}, [r4]
++        b          1f
++2:      vdup.16    d5,  d4[3]
++3:      vdup.16    d6,  d5[3]
++4:      vdup.16    d7,  d6[3]
++1:
++        tst        r2,  #FILTER_LIGHT
++        add        r12, r0,  #-pw       @ r12 -> left[-1], where the up-left pel is stored
++        beq        10f                  @ no filtering requested: store what was gathered
++
++        vpush     {q5}                  @ q5 (d10-d11) is callee-saved; used as scratch until the vpop
++        @ Luma light filter
++        @ Left
++        vext.16    q5,  q2,  q3,  #7    @ each vext #7 gives the vector shifted down one pel (p[i-1])
++        vext.16    q14, q1,  q2,  #7
++        vext.16    q13, q0,  q1,  #7
++        vext.16    q12, q15, q0,  #7    @ d31[3] (up-left) becomes the pel before l[0]
++
++        vadd.u16   q5,  q3              @ p[i-1] + p[i]
++        vadd.u16   q14, q2
++        vadd.u16   q13, q1
++        vadd.u16   q12, q0
++        vmov.u16   r2,  d7[3]           @ Save final pel
++
++        vext.16    q0,  q12, q13, #1    @ pair sums shifted up one pel: p[i] + p[i+1]
++        vext.16    q1,  q13, q14, #1
++        vext.16    q2,  q14, q5,  #1
++        vext.16    q3,  q5,  q5,  #1
++
++        vmov       d30, d24             @ d30[0] = l[0] + ul
++        vadd.u16   q0,  q12             @ p[i-1] + 2*p[i] + p[i+1]
++        vadd.u16   q1,  q13
++        vadd.u16   q2,  q14
++        vadd.u16   q3,  q5
++
++        vrshr.u16  q0,  #2              @ rounded divide by 4
++        vrshr.u16  q1,  #2
++        vrshr.u16  q2,  #2
++        vrshr.u16  q3,  #2
++
++        @ Up
++        vext.16    q5,  q10, q11, #7
++        vext.16    q14, q9,  q10, #7
++        vext.16    q13, q8,  q9,  #7
++        vext.16    q12, q15, q8,  #7
++
++        vadd.u16   q5,  q11
++        vadd.u16   q14, q10
++        vadd.u16   q13, q9
++        vadd.u16   q12, q8
++        vmov.u16   r3,  d23[3]          @ Save final pel
++
++        vext.16    q8,  q12, q13, #1
++        vext.16    q9,  q13, q14, #1
++        vext.16    q10, q14, q5,  #1
++        vext.16    q11, q5,  q5,  #1
++
++        vadd.u16   d30, d24             @ d30[0] = l[0] + 2ul + u[0]
++        vadd.u16   q8,  q12
++        vadd.u16   q9,  q13
++        vadd.u16   q10, q14
++        vadd.u16   q11, q5
++
++        vrshr.u16  q8,  #2
++        vrshr.u16  q9,  #2
++        vrshr.u16  q10, #2
++        vrshr.u16  q11, #2
++
++        @ Misc
++        vrshr.u16  d30, #2
++        vmov.u16   d7[3], r2            @ Restore final pel
++        vmov.u16   d23[3], r3           @ Restore final pel
++        vdup.u16   d31, d30[0]          @ d31[3] = d30[0]
++        vpop      {q5}
++
++10:
++        vstm       r1, {d16-d23}        @ Up
++        vst1.16   {d31[3]}, [r12]       @ Up-left
++        vstm       r0, { d0-d7 }        @ Left
++        pop       {r4-r10, pc}
++endfunc
++
++@ int ff_hevc_rpi_intra_filter_4_neon_32(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++@ 32-bit pels: writes 8 pels to top, 8 pels to left and the up-left pel to left[-1]; no filtering, sizes not consulted
++.set    sp_base, 8*4                    @ bytes pushed by push {r4-r10, lr}
++.set    pw_s,    2                      @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)            @ pel width in bytes
++.set    log2_s,  2                      @ log2 of block size in pels
++
++function ff_hevc_rpi_intra_filter_4_neon_32, export=1
++        push       {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"  @ macro not visible here; presumably sets r3-r10 and the C/N flags tested below - confirm
++
++        it cs
++        vldmcs     r6,  {d4, d5}        @ C set: load 4 Up pels
++        it mi
++        vldmmi     r5,  {d6, d7}        @ N set: load 4 Up-right pels
++        lsls       r7,  #AVAIL_S_L_N_DL_C       @ new flags: N gates the Left load, C the Down-left load
++        vdup.32    q1,  d0[0]           @ Down-left starts as d0[0] replicated
++        add        r12, r0,  #-pw       @ r12 -> left[-1]
++        bpl        1f
++        vld1.32   {d0[0]}, [r10], r9    @ 4 Left pels, one lane per row, alternating r10 / r3
++        vld1.32   {d0[1]}, [r3],  r9
++        vld1.32   {d1[0]}, [r10]
++        vld1.32   {d1[1]}, [r3]
++1:
++        bcc        1f
++        vld1.32   {d2[1]}, [r4],  r9    @ lane 0 of d2 keeps the replicated value from above
++        vld1.32   {d3[0]}, [r8]
++        vld1.32   {d3[1]}, [r4]
++1:
++        vst1.32    {q2,  q3 }, [r1]     @ Up
++        vst1.32    {d31[1]}, [r12]      @ Up-left
++        vst1.32    {q0,  q1 }, [r0]     @ Left
++        pop        {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_8_neon_32(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++@ 32-bit pels: writes 16 pels to top, 16 pels to left and the up-left pel to left[-1]; no filtering
++.set    sp_base, 8*4                    @ bytes pushed by push {r4-r10, lr}
++.set    ur_size, sp_base + 16           @ stack offset of top_right_size after the push
++.set    dl_size, sp_base + 20           @ stack offset of down_left_size after the push
++.set    pw_s,    2                      @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)            @ pel width in bytes
++.set    log2_s,  3                      @ log2 of block size in pels
++.set    p_size,  (1 << log2_s)          @ size in pels
++
++function ff_hevc_rpi_intra_filter_8_neon_32, export=1
++        push       {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"  @ macro not visible here; presumably sets r3-r10 and the C/N flags tested below - confirm
++
++        vdup.32    q9,  d16[0]          @ second half of Up starts as a replicated pel
++        vdup.32    q11, d20[0]          @ second half of Up-right likewise
++
++        it cs
++        vldmcs     r6,  {q8,  q9 }      @ C set: load 8 Up pels
++        ldr        r12, [sp, #ur_size]
++        bpl        1f                   @ N clear: no Up-right load
++        cmp        r12, #p_size
++        vldm       r5,  {q10, q11}
++        bge        1f                   @ all 8 Up-right pels valid
++        vdup.32    q11, d21[1]          @ otherwise replicate the last pel of q10 into q11
++1:
++        lsls       r7,  #AVAIL_S_L_N_DL_C       @ new flags: N gates the Left load, C the Down-left load
++        vdup.32    q1,  d0[0]           @ replicate d0[0] through q1-q3 before any lanes are loaded
++        vdup.32    q2,  d0[0]
++        vdup.32    q3,  d0[0]
++        bpl        1f
++        vld1.32   {d0[0]}, [r10], r9    @ 8 Left pels, one lane per row, alternating r10 / r3
++        vld1.32   {d0[1]}, [r3],  r9
++        vld1.32   {d1[0]}, [r10], r9
++        vld1.32   {d1[1]}, [r3],  r9
++        vld1.32   {d2[0]}, [r10], r9
++        vld1.32   {d2[1]}, [r3],  r9
++        vld1.32   {d3[0]}, [r10]
++        vld1.32   {d3[1]}, [r3]
++1:
++        bcc        1f
++        ldr        r12, [sp, #dl_size]
++        vld1.32   {d4[1]}, [r4],  r9    @ lane 0 of d4 keeps the replicated value from above
++        cmp        r12, #p_size
++        vld1.32   {d5[0]}, [r8],  r9
++        vld1.32   {d5[1]}, [r4],  r9
++        blt        2f                   @ fewer than 8 Down-left pels valid: replicate the upper half
++        vld1.32   {d6[0]}, [r8],  r9
++        vld1.32   {d6[1]}, [r4],  r9
++        vld1.32   {d7[0]}, [r8]
++        vld1.32   {d7[1]}, [r4]
++        b          1f
++2:
++        vdup.32    q3,  d5[1]
++1:
++        add        r12, r0,  #-pw       @ r12 -> left[-1]
++        vstm       r1,  { q8-q11}       @ Up
++        vst1.32   {d31[1]}, [r12]       @ Up-left
++        vstm       r0,  { q0-q3 }       @ Left
++        pop       {r4-r10, pc}
++endfunc
++
++
++@ int ff_hevc_rpi_intra_filter_16_neon_32(
++@    pixel * const left,                   [r0]
++@    pixel * const top,                    [r1]
++@    const unsigned int req,               [r2]
++@    const unsigned int avail,             [r3]
++@    const pixel * const src_l,            [sp, #0]
++@    const pixel * const src_u,            [sp, #4]
++@    const pixel * const src_ur,           [sp, #8]
++@    const unsigned int stride,            [sp, #12] (pels)
++@    const unsigned int top_right_size,    [sp, #16]
++@    const unsigned int down_left_size)    [sp, #20]
++@ 32-bit pels: writes up to 32 pels to top, 32 pels to left and the up-left pel to left[-1]; no filtering
++.set    sp_base, 8*4                    @ bytes pushed by push {r4-r10, lr}
++.set    ur_size, sp_base + 16           @ stack offset of top_right_size after the push
++.set    dl_size, sp_base + 20           @ stack offset of down_left_size after the push
++.set    pw_s,    2                      @ log2 of pel width in bytes
++.set    pw,      (1 << pw_s)            @ pel width in bytes
++.set    log2_s,  4                      @ log2 of block size in pels
++.set    p_size,  (1 << log2_s)          @ size in pels
++
++function ff_hevc_rpi_intra_filter_16_neon_32, export=1
++        push       {r4-r10, lr}
++        load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]  @ macro not visible here; presumably sets r3-r10, r7 and the C/N flags tested below - confirm
++
++        @ Once we get this big we have run out of neon regs to store
++        @ everything at once so do in pieces
++
++        @ Up (have)
++        it cs
++        vldmcs     r6,  { q0-q3 }
++        ldr        r12, [sp, #ur_size]
++        it cs
++        vstmcs     r1,  { q0-q3 }
++        bpl        1f                   @ N clear: no Up-right to load
++        vldm       r5,  { q8-q11}       @ load all 16 Up-right pels, then patch the invalid tail
++        cmp        r12, #12
++        add        lr,  r1,  #(pw << log2_s)    @ lr -> top[16]
++        bgt        2f                   @ more than 12 valid: store as loaded
++        beq        3f                   @ exactly 12 valid: replicate into q11
++        cmp        r12, #8
++        bge        4f                   @ 8 valid: replicate into q10, q11
++        vdup.32    q9,  d17[1]          @ otherwise only q8 valid; pels are 32 bits so dup .32 lanes
++4:      vdup.32    q10, d19[1]          @ must fill q10 (d10 belongs to callee-saved q5)
++3:      vdup.32    q11, d21[1]
++2:      vstm       lr, { q8-q11}
++1:
++
++        @ Left (have)
++        add        lr,  r0,  #-pw       @ lr -> left[-1]
++        lsls       r12, r7,  #AVAIL_S_L_N_DL_C   @ new flags: N gates the Left load, C the Down-left load; r7 kept
++        vst1.32   {d30[1]}, [lr]        @ UL
++        bpl        1f
++        vld1.32   { d0[0]}, [r10], r9   @ 16 Left pels, one lane per row, alternating r10 / r3
++        vld1.32   { d0[1]}, [r3],  r9
++        vld1.32   { d1[0]}, [r10], r9
++        vld1.32   { d1[1]}, [r3],  r9
++        vld1.32   { d2[0]}, [r10], r9
++        vld1.32   { d2[1]}, [r3],  r9
++        vld1.32   { d3[0]}, [r10], r9
++        vld1.32   { d3[1]}, [r3],  r9
++        vld1.32   { d4[0]}, [r10], r9
++        vld1.32   { d4[1]}, [r3],  r9
++        vld1.32   { d5[0]}, [r10], r9
++        vld1.32   { d5[1]}, [r3],  r9
++        vld1.32   { d6[0]}, [r10], r9
++        vld1.32   { d6[1]}, [r3],  r9
++        vld1.32   { d7[0]}, [r10]
++        vld1.32   { d7[1]}, [r3]
++        vstm       r0,  { q0-q3 }
++1:
++        bcc        1f
++        ldr        r12, [sp, #dl_size]
++        vdup.32    d16, d30[0]          @ d16[0] = d30[0]
++        add        lr,  r0,  #(pw << log2_s)    @ lr -> left[16]
++        vld1.32   {d16[1]}, [r4],  r9
++        cmp        r12, #4
++        vld1.32   {d17[0]}, [r8],  r9
++        vld1.32   {d17[1]}, [r4],  r9
++        ble        2f                   @ down_left_size <= 4: replicate the rest
++        vld1.32   {d18[0]}, [r8],  r9
++        vld1.32   {d18[1]}, [r4],  r9
++        cmp        r12, #12
++        vld1.32   {d19[0]}, [r8],  r9
++        vld1.32   {d19[1]}, [r4],  r9
++        blt        3f                   @ fewer than 12 valid: replicate q10, q11
++        vld1.32   {d20[0]}, [r8],  r9
++        vld1.32   {d20[1]}, [r4],  r9
++        vld1.32   {d21[0]}, [r8],  r9
++        vld1.32   {d21[1]}, [r4],  r9
++        ble        4f                   @ exactly 12 valid (flags still from cmp #12): replicate q11
++        vld1.32   {d22[0]}, [r8],  r9
++        vld1.32   {d22[1]}, [r4],  r9
++        vld1.32   {d23[0]}, [r8]
++        vld1.32   {d23[1]}, [r4]
++        b          5f
++2:      vdup.32    q9,  d17[1]
++3:      vdup.32    q10, d19[1]
++4:      vdup.32    q11, d21[1]
++5:      vstm       lr,  { q8-q11}
++1:
++        eors       r7,  r2              @ r7 ^= req; presumably leaves the edges requested but not loaded - confirm
++        beq        99f                  @ nothing differs: done
++
++        lsls       r12, r7,  #AVAIL_S_UR_N_U_C   @ flags from the differing bits: C gates Up, N gates Up-right
++        vdup.32    q0,  d31[0]
++        vdup.32    q1,  d31[0]
++        vdup.32    q2,  d31[0]
++        vdup.32    q3,  d31[0]
++        add        lr,  r1,  #(pw << log2_s)    @ lr -> top[16]
++        vdup.32    q8,  d31[1]
++        vdup.32    q9,  d31[1]
++        vdup.32    q10, d31[1]
++        vdup.32    q11, d31[1]
++        it cs
++        vstmcs     r1,  { q0-q3 }       @ fill Up with d31[0] replicated
++        it mi
++        vstmmi     lr,  { q8-q11}       @ fill Up-right with d31[1] replicated
++
++        lsls       r7,  #AVAIL_S_L_N_DL_C        @ same again: N gates Left, C gates Down-left
++        vdup.32    q0,  d30[0]
++        vdup.32    q1,  d30[0]
++        vdup.32    q2,  d30[0]
++        vdup.32    q3,  d30[0]
++        add        lr,  r0,  #(pw << log2_s)    @ lr -> left[16]
++        it mi
++        vstmmi     r0, { q0-q3 }        @ fill Left with d30[0] replicated
++        it cs
++        vstmcs     lr, { q0-q3 }        @ fill Down-left with d30[0] replicated
++
++99:
++        pop       {r4-r10, pc}
++endfunc
++
++
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+new file mode 100644
+index 0000000000..56819ae439
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
+@@ -0,0 +1,920 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++/*
++ * Horizontal & Vertical special cases of angular intra pred
++ *
++ * Split out because:
++ *  Vertical, at least, is relatively common
++ *  Much simpler code than the general angular case
++ *  Luma with size < 32 has extra filtering that doesn't happen anywhere else
++ *
++ * *** Currently luma filtering is mandatory where it occurs, but there are
++ *     cases where it should be turned off (rdpcm & an extension sps flag).
++ *     These don't occur in the standard conformance suite for Main Profile
++ */
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ ff_hevc_rpi_pred_vertical_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4x4, 8-bit: each row copies top[]; column 0 = clip(top[0] + ((left[y] - top_left) >> 1))
++function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
++        ldrb        ip, [r2, #-1]       @ Top-left
++        vld1.32     {d0[0]}, [r2 :32]   @ Left
++        add         r2, r0, r3          @ r2 -> row 1
++        vld1.8      {d1[]}, [r1]        @ top[0] replicated
++        lsl         r3, #1              @ each pointer steps two rows
++        vdup.8      d4, ip
++        vmov.i8     d2, #128            @ bias converting u8 <-> s8
++        vhsub.u8    d4, d0, d4          @ (left[y] - top_left) >> 1
++        veor        d1, d2              @ top[0] as biased s8
++        vld1.32     {d0[0]}, [r1 :32]   @ Top
++        vqadd.s8    d1, d4              @ saturating add in the biased domain == clip to 0..255
++        vmov.i64    d3, #0xff           @ mask selecting byte 0 (column 0)
++        vmov        d4, d0              @ second copy of Top for the odd rows
++        veor        d5, d1, d2          @ un-bias; d5 feeds odd rows, d1 even rows
++        veor        d1, d1, d2
++        vbit        d0, d1, d3          @ row 0: insert filtered pel 0
++        vshr.u64    d5, #8              @ filtered pel 1 to byte 0
++        vst1.32     {d0[0]}, [r0], r3
++        vshr.u64    d1, #16             @ filtered pel 2 to byte 0
++        vbit        d4, d5, d3          @ row 1
++        vshr.u64    d5, #16             @ filtered pel 3 to byte 0
++        vst1.32     {d4[0]}, [r2], r3
++        vbit        d0, d1, d3          @ row 2
++        vst1.32     {d0[0]}, [r0]
++        vbit        d4, d5, d3          @ row 3
++        vst1.32     {d4[0]}, [r2]
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8x8, 8-bit: each row copies top[]; column 0 = clip(top[0] + ((left[y] - top_left) >> 1))
++function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
++        ldrb        ip, [r2, #-1]       @ Top-left
++        vld1.8      {d0}, [r2 :64]      @ Left
++        vmov.i8     d1, #128            @ bias converting u8 <-> s8
++        vld1.8      {d2[]}, [r1]        @ top[0] replicated
++        vld1.8      {d3}, [r1 :64]      @ Top
++        vdup.8      d4, ip
++        vhsub.u8    d4, d0, d4          @ (left[y] - top_left) >> 1
++        veor        d2, d1              @ top[0] as biased s8
++        vmov.i64    d0, #0xff           @ mask selecting byte 0 (column 0)
++        mov         r1, #8              @ row counter
++        vqadd.s8    d2, d4, d2          @ saturating add in the biased domain == clip to 0..255
++        veor        d1, d2, d1          @ un-bias: d1 = filtered column, one byte per row
++1:
++        vbit        d3, d1, d0          @ insert this row's filtered pel into column 0
++        vshr.u64    d1, #8              @ next row's pel to byte 0
++        vst1.8      {d3}, [r0 :64], r3
++        subs        r1, #2              @ two rows per iteration
++        vbit        d3, d1, d0
++        vshr.u64    d1, #8
++        vst1.8      {d3}, [r0 :64], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16x16, 8-bit: each row copies top[]; column 0 = clip(top[0] + ((left[y] - top_left) >> 1))
++function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
++        ldrb        ip, [r2, #-1]       @ Top-left
++        vld1.8      {q0}, [r2 :128]     @ Left
++        vdup.8      q1, ip
++        vld1.8      {d4[],d5[]}, [r1]   @ top[0] replicated
++        vhsub.u8    q0, q1              @ (left[y] - top_left) >> 1
++        vmov.i8     q1, #128            @ bias converting u8 <-> s8
++        veor        q2, q1              @ top[0] as biased s8
++        vmov.i64    d16, #0xff          @ mask selecting byte 0 (column 0)
++        vqadd.s8    q0, q2              @ saturating add in the biased domain == clip to 0..255
++        vld1.8      {q3}, [r1 :128]     @ Top
++        mov         r1, #16             @ row counter
++        veor        q0, q1              @ un-bias: q0 = filtered column, one byte per row
++        vmov        q1, q3              @ q1 carries even rows, q3 odd rows
++        vext.8      q2, q0, q0, #1      @ q2 = filtered column rotated by one (odd rows first)
++1:
++        vbit        d2, d0, d16         @ even row: filtered pel into column 0
++        vbit        d6, d4, d16         @ odd row likewise
++        vext.8      q0, q0, q0, #2      @ rotate to the next even row's pel
++        subs        r1, #2              @ two rows per iteration
++        vst1.8      {q1}, [r0 :128], r3
++        vext.8      q2, q2, q2, #2      @ rotate to the next odd row's pel
++        vst1.8      {q3}, [r0 :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 32x32, 8-bit: every row is a plain copy of top[]; _left is not read
++function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
++        vld1.8     {q0,  q1 }, [r1  :128]    @ Up
++        add         r2,  r0,  r3             @ r2 -> row 1
++        lsl         r3,  #1                  @ each pointer steps two rows
++        mov         r1,  #16                 @ 16 iterations x 2 rows
++1:
++        vst1.8     {q0,  q1 }, [r0  :128], r3
++        subs        r1,  #1
++        vst1.8     {q0,  q1 }, [r2  :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4 rows of 4 16-bit pels: every row is a plain copy of top[]; _left is not read; a row is stride*2 bytes
++function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
++        vld1.16    {d0 }, [r1  :64]    @ Up
++        add         r2,  r0,  r3,  lsl #1      @ r2 -> row 1
++        lsl         r3,  #2                    @ each pointer steps two rows
++
++        vst1.16    {d0 }, [r0  :64], r3
++        vst1.16    {d0 }, [r2  :64], r3
++        vst1.16    {d0 }, [r0  :64]
++        vst1.16    {d0 }, [r2  :64]
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8 rows of 8 16-bit pels: every row is a plain copy of top[]; _left is not read; a row is stride*2 bytes
++function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
++        vld1.16    {q0 }, [r1  :128]    @ Up
++        add         r2,  r0,  r3,  lsl #1       @ r2 -> row 1
++        lsl         r3,  #2                     @ each pointer steps two rows
++        mov         r1,  #4                     @ counts down by 2: two iterations of four rows
++1:
++        vst1.16    {q0 }, [r0  :128], r3
++        subs        r1,  #2
++        vst1.16    {q0 }, [r2  :128], r3
++        vst1.16    {q0 }, [r0  :128], r3
++        vst1.16    {q0 }, [r2  :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16 rows of 16 16-bit pels: every row is a plain copy of top[]; _left is not read; a row is stride*2 bytes
++function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
++        vld1.16    {q0,  q1 }, [r1  :128]    @ Up
++        add         r2,  r0,  r3,  lsl #1    @ r2 -> row 1
++        lsl         r3,  #2                  @ each pointer steps two rows
++        mov         r1,  #8                  @ 8 iterations x 2 rows
++1:
++        vst1.16    {q0,  q1 }, [r0  :128], r3
++        subs        r1,  #1
++        vst1.16    {q0,  q1 }, [r2  :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4x4, 8-bit: row y = left[y] replicated; row 0 = clip(left[0] + ((top[x] - top_left) >> 1))
++@ ? Might be faster as simple arm
++
++function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
++        ldrb        ip, [r2, #-1]       @ Top-left
++        vld1.32     {d0[0]}, [r1 :32]   @ Top
++        add         r1, r2, #3          @ r1 -> left[3]
++        vld1.8      {d1[]}, [r2]!       @ left[0] replicated
++        vdup.8      d2, ip
++        vmov.i8     d3, #128            @ bias converting u8 <-> s8
++        vhsub.u8    d0, d2              @ (top[x] - top_left) >> 1
++        veor        d1, d3              @ left[0] as biased s8
++        vld1.8      {d2[]}, [r2]!       @ left[1] replicated
++        add         ip, r0, r3          @ ip -> row 1
++        vqadd.s8    d0, d0, d1          @ saturating add in the biased domain == clip to 0..255
++        lsl         r3, #1              @ each pointer steps two rows
++        vld1.8      {d1[]}, [r2]        @ left[2] replicated
++        vld1.8      {d4[]}, [r1]        @ left[3] replicated
++        veor        d0, d3              @ un-bias row 0
++        vst1.32     {d0[0]}, [r0 :32], r3
++        vst1.32     {d2[0]}, [ip :32], r3
++        vst1.32     {d1[0]}, [r0 :32]
++        vst1.32     {d4[0]}, [ip :32]
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8x8, 8-bit: row y = left[y] replicated; row 0 = clip(left[0] + ((top[x] - top_left) >> 1))
++function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
++        ldrb        ip, [r2, #-1]       @ Top-left
++        vld1.8      {d0}, [r1 :64]      @ Top
++        vmov.i8     d1, #128            @ bias converting u8 <-> s8
++        vld1.8      {d2[]}, [r2]!       @ left[0] replicated
++        mov         r1, #8-2            @ rows left for the loop (row 0 and the last row are outside it)
++        vdup.8      d3, ip
++        vhsub.u8    d0, d3              @ (top[x] - top_left) >> 1
++        veor        d2, d1              @ left[0] as biased s8
++        vqadd.s8    d0, d2              @ saturating add in the biased domain == clip to 0..255
++          vld1.8      {d2[]}, [r2]!
++        veor        d0, d1              @ un-bias row 0
++        vst1.8      {d0}, [r0], r3
++1:
++            vld1.8      {d0[]}, [r2]!
++        subs        r1, #2              @ two rows per iteration; indent depth marks the pipelined row
++          vst1.8      {d2}, [r0 :64], r3
++              vld1.8      {d2[]}, [r2]!
++            vst1.8      {d0}, [r0 :64], r3
++        bne         1b
++
++              vst1.8      {d2}, [r0 :64]
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16x16, 8-bit: row y = left[y] replicated; row 0 = clip(left[0] + ((top[x] - top_left) >> 1))
++function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
++        ldrb        ip, [r2, #-1]       @ Top-left
++        vld1.8      {q0}, [r1 :64]      @ Top
++        mov         r1, #16-2           @ rows left for the loop (row 0 and the last row are outside it)
++        vld1.8      {d4[],d5[]}, [r2]!  @ left[0] replicated
++        vdup.8      q3, ip
++        vhsub.u8    q0, q3              @ (top[x] - top_left) >> 1
++        vmov.i8     q1, #128            @ bias converting u8 <-> s8
++        veor        q2, q1              @ left[0] as biased s8
++        vqadd.s8    q0, q2              @ saturating add in the biased domain == clip to 0..255
++          vld1.8      {d4[],d5[]}, [r2]!
++        veor        q0, q1              @ un-bias row 0
++        vst1.8      {q0}, [r0], r3
++1:
++            vld1.8      {d0[],d1[]}, [r2]!
++        subs        r1, #2              @ two rows per iteration; indent depth marks the pipelined row
++          vst1.8      {q2}, [r0 :64], r3
++              vld1.8      {d4[],d5[]}, [r2]!
++            vst1.8      {q0}, [r0 :64], r3
++        bne         1b
++
++              vst1.8      {q2}, [r0 :64]
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 32x32, 8-bit: row y = left[y] replicated, no edge filter; _top is not read
++function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
++        vld1.8      {d0[],d1[]}, [r2]!
++        add         ip, r0, #16         @ ip -> right half of each row
++        mov         r1, #32-2           @ rows left for the loop (row 0 and the last row are outside it)
++          vld1.8      {d2[],d3[]}, [r2]!
++        vst1.8      {q0}, [r0 :128], r3
++        vst1.8      {q0}, [ip :128], r3
++1:
++            vld1.8      {d0[],d1[]}, [r2]!
++        subs        r1, #2              @ two rows per iteration; indent depth marks the pipelined row
++          vst1.8      {q1}, [r0 :128], r3
++          vst1.8      {q1}, [ip :128], r3
++              vld1.8      {d2[],d3[]}, [r2]!
++            vst1.8      {q0}, [r0 :128], r3
++            vst1.8      {q0}, [ip :128], r3
++        bne         1b
++
++              vst1.8      {q1}, [r0 :128]
++              vst1.8      {q1}, [ip :128]
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4 rows of 4 16-bit pels: row y = left[y] replicated; _top is not read; a row is stride*2 bytes
++function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
++        add         r1, r2, #2          @ r1 -> left[1]
++        vld1.16     {d0[]}, [r2]        @ left[0]
++        add         r2, #4
++        vld1.16     {d1[]}, [r1]        @ left[1]
++        add         r1, #4
++        vld1.16     {d2[]}, [r2]        @ left[2]
++A       add         r2, r0, r3, lsl #1  @ A lines: one build variant; r2 -> row 1
++T       lsl         r3, #1              @ T lines: the other variant, same result in two steps
++T       add         r2, r0, r3
++        vld1.16     {d3[]}, [r1]        @ left[3]
++A       lsl         r3, #2              @ either way r3 ends as stride*4: two rows in bytes
++T       lsl         r3, #1
++        vst1.16     {d0}, [r0 :64], r3
++        vst1.16     {d1}, [r2 :64], r3
++        vst1.16     {d2}, [r0 :64]
++        vst1.16     {d3}, [r2 :64]
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8 rows of 8 16-bit pels: row y = left[y] replicated; _top is not read; a row is stride*2 bytes
++function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
++        vld1.16     {d0[],d1[]}, [r2]!
++        lsl         r3, #1              @ row step in bytes
++          vld1.16     {d2[],d3[]}, [r2]!
++        mov         r1, #8-2            @ rows left for the loop (row 0 and the last row are outside it)
++        vst1.16     {q0}, [r0 :64], r3
++1:
++            vld1.16     {d0[],d1[]}, [r2]!
++        subs        r1, #2              @ two rows per iteration; indent depth marks the pipelined row
++          vst1.16     {q1}, [r0 :64], r3
++              vld1.16     {d2[],d3[]}, [r2]!
++            vst1.16     {q0}, [r0 :64], r3
++        bne         1b
++
++              vst1.16     {q1}, [r0 :64]
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16 rows of 16 16-bit pels: row y = left[y] replicated; _top is not read; a row is stride*2 bytes
++function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
++        vld1.16     {d0[],d1[]}, [r2]!
++        lsl         r3, #1              @ row step in bytes
++        add         ip, r0, #16         @ ip -> right half of each row
++        mov         r1, #16-2           @ rows left for the loop (row 0 and the last row are outside it)
++          vld1.16     {d2[],d3[]}, [r2]!
++        vst1.16     {q0}, [r0 :128], r3
++        vst1.16     {q0}, [ip :128], r3
++1:
++            vld1.16     {d0[],d1[]}, [r2]!
++        subs        r1, #2              @ two rows per iteration; indent depth marks the pipelined row
++          vst1.16     {q1}, [r0 :128], r3
++          vst1.16     {q1}, [ip :128], r3
++              vld1.16     {d2[],d3[]}, [r2]!
++            vst1.16     {q0}, [r0 :128], r3
++            vst1.16     {q0}, [ip :128], r3
++        bne         1b
++
++              vst1.16     {q1}, [r0 :128]
++              vst1.16     {q1}, [ip :128]
++        bx          lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ 10 Bit
++@ Has clipping constants so 10-bit only but could easily be macroed up to
++@ 14-bit before we run out of bits
++
++
++@ ff_hevc_rpi_pred_vertical_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4x4, 16-bit pels: rows copy top[]; column 0 = clamp(top[0] + ((left[y] - top_left) >> 1), 0, 0x3ff); a row is stride*2 bytes
++function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
++        ldrh        ip, [r2, #-2]       @ Top-left
++        vld1.16     {d0}, [r2 :64]      @ Left
++        vmov.i16    d2, #0              @ lower clamp bound
++        vld1.16     {d1[]}, [r1]        @ top[0] replicated
++T       lsl         r3, #1
++        vdup.16     d4, ip
++        vmov.i16    d3, #0x3ff          @ upper clamp bound
++        vld1.16     {d5}, [r1 :64]      @ Top
++        vhsub.u16   d4, d0, d4          @ (left[y] - top_left) >> 1
++        vmov.i64    d0, #0xffff         @ mask selecting lane 0 (column 0)
++A       add         r2, r0, r3, lsl #1  @ r2 -> row 1 (A and T lines are the two build variants)
++T       add         r2, r0, r3
++        vadd.i16    d1, d1, d4
++        vmov        d6, d5              @ second copy of Top for the odd rows
++        vmax.s16    d1, d1, d2
++        vmin.s16    d2, d1, d3          @ d2 and d1 both hold the clamped column; d2 feeds odd rows
++        vmin.s16    d1, d1, d3
++        vbit        d5, d1, d0          @ row 0
++A       lsl         r3, #2              @ either way r3 ends as stride*4: two rows in bytes
++T       lsl         r3, #1
++        vshr.u64    d2, #16             @ filtered pel 1 to lane 0
++        vshr.u64    d1, #32             @ filtered pel 2 to lane 0
++        vbit        d6, d2, d0          @ row 1
++        vst1.16     {d5}, [r0], r3
++        vshr.u64    d2, #32             @ filtered pel 3 to lane 0
++        vst1.16     {d6}, [r2], r3
++        vbit        d5, d1, d0          @ row 2
++        vst1.16     {d5}, [r0]
++        vbit        d6, d2, d0          @ row 3
++        vst1.16     {d6}, [r2]
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8x8, 16-bit pels: rows copy top[]; column 0 = clamp(top[0] + ((left[y] - top_left) >> 1), 0, 0x3ff); a row is stride*2 bytes
++function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
++        ldrh        ip, [r2, #-2]       @ Top-left
++        vld1.16     {q0}, [r2 :128]     @ Left
++        lsl         r3, #1              @ row step in bytes
++        vdup.16     q1, ip
++        vld1.16     {d4[],d5[]}, [r1]   @ top[0] replicated
++        vhsub.u16   q0, q0, q1          @ (left[y] - top_left) >> 1
++        vmov.i16    q1, #0              @ lower clamp bound
++        vadd.i16    q0, q2
++        vmov.i16    q2, #0x3ff          @ upper clamp bound
++        vld1.16     {q3}, [r1 :128]     @ Top
++        mov         r1, #8              @ row counter
++        vmax.s16    q0, q1
++        vmov        q1, q3              @ q1 carries even rows, q3 odd rows
++        vmin.s16    q0, q2              @ q0 = filtered column, one lane per row
++        vmov.i64    d16, #0xffff        @ mask selecting lane 0 (column 0)
++        vext.16     q2, q0, q0, #1      @ q2 = filtered column rotated by one (odd rows first)
++1:
++        vbit        d2, d0, d16         @ even row: filtered pel into column 0
++        vbit        d6, d4, d16         @ odd row likewise
++        vext.16     q0, q0, q0, #2      @ rotate to the next even row's pel
++        subs        r1, #2              @ two rows per iteration
++        vst1.16     {q1}, [r0 :128], r3
++        vext.16     q2, q2, q2, #2      @ rotate to the next odd row's pel
++        vst1.16     {q3}, [r0 :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16x16, 16-bit pels: rows copy top[]; column 0 = clamp(top[0] + ((left[y] - top_left) >> 1), 0, 0x3ff); a row is stride*2 bytes
++function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
++        ldrh        ip, [r2, #-2]       @ Top-left
++        vld1.16     {q0-q1}, [r2 :128]  @ Left
++T       lsl         r3, #1
++        vdup.16     q2, ip
++A       add         r2, r0, r3, lsl #1  @ r2 -> row 1 (A and T lines are the two build variants)
++T       add         r2, r0, r3
++        vld1.16     {d6[],d7[]}, [r1]   @ top[0] replicated
++A       lsl         r3, #2              @ either way r3 ends as stride*4: two rows in bytes
++T       lsl         r3, #1
++        vhsub.u16   q0, q2              @ (left[y] - top_left) >> 1
++        vhsub.u16   q1, q2
++        vadd.i16    q0, q3
++        vadd.i16    q1, q3
++        vmov.i16    q2, #0              @ lower clamp bound
++        vld1.16     {q8-q9}, [r1 :128]  @ Top
++        mov         r1, #0              @ counter: subtracting 1<<30 wraps back to 0 after 4 iterations
++        vmov.i16    q3, #0x3ff          @ upper clamp bound
++        vmax.s16    q0, q2
++        vmax.s16    q1, q2
++        vmin.s16    q0, q3              @ q0-q1 = filtered column, one lane per row
++        vmin.s16    q1, q3
++        vmov        q10, q8             @ q8-q9 carry even rows, q10-q11 odd rows
++        vmov        q11, q9
++        vext.16     q2, q0, q1, #1      @ q2 = filtered pels 1..8 (odd rows of the first half first)
++        vext.16     q3, q1, q1, #1      @ q3 = filtered pels 9.. rotated (odd rows of the second half)
++        vmov.i64    d24, #0xffff        @ mask selecting lane 0 (column 0)
++1:
++        vbit        d16, d0, d24        @ rows 0-7: even row gets its filtered pel
++        vbit        d20, d4, d24        @ odd row likewise
++        vext.16     q0, q0, q0, #2
++        subs        r1, #1<<30
++        vst1.16     {q8-q9}, [r0 :128], r3
++        vext.16     q2, q2, q2, #2
++        vst1.16     {q10-q11}, [r2 :128], r3
++        bne         1b
++1:
++        vbit        d16, d2, d24        @ rows 8-15, same scheme using q1 / q3; r1 is 0 again on entry
++        vbit        d20, d6, d24
++        vext.16     q1, q1, q1, #2
++        subs        r1, #1<<30
++        vst1.16     {q8-q9}, [r0 :128], r3
++        vext.16     q3, q3, q3, #2
++        vst1.16     {q10-q11}, [r2 :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_32_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 32x32 vertical: each of 32 rows is a plain copy of 64 bytes of top (left is not read)
++function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
++        vldm        r1, { q0-q3 }    @ Up
++        lsl         r3, #1           @ r3 *= 2 (presumably samples -> bytes)
++        mov         r1, #32          @ Row counter
++        add         r2, r0, #32      @ r2 = second 32 bytes of each row
++1:
++        vst1.16     {q0-q1}, [r0 :128], r3
++        subs        r1, #1
++        vst1.16     {q2-q3}, [r2 :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4-wide "c" (presumably interleaved chroma) vertical: 4 rows, each a 16-byte copy of top
++function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
++        vld1.16    {q0 }, [r1  :128]    @ Up
++        add         r2,  r0,  r3,  lsl #2    @ r2 = row 1 (row pitch is r3 * 4 bytes)
++        lsl         r3,  #3                  @ r3 = two rows in bytes
++
++        vst1.16    {q0 }, [r0  :128], r3     @ Row 0
++        vst1.16    {q0 }, [r2  :128], r3     @ Row 1
++        vst1.16    {q0 }, [r0  :128]         @ Row 2
++        vst1.16    {q0 }, [r2  :128]         @ Row 3
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8-wide "c" (presumably interleaved chroma) vertical: 8 rows, each a 32-byte copy of top
++function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
++        vld1.16    {q0,  q1 }, [r1  :128]    @ Up
++        add         r2,  r0,  r3,  lsl #2    @ r2 = row 1 (row pitch is r3 * 4 bytes)
++        lsl         r3,  #3                  @ r3 = two rows in bytes
++        mov         r1,  #4                  @ Four passes of two rows
++1:
++        vst1.16    {q0,  q1 }, [r0  :128], r3    @ Even row
++        subs        r1,  #1
++        vst1.16    {q0,  q1 }, [r2  :128], r3    @ Odd row
++        bne         1b
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_vertical_c_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16-wide "c" (presumably interleaved chroma) vertical: 16 rows, each a 64-byte copy of top
++function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
++        vldm        r1, { q0-q3 }    @ Up
++        lsl         r3, #2           @ r3 *= 4: row pitch in bytes
++        mov         r1, #16          @ Row counter
++        add         r2, r0, #32      @ r2 = second 32 bytes of each row
++1:
++        vst1.16     {q0-q1}, [r0 :128], r3
++        subs        r1, #1
++        vst1.16     {q2-q3}, [r2 :128], r3
++        bne         1b
++
++        bx          lr
++endfunc
++
++@ ff_hevc_rpi_pred_horizontal_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4x4 horizontal: row y = left[y]; row 0 = clip(left[0] + ((top[x] - top_left) >> 1), 0, 0x3ff)
++function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
++        ldrh        ip, [r2, #-2]       @ Top-left
++        vld1.16     {d0}, [r1 :64]      @ Top
++        vmov.i16    d1, #0
++        vld1.16     {d2[]}, [r2]!       @ left[0] in every lane
++T       lsl         r3, #1
++        vdup.16     d3, ip
++        vmov.i16    d4, #0x3ff
++        vhsub.u16   d0, d3              @ (top[x] - top_left) >> 1
++A       add         ip, r0, r3, lsl #1
++T       add         ip, r0, r3
++        vld1.16     {d3[]}, [r2]!       @ left[1]
++A       lsl         r3, #2
++T       lsl         r3, #1
++        vadd.i16    d0, d2              @ + left[0]
++        vld1.16     {d2[]}, [r2]!       @ left[2]
++        vmax.s16    d0, d1              @ Clamp to >= 0
++        vld1.16     {d1[]}, [r2]        @ left[3] (the zero in d1 is no longer needed)
++        vmin.s16    d0, d4              @ Clamp to <= 0x3ff
++        vst1.16     {d0}, [r0 :64], r3  @ Row 0 (filtered)
++        vst1.16     {d3}, [ip :64], r3  @ Row 1
++        vst1.16     {d2}, [r0 :64]      @ Row 2
++        vst1.16     {d1}, [ip :64]      @ Row 3
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8x8 horizontal: row y = left[y]; row 0 = clip(left[0] + ((top[x] - top_left) >> 1), 0, 0x3ff)
++function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
++        ldrh        ip, [r2, #-2]       @ Top-left
++        vld1.16     {q0}, [r1 :128]     @ Top
++        lsl         r3, #1              @ r3 *= 2 (presumably samples -> bytes)
++        vdup.16     q1, ip
++        mov         r1, #8-2            @ Loop stores rows 1-6, two per pass
++        vhsub.u16   q0, q1              @ (top[x] - top_left) >> 1
++        vld1.16     {d2[],d3[]}, [r2]!  @ left[0] in every lane
++        vmov.i16    q2, #0
++        vadd.i16    q0, q1              @ + left[0]
++        vmov.i16    q1, #0x3ff
++        vmax.s16    q0, q2              @ Clamp to >= 0
++          vld1.16     {d4[],d5[]}, [r2]!  @ left[1]
++        vmin.s16    q0, q1              @ Clamp to <= 0x3ff
++        vst1.16     {q0}, [r0 :128], r3 @ Row 0 (filtered)
++1:
++            vld1.16     {d0[],d1[]}, [r2]!  @ Next even-row left sample
++        subs        r1, #2
++          vst1.16     {q2}, [r0 :128], r3   @ Odd row loaded on the previous pass
++              vld1.16     {d4[],d5[]}, [r2]!  @ Next odd-row left sample
++            vst1.16     {q0}, [r0 :128], r3   @ Even row
++        bne         1b
++
++              vst1.16     {q2}, [r0 :128]   @ Row 7
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16x16 horizontal: row y = left[y]; row 0 = clip(left[0] + ((top[x] - top_left) >> 1), 0, 0x3ff)
++function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
++        ldrh        ip, [r2, #-2]       @ Top-left
++        vld1.16     {q0-q1}, [r1 :128]  @ Top
++        lsl         r3, #1              @ r3 *= 2 (presumably samples -> bytes)
++        vdup.16     q2, ip
++        add         ip, r0, r3
++        vhsub.u16   q0, q2              @ (top[x] - top_left) >> 1
++        add         ip, #16             @ ip = bytes 16-31 of row 1
++        vhsub.u16   q1, q2
++        mov         r1, #16-2           @ Loop stores rows 1-14, two per pass
++        vld1.16     {d4[],d5[]}, [r2]!  @ left[0] in every lane
++        vmov.i16    q3, #0
++        vadd.i16    q0, q2              @ + left[0]
++        vadd.i16    q1, q2
++        vmov.i16    q2, #0x3ff
++        vmax.s16    q0, q3              @ Clamp to >= 0
++        vmax.s16    q1, q3
++          vld1.16     {d6[],d7[]}, [r2]!  @ left[1]
++        vmin.s16    q0, q2              @ Clamp to <= 0x3ff
++        vmin.s16    q1, q2
++        vst1.16     {q0-q1}, [r0 :128], r3  @ Row 0 (filtered)
++1:
++            vld1.16     {d0[],d1[]}, [r2]!  @ Next even-row left sample
++        subs        r1, #2
++          vst1.16     {q3}, [r0 :128], r3   @ Odd row, bytes 0-15
++          vst1.16     {q3}, [ip :128], r3   @ Odd row, bytes 16-31
++              vld1.16     {d6[],d7[]}, [r2]!  @ Next odd-row left sample
++            vst1.16     {q0}, [r0 :128], r3   @ Even row, bytes 0-15
++            vst1.16     {q0}, [ip :128], r3   @ Even row, bytes 16-31
++        bne         1b
++
++              vst1.16     {q3}, [r0 :128]   @ Row 15
++              vst1.16     {q3}, [ip :128]
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_32_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 32x32 horizontal: each 64-byte row is left[y] replicated (top is not read, no edge filter)
++function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
++        vld1.16     {d0[],d1[]}, [r2]!  @ left[0] in every lane
++        add         ip, r0, #16         @ ip = r0 + 16 bytes
++        push        {lr}                @ lr is reused as a constant increment
++        mov         lr, #32
++          vld1.16     {d2[],d3[]}, [r2]!  @ left[1]
++        lsl         r3, #1              @ r3 *= 2 (presumably samples -> bytes)
++        vst1.16     {q0}, [r0 :128], lr @ Row 0, bytes 0-15
++        sub         r3, #32             @ r3 = row pitch - 32: step from 2nd store to next row
++        vst1.16     {q0}, [ip :128], lr @ Row 0, bytes 16-31
++        mov         r1, #32-2           @ Loop stores rows 1-30, two per pass
++        vst1.16     {q0}, [r0 :128], r3 @ Row 0, bytes 32-47
++        vst1.16     {q0}, [ip :128], r3 @ Row 0, bytes 48-63
++1:
++            vld1.16     {d0[],d1[]}, [r2]!  @ Next even-row left sample
++        subs        r1, #2
++          vst1.16     {q1}, [r0 :128], lr   @ Odd row, four 16-byte stores
++          vst1.16     {q1}, [ip :128], lr
++          vst1.16     {q1}, [r0 :128], r3
++          vst1.16     {q1}, [ip :128], r3
++              vld1.16     {d2[],d3[]}, [r2]!  @ Next odd-row left sample
++            vst1.16     {q0}, [r0 :128], lr   @ Even row, four 16-byte stores
++            vst1.16     {q0}, [ip :128], lr
++            vst1.16     {q0}, [r0 :128], r3
++            vst1.16     {q0}, [ip :128], r3
++        bne         1b
++
++              vst1.16     {q1}, [r0 :128], lr   @ Row 31
++              vst1.16     {q1}, [ip :128], lr
++              vst1.16     {q1}, [r0 :128]
++              vst1.16     {q1}, [ip :128]
++        pop         {pc}                @ Return via the saved lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4-wide "c" (presumably interleaved chroma) horizontal: row y = 32-bit left[y] repeated 4 times
++function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
++        add         r1, r2, #4          @ r1 = address of left[1]; top is not read
++        vld1.32     {d0[],d1[]}, [r2]   @ left[0]
++        add         r2, #8
++        vld1.32     {d2[],d3[]}, [r1]   @ left[1]
++        add         r1, #8
++        vld1.32     {d4[],d5[]}, [r2]   @ left[2]
++A       add         r2, r0, r3, lsl #2
++T       lsl         r3, #2
++T       add         r2, r0, r3
++        vld1.32     {d6[],d7[]}, [r1]   @ left[3]
++A       lsl         r3, #3
++T       lsl         r3, #1
++        vst1.32     {q0}, [r0 :128], r3 @ Row 0
++        vst1.32     {q1}, [r2 :128], r3 @ Row 1
++        vst1.32     {q2}, [r0 :128]     @ Row 2
++        vst1.32     {q3}, [r2 :128]     @ Row 3
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8-wide "c" (presumably interleaved chroma) horizontal: row y = 32-bit left[y] repeated 8 times
++function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
++        vld1.32     {d0[],d1[]}, [r2]!  @ left[0] in every 32-bit lane
++        lsl         r3, #2              @ r3 *= 4: row pitch in bytes
++        add         ip, r0, #16         @ ip = bytes 16-31 of each row
++        mov         r1, #8-2            @ Loop stores rows 1-6, two per pass
++          vld1.32     {d2[],d3[]}, [r2]!  @ left[1]
++        vst1.32     {q0}, [r0 :128], r3 @ Row 0
++        vst1.32     {q0}, [ip :128], r3
++1:
++            vld1.32     {d0[],d1[]}, [r2]!  @ Next even-row left element
++        subs        r1, #2
++          vst1.32     {q1}, [r0 :128], r3   @ Odd row
++          vst1.32     {q1}, [ip :128], r3
++              vld1.32     {d2[],d3[]}, [r2]!  @ Next odd-row left element
++            vst1.32     {q0}, [r0 :128], r3   @ Even row
++            vst1.32     {q0}, [ip :128], r3
++        bne         1b
++
++              vst1.32     {q1}, [r0 :128]   @ Row 7
++              vst1.32     {q1}, [ip :128]
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16-wide "c" (presumably interleaved chroma) horizontal: row y = 32-bit left[y] repeated 16 times
++function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
++        vld1.32     {d0[],d1[]}, [r2]!  @ left[0] in every 32-bit lane
++        add         ip, r0, #16         @ ip = r0 + 16 bytes
++        push        {lr}                @ lr is reused as a constant increment
++        mov         lr, #32
++          vld1.32     {d2[],d3[]}, [r2]!  @ left[1]
++        lsl         r3, #2              @ r3 *= 4: row pitch in bytes
++        vst1.32     {q0}, [r0 :128], lr @ Row 0, bytes 0-15
++        sub         r3, #32             @ r3 = row pitch - 32: step from 2nd store to next row
++        vst1.32     {q0}, [ip :128], lr @ Row 0, bytes 16-31
++        mov         r1, #16-2           @ Loop stores rows 1-14, two per pass
++        vst1.32     {q0}, [r0 :128], r3 @ Row 0, bytes 32-47
++        vst1.32     {q0}, [ip :128], r3 @ Row 0, bytes 48-63
++1:
++            vld1.32     {d0[],d1[]}, [r2]!  @ Next even-row left element
++        subs        r1, #2
++          vst1.32     {q1}, [r0 :128], lr   @ Odd row, four 16-byte stores
++          vst1.32     {q1}, [ip :128], lr
++          vst1.32     {q1}, [r0 :128], r3
++          vst1.32     {q1}, [ip :128], r3
++              vld1.32     {d2[],d3[]}, [r2]!  @ Next odd-row left element
++            vst1.32     {q0}, [r0 :128], lr   @ Even row, four 16-byte stores
++            vst1.32     {q0}, [ip :128], lr
++            vst1.32     {q0}, [r0 :128], r3
++            vst1.32     {q0}, [ip :128], r3
++        bne         1b
++
++              vst1.32     {q1}, [r0 :128], lr   @ Row 15
++              vst1.32     {q1}, [ip :128], lr
++              vst1.32     {q1}, [r0 :128]
++              vst1.32     {q1}, [ip :128]
++        pop         {pc}                @ Return via the saved lr
++endfunc
++
++
++
+diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+new file mode 100644
+index 0000000000..af8c4c03f0
+--- /dev/null
++++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
+@@ -0,0 +1,1043 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#include "libavutil/arm/asm.S"
++#include "neon.S"
++
++@ Planar intra pred (8.4.4.2.4)
++@
++@ predSamples[ x ][ y ] =
++@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
++@   ( x + 1 ) * p[ nTbS ][ -1 ] +
++@   ( nTbS - 1 - y ) * p[ x ][ -1 ] +
++@   ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
++
++@ All 10-bit functions would work with 9
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4x4 8-bit planar (formula above, nTbS = 4); rows 0,1 are built in q8 and rows 2,3 in q2
++function ff_hevc_rpi_pred_planar_4_neon_8, export=1
++
++        vld1.8      {d0}, [r1]          @ Top
++        adr         ip, nb_3_0_1_4
++        vld1.8      {d1}, [r2]          @ Left
++        vmov.i64    d2, #0xffffffff     @ Mask: low 32 bits set
++        vldr        d3, [ip, #8]        @ {1,2,3,4,1,2,3,4}
++        add         r1, r0, r3          @ r1 = row 1
++        vdup.32     d4, d0[0]           @ {t0,t1,t2,t3,t0,t1,t2,t3}
++        vdup.8      d0, d0[4]           @ {t4,t4,t4,t4,t4,t4,t4,t4}
++        vdup.8      d5, d1[4]           @ {l4,l4,l4,l4,l4,l4,l4,l4}
++        vdup.8      d6, d1[0]           @ {l0,l0,l0,l0,l0,l0,l0,l0}
++        vshll.u8    q8, d4, #2          @ 4 * top
++        lsl         r3, #1              @ r3 = two rows
++        vsubl.u8    q2, d5, d4          @ add = l4 - top (16-bit)
++        vmlal.u8    q8, d0, d3          @ + (x+1) * t4
++        vld1.8      {d0}, [ip]          @ {3,2,1,0,3,2,1,0}
++        vdup.8      d7, d1[1]           @ {l1,l1,l1,l1,l1,l1,l1,l1}
++        vshl.s16    q9, q2, #1          @ 2 * add
++        vbif        d6, d7, d2          @ {l0,l0,l0,l0,l1,l1,l1,l1}
++        vadd.i16    d16, d4             @ Row 0: + add
++        vdup.8      d7, d1[2]           @ {l2,l2,l2,l2,l2,l2,l2,l2}
++        vadd.i16    d17, d18            @ Row 1: + 2 * add
++        vdup.8      d1, d1[3]           @ {l3,l3,l3,l3,l3,l3,l3,l3}
++        vadd.i16    q2, q8, q9          @ Rows 2,3 = rows 0,1 + 2 * add
++        vmlal.u8    q8, d0, d6          @ Rows 0,1: + (3-x) * left[y]
++        vbif        d7, d1, d2          @ {l2,l2,l2,l2,l3,l3,l3,l3}
++        vmlal.u8    q2, d0, d7          @ Rows 2,3: + (3-x) * left[y]
++        vrshrn.i16  d0, q8, #3          @ Round, >> 3, narrow to bytes
++        vst1.32     d0[0], [r0 :32], r3 @ Row 0
++        vst1.32     d0[1], [r1 :32], r3 @ Row 1
++        vrshrn.i16  d0, q2, #3
++        vst1.32     d0[0], [r0 :32]     @ Row 2
++        vst1.32     d0[1], [r1 :32]     @ Row 3
++
++        bx          lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 4x4 16-bit-sample planar (formula above, nTbS = 4); one d register of accumulator per row
++function ff_hevc_rpi_pred_planar_4_neon_10, export=1
++        @ Weights are read from the 16-bit table nbh_3_0_1_4, so no
++        @ byte-to-halfword expansion is needed in this function
++        vld1.16     {q0}, [r1 :64]      @ Top
++        adr         ip, nbh_3_0_1_4
++        vldr        d2, [r2, #8]        @ Left (lower)
++        vldr        d3, [ip, #8]        @ {1,2,3,4}
++T       lsl         r3, #1
++        vshl.s16    d4, d0, #2          @ 4 * top
++        vdup.16     d1, d1[0]           @ {t4,t4,t4,t4}
++        vldr        d5, [r2]            @ Left (upper)
++        vdup.16     d2, d2[0]           @ {l4,l4,l4,l4}
++        vldr        d6, [ip]            @ {3,2,1,0}
++        vmla.i16    d4, d3, d1          @ Acc set up
++        vsub.i16    d0, d2, d0          @ Add set up
++        vmov        d7, d6              @ q3 = {3,2,1,0,3,2,1,0}
++        vdup.16     d2, d5[0]           @ {l0,l0,l0,l0}
++        vdup.16     d3, d5[1]           @ {l1,l1,l1,l1}
++        vdup.16     d16, d5[2]          @ {l2,l2,l2,l2}
++        vadd.i16    d18, d0, d4         @ Row 0 = acc + add
++        vshl.s16    d0, #1              @ x2
++        vadd.i16    d19, d0, d4         @ Row 1 = acc + 2 * add
++        vdup.16     d17, d5[3]          @ {l3,l3,l3,l3}
++        vadd.i16    d4, d0, d18         @ Row 2 = acc + 3 * add
++A       add         r1, r0, r3, lsl #1
++T       add         r1, r0, r3
++        vadd.i16    d5, d0, d19         @ Row 3 = acc + 4 * add
++A       lsl         r3, #2
++T       lsl         r3, #1
++        vmla.i16    q9, q1, q3          @ Rows 0,1: + (3-x) * left[y]
++        vmla.i16    q2, q8, q3          @ Rows 2,3: + (3-x) * left[y]
++        vrshr.u16   q0, q9, #3          @ Round and >> 3
++        vst1.16     {d0}, [r0], r3      @ Row 0
++        vrshr.u16   d2, d4, #3
++        vst1.16     {d1}, [r1], r3      @ Row 1
++        vrshr.u16   d3, d5, #3
++        vst1.16     {d2}, [r0]          @ Row 2
++        vst1.16     {d3}, [r1]          @ Row 3
++
++        bx         lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8x8 8-bit planar (formula above, nTbS = 8); two rows are produced per loop pass
++function ff_hevc_rpi_pred_planar_8_neon_8, export=1
++
++        vld1.8      {q0}, [r1]          @ Top
++        adr         ip, nb_7_0_1_8
++        vldr        d2, [r2, #8]        @ Left (lower)
++        mov         r1, #8              @ Row counter
++        vldr        d3, [ip, #8]        @ {1,2,3,4,5,6,7,8}
++        vshll.u8    q2, d0, #3          @ 8 * top
++        vdup.8      d1, d1[0]           @ {t8,t8,t8,t8,t8,t8,t8,t8}
++        vdup.8      d2, d2[0]           @ {l8,l8,l8,l8,l8,l8,l8,l8}
++        vldr        d6, [r2]            @ Left (upper)
++        vmlal.u8    q2, d3, d1          @ + (x+1) * t8
++        vsubl.u8    q0, d2, d0          @ add = l8 - top (16-bit)
++        vldr        d7, [ip]            @ {7,6,5,4,3,2,1,0}
++
++@ u8   7..0    [1]  d7
++@ u8  left[y]  [1]  d6
++@ u16 acc      [2]  q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add      [2]  q0 = p[-1][nTbs] - p[x][-1]
++
++        vdup.8      d2, d6[0]           @ left[0]
++        vadd.i16    q2, q0              @ Row 0 acc
++        vdup.8      d3, d6[1]           @ left[1]
++        vadd.i16    q8, q2, q0          @ Row 1 acc
++1:
++        vmlal.u8    q2, d7, d2          @ Even row: + (7-x) * left[y]
++        subs        r1, #2
++        vadd.i16    q9, q8, q0          @ Acc for the next even row
++        vmlal.u8    q8, d7, d3          @ Odd row: + (7-x) * left[y+1]
++        vdup.8      d2, d6[2]           @ Left samples for the next pass
++        vdup.8      d3, d6[3]
++        vrshrn.i16  d20, q2, #4         @ Round, >> 4, narrow to bytes
++        vshr.u64    d6, #16             @ Drop the two left samples just consumed
++        vmov        q2, q9
++        vst1.8      {d20}, [r0], r3     @ Even row
++        vrshrn.i16  d20, q8, #4
++        vadd.i16    q8, q2, q0          @ Acc for the next odd row
++        vst1.8      {d20}, [r0], r3     @ Odd row
++        bne         1b
++
++        bx          lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 8x8 16-bit-sample planar (formula above, nTbS = 8); software-pipelined, two rows per pass
++function ff_hevc_rpi_pred_planar_8_neon_10, export=1
++
++        adr         ip, nb_7_0_1_8
++        vld1.16     {q0}, [r1 :128]!    @ Top (left)
++        lsl         r3, #1              @ r3 *= 2 (presumably samples -> bytes)
++        vld1.16     {q1}, [ip :128]     @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
++        add         ip, r2, #16         @ ip = address of left[8]
++        vld1.16     {d4[],d5[]}, [r1]   @ Top (right)
++        mov         r1, #8-2            @ Loop stores rows 0-5; rows 6,7 follow it
++        vshl.s16    q3, q0, #3          @ 8 * top
++        vmovl.u8    q8, d3              @ {1,2,3,4,5,6,7,8}
++        vld1.16     {d18[],d19[]}, [ip] @ Left (lower)
++        vmla.i16    q3, q8, q2          @ Acc set up
++        vsub.i16    q0, q9, q0          @ Add set up
++        vmovl.u8    q1, d2              @ {7,6,5,4,3,2,1,0}
++        vadd.i16    q2, q3, q0          @ Row 0 acc
++
++@ u16  7..0        [1]  q1
++@ u16 left[y]      [1]  [r2]
++@ u16 acc          [1]  q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add          [1]  q0 = p[-1][nTbs] - p[x][-1]
++
++        vld1.16     {d6[],d7[]}, [r2]!  @ left[0]
++        vadd.i16    q8, q2, q0          @ Row 1 acc
++        vld1.16     {d18[],d19[]}, [r2]! @ left[1]
++        vmla.i16    q2, q1, q3          @ Row 0: + (7-x) * left[0]
++        vadd.i16    q3, q8, q0          @ Row 2 acc
++        vmla.i16    q8, q1, q9          @ Row 1: + (7-x) * left[1]
++1:
++        vrshr.u16   q9, q2, #4          @ Finish even row: round and >> 4
++        subs        r1, #2
++        vmov        q2, q3              @ q2 = acc for the next even row
++        vrshr.u16   q10, q8, #4         @ Finish odd row
++          vld1.16     {d6[],d7[]}, [r2]!
++        vst1.16     {q9}, [r0 :128], r3
++          vadd.i16    q8, q2, q0
++          vld1.16     {d18[],d19[]}, [r2]!
++          vmla.i16    q2, q1, q3
++          vadd.i16    q3, q8, q0
++          vmla.i16    q8, q1, q9
++        vst1.16     {q10}, [r0 :128], r3
++        bne         1b
++
++        vrshr.u16   q9, q2, #4          @ Row 6
++        add         r3, r0              @ r3 = address of row 7
++        vrshr.u16   q10, q8, #4         @ Row 7
++        vst1.16     {q9}, [r0 :128]
++        vst1.16     {q10}, [r3 :128]
++
++        bx         lr
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ Data - has to be in two lumps to ensure we can always reach using adr
++
++        .balign 64
++
++nb_31_0_1_32:
++        .byte   31, 30, 29, 28, 27, 26, 25, 24
++        .byte   23, 22, 21, 20, 19, 18, 17, 16
++nb_15_0_1_16:
++        .byte   15, 14, 13, 12, 11, 10,  9,  8
++        .byte    7,  6,  5,  4,  3,  2,  1,  0
++        .byte    1,  2,  3,  4,  5,  6,  7,  8
++        .byte    9, 10, 11, 12, 13, 14, 15, 16
++        .byte   17, 18, 19, 20, 21, 22, 23, 24
++        .byte   25, 26, 27, 28, 29, 30, 31, 32
++
++        @ should be back on a 64-byte boundary here
++
++        @ These could be extracted from the above array, but are kept
++        @ separate for better (16 byte) alignment
++nb_3_0_1_4:
++        .byte    3,  2,  1,  0,  3,  2,  1,  0
++        .byte    1,  2,  3,  4,  1,  2,  3,  4
++nb_7_0_1_8:
++        .byte    7,  6,  5,  4,  3,  2,  1,  0
++        .byte    1,  2,  3,  4,  5,  6,  7,  8
++nbh_3_0_1_4:
++        .short   3,  2,  1,  0,  1,  2,  3,  4
++
++@------------------------------------------------------------------------------
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16x16 8-bit planar (formula above, nTbS = 16); two rows are produced per loop pass
++function ff_hevc_rpi_pred_planar_16_neon_8, export=1
++
++        adr         ip, nb_15_0_1_16 + 16
++        vld1.8      {q0}, [r1 :128]!    @ Top (left)
++        add         r2, #16             @ r2 = address of left[16]
++        vld1.8      {q1}, [ip :128]     @ {1,2,3...16}
++        vld1.8      {d4[]}, [r1]        @ Top (right)
++        sub         ip, #16             @ ip = start of the {15..0} table
++        vshll.u8    q3, d0, #4          @ 16 * top (low half)
++        mov         r1, #16             @ Row counter
++        vshll.u8    q8, d1, #4          @ 16 * top (high half)
++        vld1.8      {d5[]}, [r2]        @ Left (lower)
++        sub         r2, #16             @ r2 back to left[0]
++        vmlal.u8    q3, d2, d4
++        vmlal.u8    q8, d3, d4          @ Acc set up
++        vsubl.u8    q1, d5, d0
++        vsubl.u8    q0, d5, d1          @ Add set up
++        vld1.8      {q2}, [ip :128]     @ {15,14,13...0}
++
++@ u8  15..0    [1]  q2
++@ u8  left[y]  [1]  [r2]
++@ u16 acc      [2]  q3,q8 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add      [2]  q1,q0 = p[-1][nTbs] - p[x][-1]
++
++        vadd.i16    q3, q1              @ Row 0 acc
++        vadd.i16    q8, q0
++1:
++        vadd.i16    q10, q3, q1         @ Odd row acc (low half)
++        subs        r1, #2
++        vld1.8      {d18[]}, [r2]!      @ left[y]
++        vadd.i16    q11, q8, q0         @ Odd row acc (high half)
++        vld1.8      {d19[]}, [r2]!      @ left[y+1]
++        vmlal.u8    q3, d4, d18         @ Even row: + (15-x) * left[y]
++        vmlal.u8    q8, d5, d18
++        vadd.i16    q12, q10, q1        @ Acc for the next even row
++        vmlal.u8    q10, d4, d19        @ Odd row: + (15-x) * left[y+1]
++        vadd.i16    q13, q11, q0
++        vmlal.u8    q11, d5, d19
++        vrshrn.u16  d18, q3, #5         @ Round, >> 5, narrow to bytes
++        vrshrn.u16  d19, q8, #5
++        vmov        q3, q12
++        vst1.8      {q9}, [r0 :128], r3 @ Even row
++        vrshrn.u16  d18, q10, #5
++        vrshrn.u16  d19, q11, #5
++        vmov        q8, q13
++        vst1.8      {q9}, [r0 :128], r3 @ Odd row
++        bne         1b
++
++        bx          lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 16x16 16-bit-sample planar (formula above, nTbS = 16); two rows are produced per loop pass
++function ff_hevc_rpi_pred_planar_16_neon_10, export=1
++
++        @ Load from bytes & expand later - at the very least this uses less
++        @ memory than having a short table
++        adr         ip, nb_15_0_1_16 + 16
++        vld1.16     {q0-q1}, [r1 :128]! @ Top (left)
++        add         r2, #32             @ r2 = address of left[16]
++        vld1.8      {q2}, [ip :128]     @ {1,2,3...16}
++        lsl         r3, #1              @ r3 *= 2 (presumably samples -> bytes)
++        vld1.16     {d6[],d7[]}, [r1]   @ Top (right)
++        sub         ip, #16             @ ip = start of the {15..0} table
++        vmovl.u8    q8, d4              @ {1..8} widened to 16 bits
++        mov         r1, #16             @ Row counter
++        vshl.i16    q9, q0, #4          @ 16 * top (low half)
++        vmovl.u8    q2, d5              @ {9..16} widened to 16 bits
++        vshl.i16    q10, q1, #4         @ 16 * top (high half)
++        vld1.16     {d22[],d23[]}, [r2] @ Left (lower)
++        sub         r2, #32             @ r2 back to left[0]
++        vld1.8      {q12}, [ip]         @ {15,14,13...0}
++        vmla.i16    q9, q8, q3
++        vmla.i16    q10, q2, q3         @ Acc set up
++        vsub.i16    q0, q11, q0
++        vsub.i16    q1, q11, q1         @ Add set up
++        vadd.i16    q2, q9, q0          @ Row 0 acc
++        vadd.i16    q3, q10, q1
++        vmovl.u8    q8, d24             @ {15..8} widened to 16 bits
++        vmovl.u8    q9, d25             @ {7..0} widened to 16 bits
++
++@ u16  15..0       [2]  q8,q9
++@ u16 left[y]      [2]  [r2]
++@ u16 acc          [2]  q2,q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add          [2]  q0,q1 = p[-1][nTbs] - p[x][-1]
++
++1:
++        vadd.i16    q10, q2, q0         @ Odd row acc (low half)
++        subs        r1, #2
++        vld1.16     {d24[],d25[]}, [r2]! @ left[y]
++        vadd.i16    q11, q3, q1         @ Odd row acc (high half)
++        vld1.16     {d28[],d29[]}, [r2]! @ left[y+1]
++        vmla.i16    q2, q8, q12         @ Even row: + (15-x) * left[y]
++        vmla.i16    q3, q9, q12
++        vadd.i16    q12, q10, q0        @ Acc for the next even row
++        vmla.i16    q10, q8, q14        @ Odd row: + (15-x) * left[y+1]
++        vadd.i16    q13, q11, q1
++        vmla.i16    q11, q9, q14
++        vrshr.u16   q14, q2, #5         @ Round and >> 5
++        vrshr.u16   q15, q3, #5
++        vmov        q2, q12
++        vst1.16     {q14-q15}, [r0 :128], r3  @ Even row
++        vrshr.u16   q14, q10, #5
++        vrshr.u16   q15, q11, #5
++        vmov        q3, q13
++        vst1.16     {q14-q15}, [r0 :128], r3  @ Odd row
++        bne         1b
++
++        bx         lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 32x32 8-bit planar (formula above, nTbS = 32); even rows use q8-q11, odd rows q0-q3
++function ff_hevc_rpi_pred_planar_32_neon_8, export=1
++
++        vld1.8      {q0-q1}, [r1 :128]! @ Top (left)
++        adr         ip, nb_31_0_1_32 + 32
++        vpush       {d8-d12}            @ d8-d12 are restored by the vpop below
++        vld1.8      {q2-q3}, [ip :128]  @ {1,2,3...32}
++        add         r2, #32             @ r2 = address of left[32]
++        vld1.8      {d8[]}, [r1]        @ Top (right)
++        sub         ip, #32             @ ip = start of the {31..0} table
++        vshll.u8    q8, d0, #5          @ 32 * top
++        mov         r1, #32             @ Row counter
++        vld1.8      {d9[]}, [r2]        @ Left (lower)
++        sub         r2, #32             @ r2 back to left[0]
++        vshll.u8    q9, d1, #5
++        vshll.u8    q10, d2, #5
++        vshll.u8    q11, d3, #5
++        vmlal.u8    q8, d4, d8
++        vsubl.u8    q12, d9, d0
++        vmlal.u8    q9, d5, d8
++        vsubl.u8    q13, d9, d1
++        vmlal.u8    q10, d6, d8
++        vsubl.u8    q14, d9, d2
++        vmlal.u8    q11, d7, d8         @ Acc set up
++        vsubl.u8    q15, d9, d3         @ Add set up
++        vadd.i16    q8, q12             @ Row 0 acc
++        vadd.i16    q9, q13
++        vadd.i16    q10, q14
++        vadd.i16    q11, q15
++        vld1.8      {q4-q5}, [ip :128]  @ {31,30,29...0}
++
++@ u8  31..0    [2]  q4,q5
++@ u8  left[y]  [2]  [r2]
++@ u16 acc      [4]  q8-q11  = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add      [4]  q12-q15 = p[-1][nTbs] - p[x][-1]
++
++        vld1.8      {d12[]}, [r2]!      @ left[0]
++        vadd.i16    q0, q8, q12         @ Row 1 acc (first quarter)
++        b           2f                  @ Enter the pipelined loop after its store stage
++1:
++          vld1.8      {d12[]}, [r2]!    @ Left sample for the next even row
++        vrshrn.u16  d3, q1, #6          @ Narrow the previous odd row into q1-q2
++        vrshrn.u16  d2, q0, #6
++          vadd.i16    q0, q8, q12
++        vrshrn.u16  d4, q2, #6
++        vrshrn.u16  d5, q3, #6
++        vst1.8      {q1-q2}, [r0 :128], r3  @ Odd row
++2:        vadd.i16    q1, q9, q13
++          subs        r1, #2
++          vadd.i16    q2, q10, q14
++          vadd.i16    q3, q11, q15
++          vmlal.u8    q8, d8, d12       @ Even row: + (31-x) * left[y]
++          vmlal.u8    q9, d9, d12
++          vmlal.u8    q10, d10, d12
++          vmlal.u8    q11, d11, d12
++            vld1.8      {d12[]}, [r2]!  @ Left sample for the odd row
++          vrshrn.u16  d19, q9, #6       @ Round, >> 6, narrow into q9-q10
++          vrshrn.u16  d18, q8, #6
++            vadd.i16    q8, q0, q12     @ Acc for the next even row
++          vrshrn.u16  d20, q10, #6
++          vrshrn.u16  d21, q11, #6
++          vst1.8      {q9-q10}, [r0 :128], r3  @ Even row
++            vadd.i16    q9, q1, q13
++            vadd.i16    q10, q2, q14
++            vadd.i16    q11, q3, q15
++            vmlal.u8    q0, d8, d12     @ Odd row: + (31-x) * left[y+1]
++            vmlal.u8    q1, d9, d12
++            vmlal.u8    q2, d10, d12
++            vmlal.u8    q3, d11, d12
++
++        bne         1b
++
++        vpop        {d8-d12}
++
++        vrshrn.u16  d3, q1, #6          @ Narrow and store the final odd row
++        vrshrn.u16  d2, q0, #6
++        vrshrn.u16  d4, q2, #6
++        vrshrn.u16  d5, q3, #6
++        vst1.8      {q1-q2}, [r0 :128]
++
++        bx          lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_32_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++@ 32x32 16-bit-sample planar (formula above, nTbS = 32); d0[0] doubles as scratch for left[y]
++function ff_hevc_rpi_pred_planar_32_neon_10, export=1
++
++        @ Load from bytes & expand later - at the very least this uses less
++        @ memory than having a short table
++        vld1.16     {q0-q1}, [r1 :128]!  @ Top (left)
++        adr         ip, nb_31_0_1_32 + 32
++        vpush       {q4-q7}              @ q4-q7 are restored by the vpop below
++        vld1.16     {q2-q3}, [r1 :128]!  @ Top (centre)
++        add         r2, #64              @ r2 = address of left[32]
++        vld1.8      {q14-q15}, [ip :128] @ {1,2,3...32}
++T       lsl         r3, #1
++        vld1.16     {d8[],d9[]}, [r1]    @ Top (right)
++        sub         ip, #32              @ ip = start of the {31..0} table
++        vmovl.u8    q12, d28             @ Widen {1..32} to 16 bits in q12-q15
++        mov         r1, #32              @ Row counter
++        vmovl.u8    q13, d29
++        vld1.8      {q6-q7}, [ip :128]   @ {31,30,29...0}
++        vmovl.u8    q14, d30
++        vmovl.u8    q15, d31
++        vld1.16     {d10[],d11[]}, [r2]  @ Left (lower)
++        sub         r2, #64              @ r2 back to left[0]
++        vshl.i16    q8, q0, #5           @ 32 * top
++        vshl.i16    q9, q1, #5
++        vshl.i16    q10, q2, #5
++        vshl.i16    q11, q3, #5
++        vmla.i16    q8, q12, q4
++        vsub.i16    q0, q5, q0
++        vmla.i16    q9, q13, q4
++        vsub.i16    q1, q5, q1
++        vmla.i16    q10, q14, q4
++        vmov.u16    ip, d0[0]            @ Save add lane 0; d0[0] is overwritten in the loop
++        vsub.i16    q2, q5, q2
++        vmla.i16    q11, q15, q4         @ Acc set up
++        vsub.i16    q3, q5, q3           @ Add set up
++        vadd.i16    q8, q0               @ Row 0 acc
++        vadd.i16    q9, q1
++        vadd.i16    q10, q2
++        vadd.i16    q11, q3
++        vmovl.u8    q4, d12              @ Widen {31..0} to 16 bits in q4-q7
++        vmovl.u8    q5, d13
++        vmovl.u8    q6, d14
++        vmovl.u8    q7, d15
++
++@ u16 31..0    [4]  q4-q7
++@ u16 left[y]  [4]  [r2]
++@ u16 acc      [4]  q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
++@ u16 add      [4]  q0-q3  = p[-1][nTbs] - p[x][-1]
++
++        vadd.i16    q12, q8, q0          @ Row 1 acc (first quarter)
++A       sub         r0, r0, r3, lsl #1
++T       sub         r0, r3
++1:
++        vld1.16     {d0[0]}, [r2]!       @ left[y] into the scratch lane
++A       add         r0, r0, r3, lsl #1
++T       add         r0, r3
++        vadd.i16    q13, q9, q1
++        subs        r1, #2
++        vadd.i16    q14, q10, q2
++        vadd.i16    q15, q11, q3
++        vmla.i16    q8, q4, d0[0]        @ Even row: + (31-x) * left[y]
++        vmla.i16    q9, q5, d0[0]
++        vmla.i16    q10, q6, d0[0]
++        vmla.i16    q11, q7, d0[0]
++        vmov.16     d0[0], ip            @ Restore add lane 0
++        vrshr.u16   q8, #6               @ Round and >> 6
++        vrshr.u16   q9, #6
++        vrshr.u16   q10, #6
++        vrshr.u16   q11, #6
++        vstm        r0, {q8-q11}         @ Even row (64 bytes)
++        vadd.i16    q8, q12, q0          @ Acc for the next even row
++A       add         r0, r0, r3, lsl #1
++T       add         r0, r3
++        vld1.16     {d0[0]}, [r2]!       @ left[y+1] into the scratch lane
++        vadd.i16    q9, q13, q1
++        vadd.i16    q10, q14, q2
++        vadd.i16    q11, q15, q3
++        vmla.i16    q12, q4, d0[0]       @ Odd row: + (31-x) * left[y+1]
++        vmla.i16    q13, q5, d0[0]
++        vmla.i16    q14, q6, d0[0]
++        vmla.i16    q15, q7, d0[0]
++        vmov.16     d0[0], ip            @ Restore add lane 0
++        vrshr.u16   q12, #6
++        vrshr.u16   q13, #6
++        vrshr.u16   q14, #6
++        vrshr.u16   q15, #6
++        vstm        r0, {q12-q15}        @ Odd row (64 bytes)
++        vadd.i16    q12, q8, q0          @ Acc for the next odd row (first quarter)
++        bne         1b
++
++        vpop        {q4-q7}
++        bx          lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
++        @ 4 rows of 4 two-byte elements (presumably interleaved chroma pairs), 8-bit samples
++        vld1.8      {q0}, [r1]          @ Top
++        adr         ip, nbx2_3_0_1_4
++        vldr        d2, [r2, #8]        @ Left (lower)
++        mov         r1, #4              @ Row counter
++        vldr        d3, [ip, #8]        @ {1,1,2,2,3,3,4,4}
++        lsl         r3, #1              @ r3 = 2 * stride, the row step used by the stores
++        vshll.u8    q2, d0, #2          @ 4 * top, widened to u16
++        vdup.16     d1, d1[0]           @ {t4,t4,t4,t4,t4,t4,t4,t4}
++        vdup.16     d2, d2[0]           @ {l4,l4,l4,l4,l4,l4,l4,l4}
++        vldr        d6, [r2]            @ Left (upper)
++        vmlal.u8    q2, d3, d1          @ Acc set up
++        vsubl.u8    q0, d2, d0          @ Add set up
++        vldr        d7, [ip]            @ {3,3,2,2,1,1,0,0}
++
++@ u8   3..0    [1]  d7
++@ u8  left[y]  [1]  d6
++@ u16 acc      [2]  q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
++@ u16 add      [2]  q0 = p[-1][nTbs] - p[x][-1]
++
++        vdup.16     d2, d6[0]           @ left[0], replicated
++        vadd.i16    q2, q0              @ Acc for row 0
++        vdup.16     d3, d6[1]           @ left[1], replicated
++        vadd.i16    q8, q2, q0          @ Acc for row 1
++1:
++        vmlal.u8    q2, d7, d2          @ Even row += {3..0} * left[y]
++        subs        r1, #2              @ Two rows are produced per pass
++        vadd.i16    q9, q8, q0          @ Acc for the next even row
++        vmlal.u8    q8, d7, d3          @ Odd row += {3..0} * left[y+1]
++        vdup.16     d2, d6[2]           @ left[2], replicated (for the second pass)
++        vdup.16     d3, d6[3]           @ left[3], replicated (for the second pass)
++        vrshrn.i16  d20, q2, #3         @ Round, shift right by 3 and narrow to u8
++        vmov        q2, q9
++        vst1.8      {d20}, [r0], r3     @ Store even row
++        vrshrn.i16  d20, q8, #3
++        vadd.i16    q8, q2, q0          @ Acc for the next odd row
++        vst1.8      {d20}, [r0], r3     @ Store odd row
++        bne         1b
++
++        bx          lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_4_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
++        @ 4 rows of 4 elements, each a pair of 16-bit samples; fully unrolled (no loop)
++        adr         ip, nbx2_3_0_1_4
++        vld1.16     {q0}, [r1 :128]!    @ Top (left)
++        lsl         r3, #2              @ r3 = 4 * stride, the row step used by the stores
++        vld1.16     {q1}, [ip :128]     @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4}
++        add         ip, r2, #16         @ ip -> left[4]
++        vld1.32     {d4[],d5[]}, [r1]   @ Top (right)
++        vshl.s16    q3, q0, #2          @ 4 * top
++        vmovl.u8    q8, d3              @ {1,1,2,2,3,3,4,4}
++        vld1.32     {d18[],d19[]}, [ip] @ Left (lower)
++        vmla.i16    q3, q8, q2          @ Acc set up
++        vsub.i16    q0, q9, q0          @ Add set up
++        vmovl.u8    q1, d2              @ {3,3,2,2,1,1,0,0}
++        vadd.i16    q2, q3, q0          @ Acc for row 0
++
++@ u16  3..0        [1]  q1
++@ u32 left[y]      [1]  [r2]
++@ u16 acc          [1]  q3 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
++@ u16 add          [1]  q0 = p[-1][nTbs] - p[x][-1]
++
++        vld1.32     {d6[],d7[]}, [r2]!  @ left[0], replicated (q3 is free once q2 holds the acc)
++        vadd.i16    q8, q2, q0          @ Acc for row 1
++        vld1.32     {d18[],d19[]}, [r2]! @ left[1], replicated
++        vmla.i16    q2, q1, q3          @ Row 0 += {3..0} * left[0]
++        vadd.i16    q3, q8, q0          @ Acc for row 2
++        vmla.i16    q8, q1, q9          @ Row 1 += {3..0} * left[1]
++
++        vrshr.u16   q9, q2, #3          @ Row 0 result: round and shift right by 3
++        vmov        q2, q3
++        vrshr.u16   q10, q8, #3         @ Row 1 result
++          vld1.32     {d6[],d7[]}, [r2]!  @ left[2], replicated
++        vst1.16     {q9}, [r0 :128], r3
++          vadd.i16    q8, q2, q0          @ Acc for row 3
++          vld1.32     {d18[],d19[]}, [r2]! @ left[3], replicated
++          vmla.i16    q2, q1, q3          @ Row 2 += {3..0} * left[2]
++          vadd.i16    q3, q8, q0          @ Result is not used after this point
++          vmla.i16    q8, q1, q9          @ Row 3 += {3..0} * left[3]
++        vst1.16     {q10}, [r0 :128], r3
++
++          vrshr.u16   q9, q2, #3          @ Row 2 result
++          add         r3, r0              @ r3 -> row 3
++          vrshr.u16   q10, q8, #3         @ Row 3 result
++          vst1.16     {q9}, [r0 :128]
++          vst1.16     {q10}, [r3 :128]
++
++          bx         lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
++        @ 8 rows of 8 two-byte elements (presumably interleaved chroma pairs), 8-bit samples
++        adr         ip, nbx2_7_0_1_8 + 16
++        vld1.8      {q0}, [r1 :128]!    @ Top (left)
++        add         r2, #16             @ r2 -> left[8]
++        vld1.8      {q1}, [ip: 128]     @ {1,1,2,2,3,3...8,8}
++        lsl         r3, #1              @ r3 = 2 * stride, the row step used by the stores
++        vld1.16     {d4[]}, [r1]        @ Top (right)
++        sub         ip, #16             @ ip -> {7,7...0,0}
++        vshll.u8    q3, d0, #3          @ 8 * top, widened to u16
++        mov         r1, #8              @ Row counter
++        vshll.u8    q8, d1, #3
++        vld1.16     {d5[]}, [r2]        @ Left (lower)
++        sub         r2, #16             @ r2 -> left[0] again
++        vmlal.u8    q3, d2, d4
++        vmlal.u8    q8, d3, d4          @ Acc set up
++        vsubl.u8    q1, d5, d0
++        vsubl.u8    q0, d5, d1          @ Add set up
++        vld1.8      {q2}, [ip :128]     @ {7,7,6,6,5,5...0,0}
++
++@ u8  7..0     [1]  q2
++@ u8  left[y]  [1]  [r2]
++@ u16 acc      [2]  q3,q8 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add      [2]  q1,q0 = p[-1][nTbs] - p[x][-1]
++
++        vadd.i16    q3, q1              @ Acc for row 0
++        vadd.i16    q8, q0
++1:
++        vadd.i16    q10, q3, q1         @ Acc for the odd row
++        subs        r1, #2              @ Two rows are produced per pass
++        vld1.16     {d18[]}, [r2]!      @ left[y], replicated
++        vadd.i16    q11, q8, q0
++        vld1.16     {d19[]}, [r2]!      @ left[y+1], replicated
++        vmlal.u8    q3, d4, d18         @ Even row += {7..0} * left[y]
++        vmlal.u8    q8, d5, d18
++        vadd.i16    q12, q10, q1        @ Acc for the next even row
++        vmlal.u8    q10, d4, d19        @ Odd row += {7..0} * left[y+1]
++        vadd.i16    q13, q11, q0
++        vmlal.u8    q11, d5, d19
++        vrshrn.u16  d18, q3, #4         @ Round, shift right by 4 and narrow to u8
++        vrshrn.u16  d19, q8, #4
++        vmov        q3, q12
++        vst1.8      {q9}, [r0 :128], r3 @ Store even row
++        vrshrn.u16  d18, q10, #4
++        vrshrn.u16  d19, q11, #4
++        vmov        q8, q13
++        vst1.8      {q9}, [r0 :128], r3 @ Store odd row
++        bne         1b
++
++        bx          lr
++
++endfunc
++
++
++@------------------------------------------------------------------------------
++@
++@ Data - has to be in two lumps to ensure we can always reach using adr
++
++        .balign 64
++
++nbx2_15_0_1_16:                                 @ {15,15...8,8}, runs on into nbx2_7_0_1_8
++        .byte   15, 15, 14, 14, 13, 13, 12, 12
++        .byte   11, 11, 10, 10,  9,  9,  8,  8
++nbx2_7_0_1_8:                                   @ {7,7...0,0} followed by {1,1...16,16}
++        .byte    7,  7,  6,  6,  5,  5,  4,  4
++        .byte    3,  3,  2,  2,  1,  1,  0,  0
++        .byte    1,  1,  2,  2,  3,  3,  4,  4
++        .byte    5,  5,  6,  6,  7,  7,  8,  8
++        .byte    9,  9, 10, 10, 11, 11, 12, 12
++        .byte   13, 13, 14, 14, 15, 15, 16, 16
++
++        @ should be back on a 64-byte boundary here (the two tables above total 64 bytes)
++
++nbx2_3_0_1_4:                                   @ {3,3...0,0} followed by {1,1...4,4}
++        .byte    3,  3,  2,  2,  1,  1,  0,  0
++        .byte    1,  1,  2,  2,  3,  3,  4,  4
++
++@------------------------------------------------------------------------------
++
++
++@ ff_hevc_rpi_pred_planar_c_8_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
++        @ 8 rows of 8 elements, each a pair of 16-bit samples; two rows per loop pass
++        @ Load from bytes & expand later - at the very least this uses less
++        @ memory than having a short table
++        adr         ip, nbx2_7_0_1_8 + 16
++        vld1.16     {q0-q1}, [r1 :128]! @ Top (left)
++        add         r2, #32             @ r2 -> left[8]
++        vld1.8      {q2}, [ip :128]     @ {1,1,2,2,3,3...8,8}
++        lsl         r3, #2              @ r3 = 4 * stride, the row step used by the stores
++        vld1.32     {d6[],d7[]}, [r1]   @ Top (right)
++        sub         ip, #16             @ ip -> {7,7...0,0}
++        vmovl.u8    q8, d4              @ {1,1,2,2,3,3,4,4} as u16
++        mov         r1, #8              @ Row counter
++        vshl.i16    q9, q0, #3          @ 8 * top
++        vmovl.u8    q2, d5              @ {5,5,6,6,7,7,8,8} as u16
++        vshl.i16    q10, q1, #3
++        vld1.32     {d22[],d23[]}, [r2] @ Left (lower)
++        sub         r2, #32             @ r2 -> left[0] again
++        vld1.8      {q12}, [ip]         @ {7,7,6,6,5,5...0,0}
++        vmla.i16    q9, q8, q3
++        vmla.i16    q10, q2, q3         @ Acc set up
++        vsub.i16    q0, q11, q0
++        vsub.i16    q1, q11, q1         @ Add set up
++        vadd.i16    q2, q9, q0          @ Acc for row 0
++        vadd.i16    q3, q10, q1
++        vmovl.u8    q8, d24             @ {7,7,6,6,5,5,4,4} as u16
++        vmovl.u8    q9, d25             @ {3,3,2,2,1,1,0,0} as u16
++
++@ u16  7..0        [2]  q8,q9
++@ u32 left[y]      [2]  [r2]
++@ u16 acc          [2]  q2,q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
++@ u16 add          [2]  q0,q1 = p[-1][nTbs] - p[x][-1]
++
++1:
++        vadd.i16    q10, q2, q0         @ Acc for the odd row
++        subs        r1, #2              @ Two rows are produced per pass
++        vld1.32     {d24[],d25[]}, [r2]! @ left[y], replicated
++        vadd.i16    q11, q3, q1
++        vld1.32     {d28[],d29[]}, [r2]! @ left[y+1], replicated
++        vmla.i16    q2, q8, q12         @ Even row += {7..0} * left[y]
++        vmla.i16    q3, q9, q12
++        vadd.i16    q12, q10, q0        @ Acc for the next even row
++        vmla.i16    q10, q8, q14        @ Odd row += {7..0} * left[y+1]
++        vadd.i16    q13, q11, q1
++        vmla.i16    q11, q9, q14
++        vrshr.u16   q14, q2, #4         @ Round and shift right by 4
++        vrshr.u16   q15, q3, #4
++        vmov        q2, q12
++        vst1.16     {q14-q15}, [r0 :128], r3 @ Store even row
++        vrshr.u16   q14, q10, #4
++        vrshr.u16   q15, q11, #4
++        vmov        q3, q13
++        vst1.16     {q14-q15}, [r0 :128], r3 @ Store odd row
++        bne         1b
++
++        bx         lr
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_8
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
++        @ 16 rows of 16 two-byte elements (presumably interleaved chroma pairs), 8-bit samples
++        vld1.8      {q0-q1}, [r1 :128]! @ Top (left)
++        adr         ip, nbx2_15_0_1_16 + 32
++        vpush       {d8-d12}            @ Saved here and restored by the vpop after the loop
++        vld1.8      {q2-q3}, [ip :128]  @ {1,1,2,2,3,3...16,16}
++        add         r2, #32             @ r2 -> left[16]
++        vld1.16     {d8[]}, [r1]        @ Top (right)
++        sub         ip, #32             @ ip -> {15,15...0,0}
++        vshll.u8    q8, d0, #4          @ 16 * top, widened to u16
++        mov         r1, #16             @ Row counter
++        vld1.16     {d9[]}, [r2]        @ Left (lower)
++        sub         r2, #32             @ r2 -> left[0] again
++        vshll.u8    q9, d1, #4
++        lsl         r3, #1              @ r3 = 2 * stride, the row step used by the stores
++        vshll.u8    q10, d2, #4
++        vshll.u8    q11, d3, #4
++        vmlal.u8    q8, d4, d8
++        vsubl.u8    q12, d9, d0
++        vmlal.u8    q9, d5, d8
++        vsubl.u8    q13, d9, d1
++        vmlal.u8    q10, d6, d8
++        vsubl.u8    q14, d9, d2
++        vmlal.u8    q11, d7, d8         @ Acc set up
++        vsubl.u8    q15, d9, d3         @ Add set up
++        vadd.i16    q8, q12             @ Acc for row 0
++        vadd.i16    q9, q13
++        vadd.i16    q10, q14
++        vadd.i16    q11, q15
++        vld1.8      {q4-q5}, [ip :128]  @ {15,15,14,14,13,13...0,0}
++
++@ u8  15..0    [2]  q4,q5
++@ u8  left[y]  [2]  [r2]
++@ u16 acc      [4]  q8-q11  = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add      [4]  q12-q15 = p[-1][nTbs] - p[x][-1]
++
++        vld1.16     {d12[]}, [r2]!      @ left[0], replicated
++        vadd.i16    q0, q8, q12         @ Start the acc for row 1
++        b           2f                  @ Enter the software-pipelined loop part-way through
++1:
++          vld1.16     {d12[]}, [r2]!    @ left[y] for the next even row
++        vrshrn.u16  d3, q1, #5          @ d2/d3 alias q1, so q1 is narrowed before q0 is written to d2
++        vrshrn.u16  d2, q0, #5
++          vadd.i16    q0, q8, q12
++        vrshrn.u16  d4, q2, #5
++        vrshrn.u16  d5, q3, #5
++        vst1.8      {q1-q2}, [r0 :128], r3 @ Store the odd row from the previous pass
++2:        vadd.i16    q1, q9, q13
++          subs        r1, #2            @ Two rows are produced per pass
++          vadd.i16    q2, q10, q14
++          vadd.i16    q3, q11, q15
++          vmlal.u8    q8, d8, d12       @ Even row += {15..0} * left[y]
++          vmlal.u8    q9, d9, d12
++          vmlal.u8    q10, d10, d12
++          vmlal.u8    q11, d11, d12
++            vld1.16     {d12[]}, [r2]!  @ left[y+1] for the odd row
++          vrshrn.u16  d19, q9, #5       @ d18/d19 alias q9, so q9 is narrowed before q8 is written to d18
++          vrshrn.u16  d18, q8, #5
++            vadd.i16    q8, q0, q12     @ Acc for the next even row
++          vrshrn.u16  d20, q10, #5
++          vrshrn.u16  d21, q11, #5
++          vst1.8      {q9-q10}, [r0 :128], r3 @ Store the even row
++            vadd.i16    q9, q1, q13
++            vadd.i16    q10, q2, q14
++            vadd.i16    q11, q3, q15
++            vmlal.u8    q0, d8, d12     @ Odd row += {15..0} * left[y+1]
++            vmlal.u8    q1, d9, d12
++            vmlal.u8    q2, d10, d12
++            vmlal.u8    q3, d11, d12
++
++        bne         1b
++
++        vpop        {d8-d12}            @ d8-d12 are not read again below
++
++        vrshrn.u16  d3, q1, #5          @ Narrow the final odd row, still pending in q0-q3
++        vrshrn.u16  d2, q0, #5
++        vrshrn.u16  d4, q2, #5
++        vrshrn.u16  d5, q3, #5
++        vst1.8      {q1-q2}, [r0 :128]
++
++        bx          lr
++
++endfunc
++
++
++@ ff_hevc_rpi_pred_planar_c_16_neon_10
++@       uint8_t *_src,          [r0]
++@       const uint8_t *_top,    [r1]
++@       const uint8_t *_left,   [r2]
++@       ptrdiff_t stride)       [r3]
++
++function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
++        @ 16 rows of 16 elements, each a pair of 16-bit samples; two rows per loop pass
++        @ Load from bytes & expand later - at the very least this uses less
++        @ memory than having a short table
++        vld1.16     {q0-q1}, [r1 :128]!  @ Top (left)
++        adr         ip, nbx2_15_0_1_16 + 32
++        vpush       {q4-q7}              @ Restored by the vpop at the end
++        vld1.16     {q2-q3}, [r1 :128]!  @ Top (centre)
++        add         r2, #64              @ r2 -> left[16]
++        vld1.8      {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
++T       lsl         r3, #2               @ T lines pre-scale r3 once; A lines apply lsl #2 at each use
++        vld1.32     {d8[],d9[]}, [r1]    @ Top (right)
++        sub         ip, #32              @ ip -> {15,15...0,0}
++        vmovl.u8    q12, d28             @ Expand {1,1...16,16} to u16 in q12-q15
++        mov         r1, #16              @ Row counter
++        vmovl.u8    q13, d29
++        vld1.8      {q6-q7}, [ip :128]   @ {15,15,14,14,13,13...0,0}
++        vmovl.u8    q14, d30
++        vmovl.u8    q15, d31
++        vld1.32     {d10[],d11[]}, [r2]  @ Left (lower)
++        sub         r2, #64              @ r2 -> left[0] again
++        vshl.i16    q8, q0, #4           @ 16 * top
++        vshl.i16    q9, q1, #4
++        vshl.i16    q10, q2, #4
++        vshl.i16    q11, q3, #4
++        vmla.i16    q8, q12, q4
++        vsub.i16    q0, q5, q0
++        vmla.i16    q9, q13, q4
++        vpush       {q0}                 @ Keep add[0] at [sp]: q0 doubles as the left[y] register in the loop
++        vsub.i16    q1, q5, q1
++        vmla.i16    q10, q14, q4
++        vsub.i16    q2, q5, q2
++        vmla.i16    q11, q15, q4         @ Acc set up
++        vsub.i16    q3, q5, q3           @ Add set up
++        vadd.i16    q8, q0               @ Acc for row 0
++        vadd.i16    q9, q1
++        vadd.i16    q10, q2
++        vadd.i16    q11, q3
++        vmovl.u8    q4, d12              @ Expand {15,15...0,0} to u16 in q4-q7
++        vmovl.u8    q5, d13
++        vmovl.u8    q6, d14
++        vmovl.u8    q7, d15
++
++@ u16 15..0    [4]  q4-q7
++@ u32 left[y]  [4]  [r2]
++@ u16 acc      [4]  q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
++@ u16 add      [4]  q0-q3  = p[-1][nTbs] - p[x][-1]
++
++        vadd.i16    q12, q8, q0          @ Start the acc for row 1
++A       sub         r0, r0, r3, lsl #2   @ Step back one row; the loop adds it back before each store
++T       sub         r0, r3
++1:
++        vld1.32     {d0[],d1[]}, [r2]!   @ left[y], replicated (overwrites add[0])
++A       add         r0, r0, r3, lsl #2
++T       add         r0, r3
++        vadd.i16    q13, q9, q1
++        subs        r1, #2               @ Two rows are produced per pass
++        vadd.i16    q14, q10, q2
++        vadd.i16    q15, q11, q3
++        vmla.i16    q8, q4, q0           @ Even row += {15..0} * left[y]
++        vmla.i16    q9, q5, q0
++        vmla.i16    q10, q6, q0
++        vmla.i16    q11, q7, q0
++        vld1.16     {q0}, [sp]           @ Reload add[0]
++        vrshr.u16   q8, #5               @ Round and shift right by 5
++        vrshr.u16   q9, #5
++        vrshr.u16   q10, #5
++        vrshr.u16   q11, #5
++        vstm        r0, {q8-q11}         @ Store even row (no writeback, hence the explicit adds)
++        vadd.i16    q8, q12, q0          @ Acc for the next even row
++A       add         r0, r0, r3, lsl #2
++T       add         r0, r3
++        vld1.32     {d0[],d1[]}, [r2]!   @ left[y+1], replicated
++        vadd.i16    q9, q13, q1
++        vadd.i16    q10, q14, q2
++        vadd.i16    q11, q15, q3
++        vmla.i16    q12, q4, q0          @ Odd row += {15..0} * left[y+1]
++        vmla.i16    q13, q5, q0
++        vmla.i16    q14, q6, q0
++        vmla.i16    q15, q7, q0
++        vld1.16     {q0}, [sp]           @ Reload add[0]
++        vrshr.u16   q12, #5
++        vrshr.u16   q13, #5
++        vrshr.u16   q14, #5
++        vrshr.u16   q15, #5
++        vstm        r0, {q12-q15}        @ Store odd row
++        vadd.i16    q12, q8, q0          @ Start the acc for the next odd row
++        bne         1b
++
++        vpop        {q3-q7}              @ q3 takes the slot pushed from q0 (value unused); q4-q7 are restored
++        bx          lr
++
++endfunc
+diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
+index 2cca784f5a..48cb816b70 100644
+--- a/libavcodec/arm/vc1dsp_init_neon.c
++++ b/libavcodec/arm/vc1dsp_init_neon.c
+@@ -19,6 +19,7 @@
+ #include <stdint.h>
+ 
+ #include "libavutil/attributes.h"
++#include "libavutil/intreadwrite.h"
+ #include "libavcodec/vc1dsp.h"
+ #include "vc1dsp.h"
+ 
+@@ -32,6 +33,13 @@ void ff_vc1_inv_trans_4x8_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc
+ void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+ void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
+ 
++void ff_vc1_v_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter4_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter8_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_v_loop_filter16_neon(uint8_t *src, int stride, int pq);
++void ff_vc1_h_loop_filter16_neon(uint8_t *src, int stride, int pq);
++
+ void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
+                            ptrdiff_t line_size, int rnd);
+ 
+@@ -77,6 +85,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+ void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
+                                 int h, int x, int y);
+ 
++int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
++
+static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
+{
+    /* Copy size bytes from src to dst, dropping the 0x03 byte of every escape
+     * sequence, and return the number of bytes written. Starting, stopping and
+     * removing escape bytes are less time-sensitive, so they are done in this C
+     * wrapper around the assembly inner loop. Assumes a little-endian machine that supports unaligned loads. */
+    int dsize = 0;                        /* bytes written to dst so far */
+    while (size >= 4)                     /* the escape test reads a 4-byte window */
+    {
+        int found = 0;
+        while (!found && (((uintptr_t) dst) & 7) && size >= 4) /* byte-wise until dst is 8-byte aligned */
+        {
+            found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* bytes 00 00 03 followed by a byte <= 03 */
+            if (!found)
+            {
+                *dst++ = *src++;
+                --size;
+                ++dsize;
+            }
+        }
+        if (!found)
+        {
+            int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst); /* helper's return value is treated as the bytes it left unprocessed */
+            dst += skip;
+            src += skip;
+            size -= skip;
+            dsize += skip;
+            while (!found && size >= 4)   /* byte-wise from where the helper stopped, up to the next escape */
+            {
+                found = (AV_RL32(src) &~ 0x03000000) == 0x00030000; /* same escape test as above */
+                if (!found)
+                {
+                    *dst++ = *src++;
+                    --size;
+                    ++dsize;
+                }
+            }
+        }
+        if (found)
+        {
+            *dst++ = *src++;              /* keep the two leading 00 bytes */
+            *dst++ = *src++;
+            ++src;                        /* drop the 03 escape byte */
+            size -= 3;
+            dsize += 2;
+        }
+    }
+    while (size > 0)                      /* fewer than 4 bytes left: no escape test is applied, copy verbatim */
+    {
+        *dst++ = *src++;
+        --size;
+        ++dsize;
+    }
+    return dsize;
+}
++
+ #define FN_ASSIGN(X, Y) \
+     dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
+     dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
+@@ -92,6 +158,13 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
+     dsp->vc1_inv_trans_8x4_dc = ff_vc1_inv_trans_8x4_dc_neon;
+     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
+ 
++    dsp->vc1_v_loop_filter4  = ff_vc1_v_loop_filter4_neon;
++    dsp->vc1_h_loop_filter4  = ff_vc1_h_loop_filter4_neon;
++    dsp->vc1_v_loop_filter8  = ff_vc1_v_loop_filter8_neon;
++    dsp->vc1_h_loop_filter8  = ff_vc1_h_loop_filter8_neon;
++    dsp->vc1_v_loop_filter16 = ff_vc1_v_loop_filter16_neon;
++    dsp->vc1_h_loop_filter16 = ff_vc1_h_loop_filter16_neon;
++
+     dsp->put_vc1_mspel_pixels_tab[1][ 0] = ff_put_pixels8x8_neon;
+     FN_ASSIGN(1, 0);
+     FN_ASSIGN(2, 0);
+@@ -116,4 +189,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
+     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
+     dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
+     dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
++
++    dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
+ }
+diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
+index 93f043bf08..96014fbebc 100644
+--- a/libavcodec/arm/vc1dsp_neon.S
++++ b/libavcodec/arm/vc1dsp_neon.S
+@@ -1161,3 +1161,764 @@ function ff_vc1_inv_trans_4x4_dc_neon, export=1
+         vst1.32         {d1[1]},  [r0,:32]
+         bx              lr
+ endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of lower block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter4_neon, export=1
++        sub             r3, r0, r1, lsl #2      @ r3 -> 4 rows above r0 (row P1)
++        vldr            d0, .Lcoeffs            @ multipliers: d0[0] is used as 2 and d0[1] as 5 below (table defined elsewhere)
++        vld1.32         {d1[0]}, [r0], r1       @ P5
++        vld1.32         {d2[0]}, [r3], r1       @ P1
++        vld1.32         {d3[0]}, [r3], r1       @ P2
++        vld1.32         {d4[0]}, [r0], r1       @ P6
++        vld1.32         {d5[0]}, [r3], r1       @ P3
++        vld1.32         {d6[0]}, [r0], r1       @ P7
++        vld1.32         {d7[0]}, [r3]           @ P4
++        vld1.32         {d16[0]}, [r0]          @ P8
++        vshll.u8        q9, d1, #1              @ 2*P5
++        vdup.16         d17, r2                 @ pq
++        vshll.u8        q10, d2, #1             @ 2*P1
++        vmovl.u8        q11, d3                 @ P2
++        vmovl.u8        q1, d4                  @ P6
++        vmovl.u8        q12, d5                 @ P3
++        vmls.i16        d20, d22, d0[1]         @ 2*P1-5*P2
++        vmovl.u8        q11, d6                 @ P7
++        vmls.i16        d18, d2, d0[1]          @ 2*P5-5*P6
++        vshll.u8        q2, d5, #1              @ 2*P3
++        vmovl.u8        q3, d7                  @ P4
++        vmla.i16        d18, d22, d0[1]         @ 2*P5-5*P6+5*P7
++        vmovl.u8        q11, d16                @ P8
++        vmla.u16        d20, d24, d0[1]         @ 2*P1-5*P2+5*P3
++        vmovl.u8        q12, d1                 @ P5
++        vmls.u16        d4, d6, d0[1]           @ 2*P3-5*P4
++        vmls.u16        d18, d22, d0[0]         @ 2*P5-5*P6+5*P7-2*P8
++        vsub.i16        d1, d6, d24             @ P4-P5
++        vmls.i16        d20, d6, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
++        vmla.i16        d4, d24, d0[1]          @ 2*P3-5*P4+5*P5
++        vmls.i16        d4, d2, d0[0]           @ 2*P3-5*P4+5*P5-2*P6
++        vabs.s16        d2, d1                  @ abs(P4-P5)
++        vrshr.s16       d3, d18, #3             @ (2*P5-5*P6+5*P7-2*P8+4)>>3
++        vrshr.s16       d5, d20, #3             @ (2*P1-5*P2+5*P3-2*P4+4)>>3
++        vshr.s16        d2, d2, #1              @ clip
++        vrshr.s16       d4, d4, #3              @ (2*P3-5*P4+5*P5-2*P6+4)>>3
++        vabs.s16        d3, d3                  @ a2
++        vshr.s16        d1, d1, #8              @ clip_sign
++        vabs.s16        d5, d5                  @ a1
++        vceq.i16        d7, d2, #0              @ test clip == 0
++        vabs.s16        d16, d4                 @ a0
++        vshr.s16        d4, d4, #8              @ a0_sign
++        vcge.s16        d18, d5, d3             @ test a1 >= a2
++        vcge.s16        d17, d16, d17           @ test a0 >= pq
++        vbsl            d18, d3, d5             @ a3
++        vsub.i16        d1, d1, d4              @ clip_sign - a0_sign
++        vorr            d3, d7, d17             @ test clip == 0 || a0 >= pq
++        vqsub.u16       d4, d16, d18            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        d5, d18, d16            @ test a3 >= a0
++        vmul.i16        d0, d4, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            d4, d3, d5              @ test clip == 0 || a0 >= pq || a3 >= a0
++        vmov.32         r0, d4[1]               @ move to gp reg
++        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vcge.s16        d4, d0, d2              @ test d >= clip
++        tst             r0, #1                  @ bit 0 of r0 comes from 16-bit lane 2, i.e. the third pixel pair
++        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
++        vbsl            d4, d2, d0              @ FFMIN(d, clip)
++        vbic            d0, d4, d3              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmls.i16        d6, d0, d1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vmla.i16        d24, d0, d1             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vqmovun.s16     d0, q3                  @ saturate updated P4 back to u8
++        vqmovun.s16     d1, q12                 @ saturate updated P5 back to u8
++        vst1.32         {d0[0]}, [r3], r1       @ r3 was left pointing at row P4 by the loads above
++        vst1.32         {d1[0]}, [r3]           @ row P5
++1:      bx              lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 4 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of right block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter4_neon, export=1
++        sub             r3, r0, #4              @ where to start reading
++        vldr            d0, .Lcoeffs            @ multipliers: d0[0] is used as 2 and d0[1] as 5 below (table defined elsewhere)
++        vld1.32         {d2}, [r3], r1          @ row 0, P1..P8
++        sub             r0, r0, #1              @ where to start writing
++        vld1.32         {d4}, [r3], r1          @ row 1
++        vld1.32         {d3}, [r3], r1          @ row 2
++        vld1.32         {d5}, [r3]              @ row 3
++        vdup.16         d1, r2                  @ pq
++        vtrn.8          q1, q2                  @ first step of the transpose, completed by the two vtrn.16 below
++        vtrn.16         d2, d3                  @ P1, P5, P3, P7
++        vtrn.16         d4, d5                  @ P2, P6, P4, P8
++        vshll.u8        q3, d2, #1              @ 2*P1, 2*P5
++        vmovl.u8        q8, d4                  @ P2, P6
++        vmovl.u8        q9, d3                  @ P3, P7
++        vmovl.u8        q2, d5                  @ P4, P8
++        vmls.i16        q3, q8, d0[1]           @ 2*P1-5*P2, 2*P5-5*P6
++        vshll.u8        q10, d3, #1             @ 2*P3, 2*P7
++        vmovl.u8        q1, d2                  @ P1, P5
++        vmla.i16        q3, q9, d0[1]           @ 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
++        vmls.i16        q3, q2, d0[0]           @ 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
++        vmov            d2, d3                  @ needs to be in an even-numbered vector for when we come to narrow it later
++        vmls.i16        d20, d4, d0[1]          @ 2*P3-5*P4
++        vmla.i16        d20, d3, d0[1]          @ 2*P3-5*P4+5*P5
++        vsub.i16        d3, d4, d2              @ P4-P5
++        vmls.i16        d20, d17, d0[0]         @ 2*P3-5*P4+5*P5-2*P6
++        vrshr.s16       q3, q3, #3              @ round and shift both outer sums right by 3
++        vabs.s16        d5, d3                  @ abs(P4-P5)
++        vshr.s16        d3, d3, #8              @ clip_sign
++        vrshr.s16       d16, d20, #3            @ (2*P3-5*P4+5*P5-2*P6+4)>>3
++        vabs.s16        q3, q3                  @ a1, a2
++        vshr.s16        d5, d5, #1              @ clip
++        vabs.s16        d17, d16                @ a0
++        vceq.i16        d18, d5, #0             @ test clip == 0
++        vshr.s16        d16, d16, #8            @ a0_sign
++        vcge.s16        d19, d6, d7             @ test a1 >= a2
++        vcge.s16        d1, d17, d1             @ test a0 >= pq
++        vsub.i16        d16, d3, d16            @ clip_sign - a0_sign
++        vbsl            d19, d7, d6             @ a3
++        vorr            d1, d18, d1             @ test clip == 0 || a0 >= pq
++        vqsub.u16       d3, d17, d19            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        d6, d19, d17            @ test a3 >= a0
++        vmul.i16        d0, d3, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            d3, d1, d6              @ test clip == 0 || a0 >= pq || a3 >= a0
++        vmov.32         r2, d3[1]               @ move to gp reg
++        vshr.u16        d0, d0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vcge.s16        d3, d0, d5              @ test d >= clip
++        tst             r2, #1                  @ bit 0 of r2 comes from 16-bit lane 2, i.e. the third pixel pair
++        bne             1f                      @ none of the 4 pixel pairs should be updated if this one is not filtered
++        vbsl            d3, d5, d0              @ FFMIN(d, clip)
++        vbic            d0, d3, d1              @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmla.i16        d2, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vmls.i16        d4, d0, d16             @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vqmovun.s16     d1, q1                  @ saturate back to u8; updated P5 ends up in d1[0..3]
++        vqmovun.s16     d0, q2                  @ saturate back to u8; updated P4 ends up in d0[0..3]
++        vst2.8          {d0[0], d1[0]}, [r0], r1 @ write P4,P5 of row 0
++        vst2.8          {d0[1], d1[1]}, [r0], r1 @ row 1
++        vst2.8          {d0[2], d1[2]}, [r0], r1 @ row 2
++        vst2.8          {d0[3], d1[3]}, [r0]    @ row 3
++1:      bx              lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of lower block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter8_neon, export=1
++        sub             r3, r0, r1, lsl #2
++        vldr            d0, .Lcoeffs
++        vld1.32         {d1}, [r0 :64], r1      @ P5
++        vld1.32         {d2}, [r3 :64], r1      @ P1
++        vld1.32         {d3}, [r3 :64], r1      @ P2
++        vld1.32         {d4}, [r0 :64], r1      @ P6
++        vld1.32         {d5}, [r3 :64], r1      @ P3
++        vld1.32         {d6}, [r0 :64], r1      @ P7
++        vshll.u8        q8, d1, #1              @ 2*P5
++        vshll.u8        q9, d2, #1              @ 2*P1
++        vld1.32         {d7}, [r3 :64]          @ P4
++        vmovl.u8        q1, d3                  @ P2
++        vld1.32         {d20}, [r0 :64]         @ P8
++        vmovl.u8        q11, d4                 @ P6
++        vdup.16         q12, r2                 @ pq
++        vmovl.u8        q13, d5                 @ P3
++        vmls.i16        q9, q1, d0[1]           @ 2*P1-5*P2
++        vmovl.u8        q1, d6                  @ P7
++        vshll.u8        q2, d5, #1              @ 2*P3
++        vmls.i16        q8, q11, d0[1]          @ 2*P5-5*P6
++        vmovl.u8        q3, d7                  @ P4
++        vmovl.u8        q10, d20                @ P8
++        vmla.i16        q8, q1, d0[1]           @ 2*P5-5*P6+5*P7
++        vmovl.u8        q1, d1                  @ P5
++        vmla.i16        q9, q13, d0[1]          @ 2*P1-5*P2+5*P3
++        vsub.i16        q13, q3, q1             @ P4-P5
++        vmls.i16        q2, q3, d0[1]           @ 2*P3-5*P4
++        vmls.i16        q8, q10, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
++        vabs.s16        q10, q13
++        vshr.s16        q13, q13, #8            @ clip_sign
++        vmls.i16        q9, q3, d0[0]           @ 2*P1-5*P2+5*P3-2*P4
++        vshr.s16        q10, q10, #1            @ clip
++        vmla.i16        q2, q1, d0[1]           @ 2*P3-5*P4+5*P5
++        vrshr.s16       q8, q8, #3
++        vmls.i16        q2, q11, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
++        vceq.i16        q11, q10, #0            @ test clip == 0
++        vrshr.s16       q9, q9, #3
++        vabs.s16        q8, q8                  @ a2
++        vabs.s16        q9, q9                  @ a1
++        vrshr.s16       q2, q2, #3
++        vcge.s16        q14, q9, q8             @ test a1 >= a2
++        vabs.s16        q15, q2                 @ a0
++        vshr.s16        q2, q2, #8              @ a0_sign
++        vbsl            q14, q8, q9             @ a3
++        vcge.s16        q8, q15, q12            @ test a0 >= pq
++        vsub.i16        q2, q13, q2             @ clip_sign - a0_sign
++        vqsub.u16       q9, q15, q14            @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q12, q14, q15           @ test a3 >= a0
++        vorr            q8, q11, q8             @ test clip == 0 || a0 >= pq
++        vmul.i16        q0, q9, d0[1]           @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            q9, q8, q12             @ test clip == 0 || a0 >= pq || a3 >= a0
++        vshl.i64        q11, q9, #16
++        vmov.32         r0, d18[1]              @ move to gp reg
++        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vmov.32         r2, d19[1]
++        vshr.s64        q9, q11, #48
++        vcge.s16        q11, q0, q10
++        vorr            q8, q8, q9
++        and             r0, r0, r2
++        vbsl            q11, q10, q0            @ FFMIN(d, clip)
++        tst             r0, #1
++        bne             1f                      @ none of the 8 pixel pairs should be updated in this case
++        vbic            q0, q11, q8             @ set each d to zero if it should not be filtered
++        vmls.i16        q3, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vmla.i16        q1, q0, q2              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vqmovun.s16     d0, q3
++        vqmovun.s16     d1, q1
++        vst1.32         {d0}, [r3 :64], r1
++        vst1.32         {d1}, [r3 :64]
++1:      bx              lr
++endfunc
++
++.align  5
++.Lcoeffs:
++.quad   0x00050002                              @ loaded into d0 by the loop filters: 16-bit lanes d0[0] = 2, d0[1] = 5, d0[2] = d0[3] = 0
++
++@ VC-1 in-loop deblocking filter for 8 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of right block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter8_neon, export=1
++        push            {lr}                    @ lr (r14) is used as a scratch register below
++        sub             r3, r0, #4              @ where to start reading
++        vldr            d0, .Lcoeffs            @ filter coefficients: d0[0] = 2, d0[1] = 5
++        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
++        sub             r0, r0, #1              @ where to start writing
++        vld1.32         {d4}, [r3], r1          @ P1[1], P2[1]...
++        add             r12, r0, r1, lsl #2     @ where to start writing the second group of 4 rows
++        vld1.32         {d3}, [r3], r1          @ P1[2], P2[2]...
++        vld1.32         {d5}, [r3], r1          @ P1[3], P2[3]...
++        vld1.32         {d6}, [r3], r1          @ P1[4], P2[4]...
++        vld1.32         {d16}, [r3], r1         @ P1[5], P2[5]...
++        vld1.32         {d7}, [r3], r1          @ P1[6], P2[6]...
++        vld1.32         {d17}, [r3]             @ P1[7], P2[7]...
++        vtrn.8          q1, q2                  @ P1[0], P1[1], P3[0]... P1[2], P1[3], P3[2]... P2[0], P2[1], P4[0]... P2[2], P2[3], P4[2]...
++        vdup.16         q9, r2                  @ pq
++        vtrn.16         d2, d3                  @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++        vtrn.16         d4, d5                  @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++        vtrn.8          q3, q8                  @ P1[4], P1[5], P3[4]... P1[6], P1[7], P3[6]... P2[4], P2[5], P4[4]... P2[6], P2[7], P4[6]...
++        vtrn.16         d6, d7                  @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++        vtrn.16         d16, d17                @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++        vtrn.32         d2, d6                  @ P1, P5
++        vtrn.32         d4, d16                 @ P2, P6
++        vtrn.32         d3, d7                  @ P3, P7
++        vtrn.32         d5, d17                 @ P4, P8
++        vshll.u8        q10, d2, #1             @ 2*P1
++        vshll.u8        q11, d6, #1             @ 2*P5
++        vmovl.u8        q12, d4                 @ P2
++        vmovl.u8        q13, d16                @ P6
++        vmovl.u8        q14, d3                 @ P3
++        vmls.i16        q10, q12, d0[1]         @ 2*P1-5*P2
++        vmovl.u8        q12, d7                 @ P7
++        vshll.u8        q1, d3, #1              @ 2*P3
++        vmls.i16        q11, q13, d0[1]         @ 2*P5-5*P6
++        vmovl.u8        q2, d5                  @ P4
++        vmovl.u8        q8, d17                 @ P8
++        vmla.i16        q11, q12, d0[1]         @ 2*P5-5*P6+5*P7
++        vmovl.u8        q3, d6                  @ P5
++        vmla.i16        q10, q14, d0[1]         @ 2*P1-5*P2+5*P3
++        vsub.i16        q12, q2, q3             @ P4-P5
++        vmls.i16        q1, q2, d0[1]           @ 2*P3-5*P4
++        vmls.i16        q11, q8, d0[0]          @ 2*P5-5*P6+5*P7-2*P8
++        vabs.s16        q8, q12                 @ abs(P4-P5)
++        vshr.s16        q12, q12, #8            @ clip_sign
++        vmls.i16        q10, q2, d0[0]          @ 2*P1-5*P2+5*P3-2*P4
++        vshr.s16        q8, q8, #1              @ clip
++        vmla.i16        q1, q3, d0[1]           @ 2*P3-5*P4+5*P5
++        vrshr.s16       q11, q11, #3            @ (2*P5-5*P6+5*P7-2*P8+4)>>3
++        vmls.i16        q1, q13, d0[0]          @ 2*P3-5*P4+5*P5-2*P6
++        vceq.i16        q13, q8, #0             @ test clip == 0
++        vrshr.s16       q10, q10, #3            @ (2*P1-5*P2+5*P3-2*P4+4)>>3
++        vabs.s16        q11, q11                @ a2
++        vabs.s16        q10, q10                @ a1
++        vrshr.s16       q1, q1, #3              @ (2*P3-5*P4+5*P5-2*P6+4)>>3
++        vcge.s16        q14, q10, q11           @ test a1 >= a2
++        vabs.s16        q15, q1                 @ a0
++        vshr.s16        q1, q1, #8              @ a0_sign
++        vbsl            q14, q11, q10           @ a3
++        vcge.s16        q9, q15, q9             @ test a0 >= pq
++        vsub.i16        q1, q12, q1             @ clip_sign - a0_sign
++        vqsub.u16       q10, q15, q14           @ a0 >= a3 ? a0-a3 : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q11, q14, q15           @ test a3 >= a0
++        vorr            q9, q13, q9             @ test clip == 0 || a0 >= pq
++        vmul.i16        q0, q10, d0[1]          @ a0 >= a3 ? 5*(a0-a3) : 0
++        vorr            q10, q9, q11            @ test clip == 0 || a0 >= pq || a3 >= a0
++        vmov.32         r2, d20[1]              @ move to gp reg: 16-bit lanes 2-3 of the test, so bit 0 comes from lane 2
++        vshr.u16        q0, q0, #3              @ a0 >= a3 ? (5*(a0-a3))>>3 : 0
++        vmov.32         r3, d21[1]              @ likewise lanes 6-7 of the test, so bit 0 comes from lane 6
++        vcge.s16        q10, q0, q8             @ test d >= clip
++        and             r14, r2, r3             @ bit 0 is set only if the tests for both lane 2 and lane 6 are true
++        vbsl            q10, q8, q0             @ FFMIN(d, clip)
++        tst             r14, #1
++        bne             2f                      @ none of the 8 pixel pairs should be updated in this case
++        vbic            q0, q10, q9             @ set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmla.i16        q3, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
++        vmls.i16        q2, q0, q1              @ invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
++        vqmovun.s16     d1, q3                  @ saturate updated P5 to unsigned 8 bits
++        vqmovun.s16     d0, q2                  @ saturate updated P4 to unsigned 8 bits
++        tst             r2, #1                  @ test result for lane 2
++        bne             1f                      @ none of the first 4 pixel pairs should be updated if so
++        vst2.8          {d0[0], d1[0]}, [r0], r1        @ store P4[0], P5[0]
++        vst2.8          {d0[1], d1[1]}, [r0], r1        @ store P4[1], P5[1]
++        vst2.8          {d0[2], d1[2]}, [r0], r1        @ store P4[2], P5[2]
++        vst2.8          {d0[3], d1[3]}, [r0]            @ store P4[3], P5[3]
++1:      tst             r3, #1                  @ test result for lane 6
++        bne             2f                      @ none of the second 4 pixel pairs should be updated if so
++        vst2.8          {d0[4], d1[4]}, [r12], r1       @ store P4[4], P5[4]
++        vst2.8          {d0[5], d1[5]}, [r12], r1       @ store P4[5], P5[5]
++        vst2.8          {d0[6], d1[6]}, [r12], r1       @ store P4[6], P5[6]
++        vst2.8          {d0[7], d1[7]}, [r12]           @ store P4[7], P5[7]
++2:      pop             {pc}
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of vertically-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of lower block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_v_loop_filter16_neon, export=1
++        vpush           {d8-d15}                @ preserve d8-d15 (q4-q7), which are overwritten below
++        sub             r3, r0, r1, lsl #2      @ r3 -> P1 row, 4 rows above r0
++        vldr            d0, .Lcoeffs            @ filter coefficients: d0[0] = 2, d0[1] = 5
++        vld1.64         {q1}, [r0 :128], r1     @ P5
++        vld1.64         {q2}, [r3 :128], r1     @ P1
++        vld1.64         {q3}, [r3 :128], r1     @ P2
++        vld1.64         {q4}, [r0 :128], r1     @ P6
++        vld1.64         {q5}, [r3 :128], r1     @ P3
++        vld1.64         {q6}, [r0 :128], r1     @ P7
++        vshll.u8        q7, d2, #1              @ 2*P5[0..7]
++        vshll.u8        q8, d4, #1              @ 2*P1[0..7]
++        vld1.64         {q9}, [r3 :128]         @ P4
++        vmovl.u8        q10, d6                 @ P2[0..7]
++        vld1.64         {q11}, [r0 :128]        @ P8
++        vmovl.u8        q12, d8                 @ P6[0..7]
++        vdup.16         q13, r2                 @ pq
++        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
++        vmls.i16        q8, q10, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
++        vshll.u8        q10, d3, #1             @ 2*P5[8..15]
++        vmovl.u8        q3, d7                  @ P2[8..15]
++        vmls.i16        q7, q12, d0[1]          @ 2*P5[0..7]-5*P6[0..7]
++        vmovl.u8        q4, d9                  @ P6[8..15]
++        vmovl.u8        q14, d10                @ P3[0..7]
++        vmovl.u8        q15, d12                @ P7[0..7]
++        vmls.i16        q2, q3, d0[1]           @ 2*P1[8..15]-5*P2[8..15]
++        vshll.u8        q3, d10, #1             @ 2*P3[0..7]
++        vmls.i16        q10, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
++        vmovl.u8        q6, d13                 @ P7[8..15]
++        vmla.i16        q8, q14, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        vmovl.u8        q14, d18                @ P4[0..7]
++        vmovl.u8        q9, d19                 @ P4[8..15]
++        vmla.i16        q7, q15, d0[1]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        vmovl.u8        q15, d11                @ P3[8..15]
++        vshll.u8        q5, d11, #1             @ 2*P3[8..15]
++        vmls.i16        q3, q14, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
++        vmla.i16        q2, q15, d0[1]          @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        vmovl.u8        q15, d22                @ P8[0..7]
++        vmovl.u8        q11, d23                @ P8[8..15]
++        vmla.i16        q10, q6, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        vmovl.u8        q6, d2                  @ P5[0..7]
++        vmovl.u8        q1, d3                  @ P5[8..15]
++        vmls.i16        q5, q9, d0[1]           @ 2*P3[8..15]-5*P4[8..15]
++        vmls.i16        q8, q14, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        vmls.i16        q7, q15, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        vsub.i16        q15, q14, q6            @ P4[0..7]-P5[0..7]
++        vmla.i16        q3, q6, d0[1]           @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        vrshr.s16       q8, q8, #3              @ rounding shift: (x+4)>>3
++        vmls.i16        q2, q9, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        vrshr.s16       q7, q7, #3              @ rounding shift: (x+4)>>3
++        vmls.i16        q10, q11, d0[0]         @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        vabs.s16        q11, q15                @ abs(P4[0..7]-P5[0..7])
++        vabs.s16        q8, q8                  @ a1[0..7]
++        vmla.i16        q5, q1, d0[1]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        vshr.s16        q15, q15, #8            @ clip_sign[0..7]
++        vrshr.s16       q2, q2, #3              @ rounding shift: (x+4)>>3
++        vmls.i16        q3, q12, d0[0]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        vabs.s16        q7, q7                  @ a2[0..7]
++        vrshr.s16       q10, q10, #3            @ rounding shift: (x+4)>>3
++        vsub.i16        q12, q9, q1             @ P4[8..15]-P5[8..15]
++        vshr.s16        q11, q11, #1            @ clip[0..7]
++        vmls.i16        q5, q4, d0[0]           @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        vcge.s16        q4, q8, q7              @ test a1[0..7] >= a2[0..7]
++        vabs.s16        q2, q2                  @ a1[8..15]
++        vrshr.s16       q3, q3, #3              @ rounding shift: (x+4)>>3
++        vabs.s16        q10, q10                @ a2[8..15]
++        vbsl            q4, q7, q8              @ a3[0..7]
++        vabs.s16        q7, q12                 @ abs(P4[8..15]-P5[8..15])
++        vshr.s16        q8, q12, #8             @ clip_sign[8..15]
++        vrshr.s16       q5, q5, #3              @ rounding shift: (x+4)>>3
++        vcge.s16        q12, q2, q10            @ test a1[8..15] >= a2[8..15]
++        vshr.s16        q7, q7, #1              @ clip[8..15]
++        vbsl            q12, q10, q2            @ a3[8..15]
++        vabs.s16        q2, q3                  @ a0[0..7]
++        vceq.i16        q10, q11, #0            @ test clip[0..7] == 0
++        vshr.s16        q3, q3, #8              @ a0_sign[0..7]
++        vsub.i16        q3, q15, q3             @ clip_sign[0..7] - a0_sign[0..7]
++        vcge.s16        q15, q2, q13            @ test a0[0..7] >= pq
++        vorr            q10, q10, q15           @ test clip[0..7] == 0 || a0[0..7] >= pq
++        vqsub.u16       q15, q2, q4             @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q2, q4, q2              @ test a3[0..7] >= a0[0..7]
++        vabs.s16        q4, q5                  @ a0[8..15]
++        vshr.s16        q5, q5, #8              @ a0_sign[8..15]
++        vmul.i16        q15, q15, d0[1]         @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        vcge.s16        q13, q4, q13            @ test a0[8..15] >= pq
++        vorr            q2, q10, q2             @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        vsub.i16        q5, q8, q5              @ clip_sign[8..15] - a0_sign[8..15]
++        vceq.i16        q8, q7, #0              @ test clip[8..15] == 0
++        vshr.u16        q15, q15, #3            @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        vmov.32         r0, d4[1]               @ move to gp reg: test lanes 2-3, so bit 0 comes from lane 2
++        vorr            q8, q8, q13             @ test clip[8..15] == 0 || a0[8..15] >= pq
++        vqsub.u16       q13, q4, q12            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vmov.32         r2, d5[1]               @ test lanes 6-7, so bit 0 comes from lane 6
++        vcge.s16        q4, q12, q4             @ test a3[8..15] >= a0[8..15]
++        vshl.i64        q2, q2, #16             @ move test lanes 2 and 6 to the top 16 bits of their 64-bit halves
++        vcge.s16        q12, q15, q11           @ test d[0..7] >= clip[0..7]
++        vmul.i16        q0, q13, d0[1]          @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        vorr            q4, q8, q4              @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        vshr.s64        q2, q2, #48             @ sign-extend them, so each group of 4 lanes holds the test result of its 3rd lane
++        and             r0, r0, r2              @ bit 0 set only if the tests for lanes 2 and 6 are both true
++        vbsl            q12, q11, q15           @ FFMIN(d[0..7], clip[0..7])
++        vshl.i64        q11, q4, #16            @ move test lanes 10 and 14 to the top 16 bits of their 64-bit halves
++        vmov.32         r2, d8[1]               @ test lanes 10-11, so bit 0 comes from lane 10
++        vshr.u16        q0, q0, #3              @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        vorr            q2, q10, q2             @ test clip[0..7] == 0 || a0[0..7] >= pq || test true for 3rd lane of the group of 4
++        vmov.32         r12, d9[1]              @ test lanes 14-15, so bit 0 comes from lane 14
++        vshr.s64        q4, q11, #48            @ sign-extend, so each group of 4 lanes holds the test result of its 3rd lane
++        vcge.s16        q10, q0, q7             @ test d[8..15] >= clip[8..15]
++        vbic            q2, q12, q2             @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        vorr            q4, q8, q4              @ test clip[8..15] == 0 || a0[8..15] >= pq || test true for 3rd lane of the group of 4
++        and             r2, r2, r12             @ bit 0 set only if the tests for lanes 10 and 14 are both true
++        vbsl            q10, q7, q0             @ FFMIN(d[8..15], clip[8..15])
++        vmls.i16        q14, q2, q3             @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++        and             r0, r0, r2              @ bit 0 set only if the tests for lanes 2, 6, 10 and 14 are all true
++        vbic            q0, q10, q4             @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        tst             r0, #1
++        bne             1f                      @ none of the 16 pixel pairs should be updated in this case
++        vmla.i16        q6, q2, q3              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++        vmls.i16        q9, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++        vqmovun.s16     d4, q14                 @ saturate updated P4[0..7] to unsigned 8 bits
++        vmla.i16        q1, q0, q5              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++        vqmovun.s16     d0, q6                  @ saturate updated P5[0..7] to unsigned 8 bits
++        vqmovun.s16     d5, q9                  @ saturate updated P4[8..15] to unsigned 8 bits
++        vqmovun.s16     d1, q1                  @ saturate updated P5[8..15] to unsigned 8 bits
++        vst1.64         {q2}, [r3 :128], r1     @ store P4 (r3 was left pointing at the P4 row by the loads above)
++        vst1.64         {q0}, [r3 :128]         @ store P5
++1:      vpop            {d8-d15}
++        bx              lr
++endfunc
++
++@ VC-1 in-loop deblocking filter for 16 pixel pairs at boundary of horizontally-neighbouring blocks
++@ On entry:
++@   r0 -> top-left pel of right block
++@   r1 = row stride, bytes
++@   r2 = PQUANT bitstream parameter
++function ff_vc1_h_loop_filter16_neon, export=1
++        push            {r4-r6,lr}              @ r4-r6 and lr (r14) are used as scratch registers below
++        vpush           {d8-d15}                @ preserve d8-d15 (q4-q7), which are overwritten below
++        sub             r3, r0, #4              @ where to start reading
++        vldr            d0, .Lcoeffs            @ filter coefficients: d0[0] = 2, d0[1] = 5
++        vld1.32         {d2}, [r3], r1          @ P1[0], P2[0]...
++        sub             r0, r0, #1              @ where to start writing
++        vld1.32         {d3}, [r3], r1          @ P1[1], P2[1]...
++        add             r4, r0, r1, lsl #2      @ where to start writing rows 4..7
++        vld1.32         {d10}, [r3], r1         @ P1[2], P2[2]...
++        vld1.32         {d11}, [r3], r1         @ P1[3], P2[3]...
++        vld1.32         {d16}, [r3], r1         @ P1[4], P2[4]...
++        vld1.32         {d4}, [r3], r1          @ P1[5], P2[5]...
++        vld1.32         {d8}, [r3], r1          @ P1[6], P2[6]...
++        vtrn.8          d2, d3                  @ P1[0], P1[1], P3[0]... P2[0], P2[1], P4[0]...
++        vld1.32         {d14}, [r3], r1         @ P1[7], P2[7]...
++        vld1.32         {d5}, [r3], r1          @ P1[8], P2[8]...
++        vtrn.8          d10, d11                @ P1[2], P1[3], P3[2]... P2[2], P2[3], P4[2]...
++        vld1.32         {d6}, [r3], r1          @ P1[9], P2[9]...
++        vld1.32         {d12}, [r3], r1         @ P1[10], P2[10]...
++        vtrn.8          d16, d4                 @ P1[4], P1[5], P3[4]... P2[4], P2[5], P4[4]...
++        vld1.32         {d13}, [r3], r1         @ P1[11], P2[11]...
++        vtrn.16         d2, d10                 @ P1[0], P1[1], P1[2], P1[3], P5[0]... P3[0], P3[1], P3[2], P3[3], P7[0]...
++        vld1.32         {d1}, [r3], r1          @ P1[12], P2[12]...
++        vtrn.8          d8, d14                 @ P1[6], P1[7], P3[6]... P2[6], P2[7], P4[6]...
++        vld1.32         {d7}, [r3], r1          @ P1[13], P2[13]...
++        vtrn.16         d3, d11                 @ P2[0], P2[1], P2[2], P2[3], P6[0]... P4[0], P4[1], P4[2], P4[3], P8[0]...
++        vld1.32         {d9}, [r3], r1          @ P1[14], P2[14]...
++        vtrn.8          d5, d6                  @ P1[8], P1[9], P3[8]... P2[8], P2[9], P4[8]...
++        vld1.32         {d15}, [r3]             @ P1[15], P2[15]...
++        vtrn.16         d16, d8                 @ P1[4], P1[5], P1[6], P1[7], P5[4]... P3[4], P3[5], P3[6], P3[7], P7[4]...
++        vtrn.16         d4, d14                 @ P2[4], P2[5], P2[6], P2[7], P6[4]... P4[4], P4[5], P4[6], P4[7], P8[4]...
++        vtrn.8          d12, d13                @ P1[10], P1[11], P3[10]... P2[10], P2[11], P4[10]...
++        vdup.16         q9, r2                  @ pq
++        vtrn.8          d1, d7                  @ P1[12], P1[13], P3[12]... P2[12], P2[13], P4[12]...
++        vtrn.32         d2, d16                 @ P1[0..7], P5[0..7]
++        vtrn.16         d5, d12                 @ P1[8], P1[9], P1[10], P1[11], P5[8]... P3[8], P3[9], P3[10], P3[11], P7[8]...
++        vtrn.16         d6, d13                 @ P2[8], P2[9], P2[10], P2[11], P6[8]... P4[8], P4[9], P4[10], P4[11], P8[8]...
++        vtrn.8          d9, d15                 @ P1[14], P1[15], P3[14]... P2[14], P2[15], P4[14]...
++        vtrn.32         d3, d4                  @ P2[0..7], P6[0..7]
++        vshll.u8        q10, d2, #1             @ 2*P1[0..7]
++        vtrn.32         d10, d8                 @ P3[0..7], P7[0..7]
++        vshll.u8        q11, d16, #1            @ 2*P5[0..7]
++        vtrn.32         d11, d14                @ P4[0..7], P8[0..7]
++        vtrn.16         d1, d9                  @ P1[12], P1[13], P1[14], P1[15], P5[12]... P3[12], P3[13], P3[14], P3[15], P7[12]...
++        vtrn.16         d7, d15                 @ P2[12], P2[13], P2[14], P2[15], P6[12]... P4[12], P4[13], P4[14], P4[15], P8[12]...
++        vmovl.u8        q1, d3                  @ P2[0..7]
++        vmovl.u8        q12, d4                 @ P6[0..7]
++        vtrn.32         d5, d1                  @ P1[8..15], P5[8..15]
++        vtrn.32         d6, d7                  @ P2[8..15], P6[8..15]
++        vtrn.32         d12, d9                 @ P3[8..15], P7[8..15]
++        vtrn.32         d13, d15                @ P4[8..15], P8[8..15]
++        vmls.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]
++        vmovl.u8        q1, d10                 @ P3[0..7]
++        vshll.u8        q2, d5, #1              @ 2*P1[8..15]
++        vshll.u8        q13, d1, #1             @ 2*P5[8..15]
++        vmls.i16        q11, q12, d0[1]         @ 2*P5[0..7]-5*P6[0..7]
++        vmovl.u8        q14, d6                 @ P2[8..15]
++        vmovl.u8        q3, d7                  @ P6[8..15]
++        vmovl.u8        q15, d8                 @ P7[0..7]
++        vmla.i16        q10, q1, d0[1]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]
++        vmovl.u8        q1, d12                 @ P3[8..15]
++        vmls.i16        q2, q14, d0[1]          @ 2*P1[8..15]-5*P2[8..15]
++        vmovl.u8        q4, d9                  @ P7[8..15]
++        vshll.u8        q14, d10, #1            @ 2*P3[0..7]
++        vmls.i16        q13, q3, d0[1]          @ 2*P5[8..15]-5*P6[8..15]
++        vmovl.u8        q5, d11                 @ P4[0..7]
++        vmla.i16        q11, q15, d0[1]         @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
++        vshll.u8        q15, d12, #1            @ 2*P3[8..15]
++        vmovl.u8        q6, d13                 @ P4[8..15]
++        vmla.i16        q2, q1, d0[1]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
++        vmovl.u8        q1, d14                 @ P8[0..7]
++        vmovl.u8        q7, d15                 @ P8[8..15]
++        vmla.i16        q13, q4, d0[1]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]
++        vmovl.u8        q4, d16                 @ P5[0..7]
++        vmovl.u8        q8, d1                  @ P5[8..15]
++        vmls.i16        q14, q5, d0[1]          @ 2*P3[0..7]-5*P4[0..7]
++        vmls.i16        q15, q6, d0[1]          @ 2*P3[8..15]-5*P4[8..15]
++        vmls.i16        q10, q5, d0[0]          @ 2*P1[0..7]-5*P2[0..7]+5*P3[0..7]-2*P4[0..7]
++        vmls.i16        q11, q1, d0[0]          @ 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
++        vsub.i16        q1, q5, q4              @ P4[0..7]-P5[0..7]
++        vmls.i16        q2, q6, d0[0]           @ 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
++        vrshr.s16       q10, q10, #3            @ rounding shift: (x+4)>>3
++        vmls.i16        q13, q7, d0[0]          @ 2*P5[8..15]-5*P6[8..15]+5*P7[8..15]-2*P8[8..15]
++        vsub.i16        q7, q6, q8              @ P4[8..15]-P5[8..15]
++        vrshr.s16       q11, q11, #3            @ rounding shift: (x+4)>>3
++        vmla.s16        q14, q4, d0[1]          @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
++        vrshr.s16       q2, q2, #3              @ rounding shift: (x+4)>>3
++        vmla.i16        q15, q8, d0[1]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
++        vabs.s16        q10, q10                @ a1[0..7]
++        vrshr.s16       q13, q13, #3            @ rounding shift: (x+4)>>3
++        vmls.i16        q15, q3, d0[0]          @ 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
++        vabs.s16        q3, q11                 @ a2[0..7]
++        vabs.s16        q2, q2                  @ a1[8..15]
++        vmls.i16        q14, q12, d0[0]         @ 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]-2*P6[0..7]
++        vabs.s16        q11, q1                 @ abs(P4[0..7]-P5[0..7])
++        vabs.s16        q12, q13                @ a2[8..15]
++        vcge.s16        q13, q10, q3            @ test a1[0..7] >= a2[0..7]
++        vshr.s16        q1, q1, #8              @ clip_sign[0..7]
++        vrshr.s16       q15, q15, #3            @ rounding shift: (x+4)>>3
++        vshr.s16        q11, q11, #1            @ clip[0..7]
++        vrshr.s16       q14, q14, #3            @ rounding shift: (x+4)>>3
++        vbsl            q13, q3, q10            @ a3[0..7]
++        vcge.s16        q3, q2, q12             @ test a1[8..15] >= a2[8..15]
++        vabs.s16        q10, q15                @ a0[8..15]
++        vshr.s16        q15, q15, #8            @ a0_sign[8..15]
++        vbsl            q3, q12, q2             @ a3[8..15]
++        vabs.s16        q2, q14                 @ a0[0..7]
++        vabs.s16        q12, q7                 @ abs(P4[8..15]-P5[8..15])
++        vshr.s16        q7, q7, #8              @ clip_sign[8..15]
++        vshr.s16        q14, q14, #8            @ a0_sign[0..7]
++        vshr.s16        q12, q12, #1            @ clip[8..15]
++        vsub.i16        q7, q7, q15             @ clip_sign[8..15] - a0_sign[8..15]
++        vqsub.u16       q15, q10, q3            @ a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q3, q3, q10             @ test a3[8..15] >= a0[8..15]
++        vcge.s16        q10, q10, q9            @ test a0[8..15] >= pq
++        vcge.s16        q9, q2, q9              @ test a0[0..7] >= pq
++        vsub.i16        q1, q1, q14             @ clip_sign[0..7] - a0_sign[0..7]
++        vqsub.u16       q14, q2, q13            @ a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0  (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
++        vcge.s16        q2, q13, q2             @ test a3[0..7] >= a0[0..7]
++        vmul.i16        q13, q15, d0[1]         @ a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
++        vceq.i16        q15, q11, #0            @ test clip[0..7] == 0
++        vmul.i16        q0, q14, d0[1]          @ a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
++        vorr            q9, q15, q9             @ test clip[0..7] == 0 || a0[0..7] >= pq
++        vceq.i16        q14, q12, #0            @ test clip[8..15] == 0
++        vshr.u16        q13, q13, #3            @ a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
++        vorr            q2, q9, q2              @ test clip[0..7] == 0 || a0[0..7] >= pq || a3[0..7] >= a0[0..7]
++        vshr.u16        q0, q0, #3              @ a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
++        vorr            q10, q14, q10           @ test clip[8..15] == 0 || a0[8..15] >= pq
++        vcge.s16        q14, q13, q12           @ test d[8..15] >= clip[8..15]
++        vmov.32         r2, d4[1]               @ move to gp reg: test lanes 2-3, so bit 0 comes from lane 2
++        vorr            q3, q10, q3             @ test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
++        vmov.32         r3, d5[1]               @ test lanes 6-7, so bit 0 comes from lane 6
++        vcge.s16        q2, q0, q11             @ test d[0..7] >= clip[0..7]
++        vbsl            q14, q12, q13           @ FFMIN(d[8..15], clip[8..15])
++        vbsl            q2, q11, q0             @ FFMIN(d[0..7], clip[0..7])
++        vmov.32         r5, d6[1]               @ test lanes 10-11, so bit 0 comes from lane 10
++        vbic            q0, q14, q10            @ set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmov.32         r6, d7[1]               @ test lanes 14-15, so bit 0 comes from lane 14
++        and             r12, r2, r3             @ bit 0 set only if the tests for lanes 2 and 6 are both true
++        vbic            q2, q2, q9              @ set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
++        vmls.i16        q6, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P4[8..15]
++        vmls.i16        q5, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P4[0..7]
++        and             r14, r5, r6             @ bit 0 set only if the tests for lanes 10 and 14 are both true
++        vmla.i16        q4, q2, q1              @ invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5[0..7]
++        and             r12, r12, r14           @ bit 0 set only if the tests for lanes 2, 6, 10 and 14 are all true
++        vqmovun.s16     d4, q6                  @ saturate updated P4[8..15] to unsigned 8 bits
++        vmla.i16        q8, q0, q7              @ invert d[8..15] depending on clip_sign[8..15] & a0_sign[8..15], or zero it if they match, and accumulate into P5[8..15]
++        tst             r12, #1
++        bne             4f                      @ none of the 16 pixel pairs should be updated in this case
++        vqmovun.s16     d2, q5                  @ saturate updated P4[0..7] to unsigned 8 bits
++        vqmovun.s16     d3, q4                  @ saturate updated P5[0..7] to unsigned 8 bits
++        vqmovun.s16     d5, q8                  @ saturate updated P5[8..15] to unsigned 8 bits
++        tst             r2, #1                  @ test result for lane 2
++        bne             1f                      @ none of pixel pairs 0..3 should be updated if so
++        vst2.8          {d2[0], d3[0]}, [r0], r1        @ store P4[0], P5[0]
++        vst2.8          {d2[1], d3[1]}, [r0], r1        @ store P4[1], P5[1]
++        vst2.8          {d2[2], d3[2]}, [r0], r1        @ store P4[2], P5[2]
++        vst2.8          {d2[3], d3[3]}, [r0]            @ store P4[3], P5[3]
++1:      add             r0, r4, r1, lsl #2      @ where to start writing rows 8..11
++        tst             r3, #1                  @ test result for lane 6
++        bne             2f                      @ none of pixel pairs 4..7 should be updated if so
++        vst2.8          {d2[4], d3[4]}, [r4], r1        @ store P4[4], P5[4]
++        vst2.8          {d2[5], d3[5]}, [r4], r1        @ store P4[5], P5[5]
++        vst2.8          {d2[6], d3[6]}, [r4], r1        @ store P4[6], P5[6]
++        vst2.8          {d2[7], d3[7]}, [r4]            @ store P4[7], P5[7]
++2:      add             r4, r0, r1, lsl #2      @ where to start writing rows 12..15
++        tst             r5, #1                  @ test result for lane 10
++        bne             3f                      @ none of pixel pairs 8..11 should be updated if so
++        vst2.8          {d4[0], d5[0]}, [r0], r1        @ store P4[8], P5[8]
++        vst2.8          {d4[1], d5[1]}, [r0], r1        @ store P4[9], P5[9]
++        vst2.8          {d4[2], d5[2]}, [r0], r1        @ store P4[10], P5[10]
++        vst2.8          {d4[3], d5[3]}, [r0]            @ store P4[11], P5[11]
++3:      tst             r6, #1                  @ test result for lane 14
++        bne             4f                      @ none of pixel pairs 12..15 should be updated if so
++        vst2.8          {d4[4], d5[4]}, [r4], r1        @ store P4[12], P5[12]
++        vst2.8          {d4[5], d5[5]}, [r4], r1        @ store P4[13], P5[13]
++        vst2.8          {d4[6], d5[6]}, [r4], r1        @ store P4[14], P5[14]
++        vst2.8          {d4[7], d5[7]}, [r4]            @ store P4[15], P5[15]
++4:      vpop            {d8-d15}
++        pop             {r4-r6,pc}
++endfunc
++
++@ Copy at most the specified number of bytes from source to destination buffer,
++@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence (bytes 00 00 03 followed by a byte in the range 00..03)
++@ On entry:
++@   r0 -> source buffer
++@   r1 = max number of bytes to copy
++@   r2 -> destination buffer, optimally 8-byte aligned
++@ On exit:
++@   r0 = number of bytes not copied (r1-r3, r12, q0, q1, q3 and q8-q15 are overwritten)
++function ff_vc1_unescape_buffer_helper_neon, export=1
++        @ Offset by 48 to screen out cases that are too short for us to handle,
++        @ and also make it easy to test for loop termination, or to determine
++        @ whether we need an odd number of half-iterations of the loop.
++        subs    r1, r1, #48
++        bmi     90f                     @ fewer than 48 bytes: copy nothing, return the original count
++
++        @ Set up useful constants: a 32-bit word matches when (word BIC q0) EOR q1 is zero
++        vmov.i32        q0, #0x3000000  @ mask for the low 2 bits of the 4th byte of each word
++        vmov.i32        q1, #0x30000    @ pattern: bytes 00 00 03 00 in load order
++
++        tst             r1, #16         @ bit 4 of (count - 48) selects the loop entry point
++        bne             1f
++
++          vld1.8          {q8, q9}, [r0]!       @ bit clear: start with 32 bytes in q8:q9
++          vbic            q12, q8, q0           @ q12-q15: test q8 at byte offsets 0, 1, 2 and 3
++          vext.8          q13, q8, q9, #1
++          vext.8          q14, q8, q9, #2
++          vext.8          q15, q8, q9, #3
++          veor            q12, q12, q1
++          vbic            q13, q13, q0
++          vbic            q14, q14, q0
++          vbic            q15, q15, q0
++          vceq.i32        q12, q12, #0          @ lane becomes all ones where an escape sequence starts
++          veor            q13, q13, q1
++          veor            q14, q14, q1
++          veor            q15, q15, q1
++          vceq.i32        q13, q13, #0
++          vceq.i32        q14, q14, #0
++          vceq.i32        q15, q15, #0
++          add             r1, r1, #16           @ bias the count: the first half-iteration is skipped on this path
++          b               3f                    @ enter the loop at its second half
++
++1:      vld1.8          {q10, q11}, [r0]!       @ bit set: start with 32 bytes in q10:q11
++        vbic            q12, q10, q0            @ q12-q15: test q10 at byte offsets 0, 1, 2 and 3
++        vext.8          q13, q10, q11, #1
++        vext.8          q14, q10, q11, #2
++        vext.8          q15, q10, q11, #3
++        veor            q12, q12, q1
++        vbic            q13, q13, q0
++        vbic            q14, q14, q0
++        vbic            q15, q15, q0
++        vceq.i32        q12, q12, #0
++        veor            q13, q13, q1
++        veor            q14, q14, q1
++        veor            q15, q15, q1
++        vceq.i32        q13, q13, #0
++        vceq.i32        q14, q14, #0
++        vceq.i32        q15, q15, #0
++        @ Drop through...
++2:        vmov            q8, q11               @ first half: finish testing q10 while starting on q8 (indent marks the block an instruction works on)
++          vld1.8          {q9}, [r0]!           @ 16 more bytes, needed as lookahead for q8
++        vorr            q13, q12, q13           @ merge the four match masks computed for q10
++        vorr            q15, q14, q15
++          vbic            q12, q8, q0
++        vorr            q3, q13, q15
++          vext.8          q13, q8, q9, #1
++          vext.8          q14, q8, q9, #2
++          vext.8          q15, q8, q9, #3
++          veor            q12, q12, q1
++        vorr            d6, d6, d7              @ fold 128 bits of mask down to 64
++          vbic            q13, q13, q0
++          vbic            q14, q14, q0
++          vbic            q15, q15, q0
++          vceq.i32        q12, q12, #0
++        vmov            r3, r12, d6             @ move the folded mask to core registers
++          veor            q13, q13, q1
++          veor            q14, q14, q1
++          veor            q15, q15, q1
++          vceq.i32        q13, q13, #0
++          vceq.i32        q14, q14, #0
++          vceq.i32        q15, q15, #0
++        orrs            r3, r3, r12
++        bne             90f                     @ escape sequence starts in q10: stop without storing it
++        vst1.64         {q10}, [r2]!            @ q10 is clean: copy it to the destination
++3:          vmov            q10, q9             @ second half: finish testing q8 while starting on q10
++            vld1.8          {q11}, [r0]!        @ 16 more bytes, needed as lookahead for q10
++          vorr            q13, q12, q13         @ merge the four match masks computed for q8
++          vorr            q15, q14, q15
++            vbic            q12, q10, q0
++          vorr            q3, q13, q15
++            vext.8          q13, q10, q11, #1
++            vext.8          q14, q10, q11, #2
++            vext.8          q15, q10, q11, #3
++            veor            q12, q12, q1
++          vorr            d6, d6, d7
++            vbic            q13, q13, q0
++            vbic            q14, q14, q0
++            vbic            q15, q15, q0
++            vceq.i32        q12, q12, #0
++          vmov            r3, r12, d6
++            veor            q13, q13, q1
++            veor            q14, q14, q1
++            veor            q15, q15, q1
++            vceq.i32        q13, q13, #0
++            vceq.i32        q14, q14, #0
++            vceq.i32        q15, q15, #0
++          orrs            r3, r3, r12
++          bne             91f                   @ escape sequence starts in q8: stop without storing it
++          vst1.64         {q8}, [r2]!           @ q8 is clean: copy it to the destination
++        subs            r1, r1, #32             @ one full iteration handles 32 bytes
++        bpl             2b
++
++90:     add             r0, r1, #48             @ undo the initial offset to get the bytes not copied
++        bx              lr
++
++91:     sub             r1, r1, #16             @ exit from the second half: 16 bytes not yet deducted from r1 (stored in the first half, or the entry bias)
++        b               90b
++endfunc
+diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
+index 8a71c04230..53644506e5 100644
+--- a/libavcodec/avcodec.h
++++ b/libavcodec/avcodec.h
+@@ -2595,6 +2595,17 @@ typedef struct AVHWAccel {
+      * that avctx->hwaccel_priv_data is invalid.
+      */
+     int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++
++    /**
++     * Called if parsing fails
++     *
++     * An error has occurred, end_frame will not be called
++     * start_frame & decode_slice may or may not have been called
++     * Optional
++     *
++     * @param avctx the codec context
++     */
++    void (*abort_frame)(AVCodecContext *avctx);
+ } AVHWAccel;
+ 
+ /**
+diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
+index 38d06b2842..bbf5d70560 100644
+--- a/libavcodec/cabac.h
++++ b/libavcodec/cabac.h
+@@ -44,6 +44,10 @@ typedef struct CABACContext{
+     const uint8_t *bytestream_start;
+     const uint8_t *bytestream;
+     const uint8_t *bytestream_end;
++    struct {
++        uint16_t bits;
++        uint16_t range;
++    } by22;
+ }CABACContext;
+ 
+ int ff_init_cabac_decoder(CABACContext *c, const uint8_t *buf, int buf_size);
+diff --git a/libavcodec/codec.h b/libavcodec/codec.h
+index 50a22f6e3c..5acf572ef4 100644
+--- a/libavcodec/codec.h
++++ b/libavcodec/codec.h
+@@ -367,6 +367,17 @@ const AVCodec *av_codec_iterate(void **opaque);
+  */
+ AVCodec *avcodec_find_decoder(enum AVCodecID id);
+ 
++/**
++ * Find a registered decoder with a matching codec ID and pix_fmt.
++ * A decoder with pix_fmt set to NULL will match any fmt.
++ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL.
++ *
++ * @param id AVCodecID of the requested decoder
++ * @param fmt AVPixelFormat that must be supported by the decoder
++ * @return A decoder if one was found, NULL otherwise.
++ */
++AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
++
+ /**
+  * Find a registered decoder with the specified name.
+  *
+diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h
+new file mode 100644
+index 0000000000..72cbba0953
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v1.h
+@@ -0,0 +1,229 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_MPEG_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_MPEG_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_MPEG_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_MPEG_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_MPEG_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_MPEG_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B	0
++#define V4L2_HEVC_SLICE_TYPE_P	1
++#define V4L2_HEVC_SLICE_TYPE_I	2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++	__u16	pic_width_in_luma_samples;
++	__u16	pic_height_in_luma_samples;
++	__u8	bit_depth_luma_minus8;
++	__u8	bit_depth_chroma_minus8;
++	__u8	log2_max_pic_order_cnt_lsb_minus4;
++	__u8	sps_max_dec_pic_buffering_minus1;
++	__u8	sps_max_num_reorder_pics;
++	__u8	sps_max_latency_increase_plus1;
++	__u8	log2_min_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_luma_coding_block_size;
++	__u8	log2_min_luma_transform_block_size_minus2;
++	__u8	log2_diff_max_min_luma_transform_block_size;
++	__u8	max_transform_hierarchy_depth_inter;
++	__u8	max_transform_hierarchy_depth_intra;
++	__u8	pcm_sample_bit_depth_luma_minus1;
++	__u8	pcm_sample_bit_depth_chroma_minus1;
++	__u8	log2_min_pcm_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
++	__u8	num_short_term_ref_pic_sets;
++	__u8	num_long_term_ref_pics_sps;
++	__u8	chroma_format_idc;
++	__u8	sps_max_sub_layers_minus1;
++
++	__u64	flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++
++struct v4l2_ctrl_hevc_pps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++	__u8	num_extra_slice_header_bits;
++	__s8	init_qp_minus26;
++	__u8	diff_cu_qp_delta_depth;
++	__s8	pps_cb_qp_offset;
++	__s8	pps_cr_qp_offset;
++	__u8	num_tile_columns_minus1;
++	__u8	num_tile_rows_minus1;
++	__u8	column_width_minus1[20];
++	__u8	row_height_minus1[22];
++	__s8	pps_beta_offset_div2;
++	__s8	pps_tc_offset_div2;
++	__u8	log2_parallel_merge_level_minus2;
++
++	__u8	padding[4];
++	__u64	flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
++
++struct v4l2_hevc_dpb_entry {
++	__u64	timestamp;
++	__u8	rps;
++	__u8	field_pic;
++	__u16	pic_order_cnt[2];
++	__u8	padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__u8	padding[6];
++
++	__u8	luma_log2_weight_denom;
++	__s8	delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++	__u32	bit_size;
++	__u32	data_bit_offset;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u32	slice_segment_addr;
++	__u32	num_entry_point_offsets;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++	__u8	nal_unit_type;
++	__u8	nuh_temporal_id_plus1;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	slice_type;
++	__u8	colour_plane_id;
++	__u16	slice_pic_order_cnt;
++	__u8	num_ref_idx_l0_active_minus1;
++	__u8	num_ref_idx_l1_active_minus1;
++	__u8	collocated_ref_idx;
++	__u8	five_minus_max_num_merge_cand;
++	__s8	slice_qp_delta;
++	__s8	slice_cb_qp_offset;
++	__s8	slice_cr_qp_offset;
++	__s8	slice_act_y_qp_offset;
++	__s8	slice_act_cb_qp_offset;
++	__s8	slice_act_cr_qp_offset;
++	__s8	slice_beta_offset_div2;
++	__s8	slice_tc_offset_div2;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++	__u8	pic_struct;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	num_active_dpb_entries;
++	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++	__u8	num_rps_poc_st_curr_before;
++	__u8	num_rps_poc_st_curr_after;
++	__u8	num_rps_poc_lt_curr;
++
++	__u8	padding;
++
++	__u32	entry_point_offset_minus1[256];
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++	struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++	__u64	flags;
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++	__u8	scaling_list_4x4[6][16];
++	__u8	scaling_list_8x8[6][64];
++	__u8	scaling_list_16x16[6][64];
++	__u8	scaling_list_32x32[2][64];
++	__u8	scaling_list_dc_coef_16x16[6];
++	__u8	scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h
+new file mode 100644
+index 0000000000..7cbbbf055f
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v2.h
+@@ -0,0 +1,257 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B	0
++#define V4L2_HEVC_SLICE_TYPE_P	1
++#define V4L2_HEVC_SLICE_TYPE_I	2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++	__u16	pic_width_in_luma_samples;
++	__u16	pic_height_in_luma_samples;
++	__u8	bit_depth_luma_minus8;
++	__u8	bit_depth_chroma_minus8;
++	__u8	log2_max_pic_order_cnt_lsb_minus4;
++	__u8	sps_max_dec_pic_buffering_minus1;
++	__u8	sps_max_num_reorder_pics;
++	__u8	sps_max_latency_increase_plus1;
++	__u8	log2_min_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_luma_coding_block_size;
++	__u8	log2_min_luma_transform_block_size_minus2;
++	__u8	log2_diff_max_min_luma_transform_block_size;
++	__u8	max_transform_hierarchy_depth_inter;
++	__u8	max_transform_hierarchy_depth_intra;
++	__u8	pcm_sample_bit_depth_luma_minus1;
++	__u8	pcm_sample_bit_depth_chroma_minus1;
++	__u8	log2_min_pcm_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
++	__u8	num_short_term_ref_pic_sets;
++	__u8	num_long_term_ref_pics_sps;
++	__u8	chroma_format_idc;
++	__u8	sps_max_sub_layers_minus1;
++
++	__u64	flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++	__u8	num_extra_slice_header_bits;
++	__u8	num_ref_idx_l0_default_active_minus1;
++	__u8	num_ref_idx_l1_default_active_minus1;
++	__s8	init_qp_minus26;
++	__u8	diff_cu_qp_delta_depth;
++	__s8	pps_cb_qp_offset;
++	__s8	pps_cr_qp_offset;
++	__u8	num_tile_columns_minus1;
++	__u8	num_tile_rows_minus1;
++	__u8	column_width_minus1[20];
++	__u8	row_height_minus1[22];
++	__s8	pps_beta_offset_div2;
++	__s8	pps_tc_offset_div2;
++	__u8	log2_parallel_merge_level_minus2;
++
++	__u8	padding[4];
++	__u64	flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
++#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
++#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
++
++struct v4l2_hevc_dpb_entry {
++	__u64	timestamp;
++	__u8	rps;
++	__u8	field_pic;
++	__u16	pic_order_cnt[2];
++	__u8	padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__u8	padding[6];
++
++	__u8	luma_log2_weight_denom;
++	__s8	delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++	__u32	bit_size;
++	__u32	data_bit_offset;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u32	slice_segment_addr;
++	__u32	num_entry_point_offsets;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++	__u8	nal_unit_type;
++	__u8	nuh_temporal_id_plus1;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	slice_type;
++	__u8	colour_plane_id;
++	__u16	slice_pic_order_cnt;
++	__u8	num_ref_idx_l0_active_minus1;
++	__u8	num_ref_idx_l1_active_minus1;
++	__u8	collocated_ref_idx;
++	__u8	five_minus_max_num_merge_cand;
++	__s8	slice_qp_delta;
++	__s8	slice_cb_qp_offset;
++	__s8	slice_cr_qp_offset;
++	__s8	slice_act_y_qp_offset;
++	__s8	slice_act_cb_qp_offset;
++	__s8	slice_act_cr_qp_offset;
++	__s8	slice_beta_offset_div2;
++	__s8	slice_tc_offset_div2;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++	__u8	pic_struct;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++	__u8	padding[5];
++
++	__u32	entry_point_offset_minus1[256];
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++	struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++	__u64	flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
++
++struct v4l2_ctrl_hevc_decode_params {
++	__s32	pic_order_cnt_val;
++	__u8	num_active_dpb_entries;
++	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	num_poc_st_curr_before;
++	__u8	num_poc_st_curr_after;
++	__u8	num_poc_lt_curr;
++	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u64	flags;
++};
++
++/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the amount of data (in bits) to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++	__u8	scaling_list_4x4[6][16];
++	__u8	scaling_list_8x8[6][64];
++	__u8	scaling_list_16x16[6][64];
++	__u8	scaling_list_32x32[2][64];
++	__u8	scaling_list_dc_coef_16x16[6];
++	__u8	scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h
+new file mode 100644
+index 0000000000..4e35bd583d
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v3.h
+@@ -0,0 +1,255 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * These are the HEVC state controls for use with stateless HEVC
++ * codec drivers.
++ *
++ * It turns out that these structs are not stable yet and will undergo
++ * more changes. So keep them private until they are stable and ready to
++ * become part of the official public API.
++ */
++
++#ifndef _HEVC_CTRLS_H_
++#define _HEVC_CTRLS_H_
++
++#include <linux/videodev2.h>
++
++/* The pixel format isn't stable at the moment and will likely be renamed. */
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
++#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
++#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
++#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
++#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
++
++/* enum v4l2_ctrl_type type values */
++#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
++#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
++#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
++#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
++#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
++
++enum v4l2_mpeg_video_hevc_decode_mode {
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
++	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_mpeg_video_hevc_start_code {
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
++	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B	0
++#define V4L2_HEVC_SLICE_TYPE_P	1
++#define V4L2_HEVC_SLICE_TYPE_I	2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
++
++/* The controls are not stable at the moment and will likely be reworked. */
++struct v4l2_ctrl_hevc_sps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++	__u16	pic_width_in_luma_samples;
++	__u16	pic_height_in_luma_samples;
++	__u8	bit_depth_luma_minus8;
++	__u8	bit_depth_chroma_minus8;
++	__u8	log2_max_pic_order_cnt_lsb_minus4;
++	__u8	sps_max_dec_pic_buffering_minus1;
++	__u8	sps_max_num_reorder_pics;
++	__u8	sps_max_latency_increase_plus1;
++	__u8	log2_min_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_luma_coding_block_size;
++	__u8	log2_min_luma_transform_block_size_minus2;
++	__u8	log2_diff_max_min_luma_transform_block_size;
++	__u8	max_transform_hierarchy_depth_inter;
++	__u8	max_transform_hierarchy_depth_intra;
++	__u8	pcm_sample_bit_depth_luma_minus1;
++	__u8	pcm_sample_bit_depth_chroma_minus1;
++	__u8	log2_min_pcm_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
++	__u8	num_short_term_ref_pic_sets;
++	__u8	num_long_term_ref_pics_sps;
++	__u8	chroma_format_idc;
++	__u8	sps_max_sub_layers_minus1;
++
++	__u64	flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
++
++struct v4l2_ctrl_hevc_pps {
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++	__u8	num_extra_slice_header_bits;
++	__u8	num_ref_idx_l0_default_active_minus1;
++	__u8	num_ref_idx_l1_default_active_minus1;
++	__s8	init_qp_minus26;
++	__u8	diff_cu_qp_delta_depth;
++	__s8	pps_cb_qp_offset;
++	__s8	pps_cr_qp_offset;
++	__u8	num_tile_columns_minus1;
++	__u8	num_tile_rows_minus1;
++	__u8	column_width_minus1[20];
++	__u8	row_height_minus1[22];
++	__s8	pps_beta_offset_div2;
++	__s8	pps_tc_offset_div2;
++	__u8	log2_parallel_merge_level_minus2;
++
++	__u8	padding[4];
++	__u64	flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
++
++struct v4l2_hevc_dpb_entry {
++	__u64	timestamp;
++	__u8	flags;
++	__u8	field_pic;
++	__u16	pic_order_cnt[2];
++	__u8	padding[2];
++};
++
++struct v4l2_hevc_pred_weight_table {
++	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__u8	padding[6];
++
++	__u8	luma_log2_weight_denom;
++	__s8	delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
++
++struct v4l2_ctrl_hevc_slice_params {
++	__u32	bit_size;
++	__u32	data_bit_offset;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u32	slice_segment_addr;
++	__u32	num_entry_point_offsets;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++	__u8	nal_unit_type;
++	__u8	nuh_temporal_id_plus1;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	slice_type;
++	__u8	colour_plane_id;
++	__u16	slice_pic_order_cnt;
++	__u8	num_ref_idx_l0_active_minus1;
++	__u8	num_ref_idx_l1_active_minus1;
++	__u8	collocated_ref_idx;
++	__u8	five_minus_max_num_merge_cand;
++	__s8	slice_qp_delta;
++	__s8	slice_cb_qp_offset;
++	__s8	slice_cr_qp_offset;
++	__s8	slice_act_y_qp_offset;
++	__s8	slice_act_cb_qp_offset;
++	__s8	slice_act_cr_qp_offset;
++	__s8	slice_beta_offset_div2;
++	__s8	slice_tc_offset_div2;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++	__u8	pic_struct;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++
++	__u8	padding[5];
++
++	__u32	entry_point_offset_minus1[256];
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++	struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++	__u64	flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
++
++struct v4l2_ctrl_hevc_decode_params {
++	__s32	pic_order_cnt_val;
++	__u8	num_active_dpb_entries;
++	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	num_poc_st_curr_before;
++	__u8	num_poc_st_curr_after;
++	__u8	num_poc_lt_curr;
++	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u64	flags;
++};
++
++struct v4l2_ctrl_hevc_scaling_matrix {
++	__u8	scaling_list_4x4[6][16];
++	__u8	scaling_list_8x8[6][64];
++	__u8	scaling_list_16x16[6][64];
++	__u8	scaling_list_32x32[2][64];
++	__u8	scaling_list_dc_coef_16x16[6];
++	__u8	scaling_list_dc_coef_32x32[2];
++};
++
++/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
++#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
++/*
++ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
++ * the amount of data (in bits) to skip in the
++ * slice segment header.
++ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
++ * to before syntax element "slice_temporal_mvp_enabled_flag".
++ * If IDR, the skipped bits are just "pic_output_flag"
++ * (separate_colour_plane_flag is not supported).
++ */
++#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
++
++#endif
+diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
+new file mode 100644
+index 0000000000..c02fdbe5a8
+--- /dev/null
++++ b/libavcodec/hevc-ctrls-v4.h
+@@ -0,0 +1,524 @@
++/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
++/*
++ *  Video for Linux Two controls header file
++ *
++ *  Copyright (C) 1999-2012 the contributors
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License as published by
++ *  the Free Software Foundation; either version 2 of the License, or
++ *  (at your option) any later version.
++ *
++ *  This program is distributed in the hope that it will be useful,
++ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
++ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *  GNU General Public License for more details.
++ *
++ *  Alternatively you can redistribute this file under the terms of the
++ *  BSD license as stated below:
++ *
++ *  Redistribution and use in source and binary forms, with or without
++ *  modification, are permitted provided that the following conditions
++ *  are met:
++ *  1. Redistributions of source code must retain the above copyright
++ *     notice, this list of conditions and the following disclaimer.
++ *  2. Redistributions in binary form must reproduce the above copyright
++ *     notice, this list of conditions and the following disclaimer in
++ *     the documentation and/or other materials provided with the
++ *     distribution.
++ *  3. The names of its contributors may not be used to endorse or promote
++ *     products derived from this software without specific prior written
++ *     permission.
++ *
++ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
++ *  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
++ *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
++ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
++ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ *
++ *  The contents of this header were split off from videodev2.h. All control
++ *  definitions should be added to this header, which is included by
++ *  videodev2.h.
++ */
++
++#ifndef AVCODEC_HEVC_CTRLS_V4_H
++#define AVCODEC_HEVC_CTRLS_V4_H
++
++#include <linux/const.h>
++#include <linux/types.h>
++
++#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS
++#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000	/* Stateless codecs controls */
++#endif
++#ifndef V4L2_CID_CODEC_STATELESS_BASE
++#define V4L2_CID_CODEC_STATELESS_BASE		(V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900)
++#endif
++
++#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
++
++#define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_STATELESS_BASE + 400)
++#define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_STATELESS_BASE + 401)
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 402)
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_STATELESS_BASE + 403)
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 404)
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_STATELESS_BASE + 405)
++#define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_STATELESS_BASE + 406)
++#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
++
++enum v4l2_stateless_hevc_decode_mode {
++	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
++	V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
++};
++
++enum v4l2_stateless_hevc_start_code {
++	V4L2_STATELESS_HEVC_START_CODE_NONE,
++	V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
++};
++
++#define V4L2_HEVC_SLICE_TYPE_B	0
++#define V4L2_HEVC_SLICE_TYPE_P	1
++#define V4L2_HEVC_SLICE_TYPE_I	2
++
++#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
++#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
++#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
++#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
++#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
++#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
++#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
++#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
++#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
++
++/**
++ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
++ *
++ * @video_parameter_set_id: specifies the value of the
++ *			vps_video_parameter_set_id of the active VPS
++ * @seq_parameter_set_id: provides an identifier for the SPS for
++ *			  reference by other syntax elements
++ * @pic_width_in_luma_samples:	specifies the width of each decoded picture
++ *				in units of luma samples
++ * @pic_height_in_luma_samples: specifies the height of each decoded picture
++ *				in units of luma samples
++ * @bit_depth_luma_minus8: this value plus 8 specifies the bit depth of the
++ *                         samples of the luma array
++ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
++ *                           samples of the chroma arrays
++ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
++ *                                     the variable MaxPicOrderCntLsb
++ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
++ *                                    required size of the decoded picture
++ *                                    buffer for the codec video sequence
++ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
++ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
++ *				    value of SpsMaxLatencyPictures array
++ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum
++ *					    luma coding block size
++ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
++ *					      the maximum and minimum luma
++ *					      coding block size
++ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma
++ *					       transform block size
++ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
++ *						 the maximum and minimum luma
++ *						 transform block size
++ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
++ *					 depth for transform units of
++ *					 coding units coded in inter
++ *					 prediction mode
++ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
++ *					 depth for transform units of
++ *					 coding units coded in intra
++ *					 prediction mode
++ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
++ *                                    bits used to represent each of PCM sample
++ *                                    values of the luma component
++ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
++ *                                      of bits used to represent each of PCM
++ *                                      sample values of the chroma components
++ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
++ *                                              minimum size of coding blocks
++ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
++ *						  the maximum and minimum size of
++ *						  coding blocks
++ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
++ *				 syntax structures included in the SPS
++ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
++ *				reference pictures that are specified in the SPS
++ * @chroma_format_idc: specifies the chroma sampling
++ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
++ *                             of temporal sub-layers
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_sps {
++	__u8	video_parameter_set_id;
++	__u8	seq_parameter_set_id;
++	__u16	pic_width_in_luma_samples;
++	__u16	pic_height_in_luma_samples;
++	__u8	bit_depth_luma_minus8;
++	__u8	bit_depth_chroma_minus8;
++	__u8	log2_max_pic_order_cnt_lsb_minus4;
++	__u8	sps_max_dec_pic_buffering_minus1;
++	__u8	sps_max_num_reorder_pics;
++	__u8	sps_max_latency_increase_plus1;
++	__u8	log2_min_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_luma_coding_block_size;
++	__u8	log2_min_luma_transform_block_size_minus2;
++	__u8	log2_diff_max_min_luma_transform_block_size;
++	__u8	max_transform_hierarchy_depth_inter;
++	__u8	max_transform_hierarchy_depth_intra;
++	__u8	pcm_sample_bit_depth_luma_minus1;
++	__u8	pcm_sample_bit_depth_chroma_minus1;
++	__u8	log2_min_pcm_luma_coding_block_size_minus3;
++	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
++	__u8	num_short_term_ref_pic_sets;
++	__u8	num_long_term_ref_pics_sps;
++	__u8	chroma_format_idc;
++	__u8	sps_max_sub_layers_minus1;
++
++	__u8	reserved[6];
++	__u64	flags;
++};
++
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
++#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
++#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
++#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
++#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
++#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
++#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
++#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
++#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
++#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
++#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
++#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
++#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
++#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
++#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
++#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
++#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
++#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
++#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
++
++/**
++ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
++ *
++ * @pic_parameter_set_id: identifies the PPS for reference by other
++ *			  syntax elements
++ * @num_extra_slice_header_bits: specifies the number of extra slice header
++ *				 bits that are present in the slice header RBSP
++ *				 for coded pictures referring to the PPS.
++ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
++ *                                        inferred value of num_ref_idx_l0_active_minus1
++ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
++ *                                        inferred value of num_ref_idx_l1_active_minus1
++ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for
++ *		     each slice referring to the PPS
++ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
++ *			    tree block size and the minimum luma coding block
++ *			    size of coding units that convey cu_qp_delta_abs
++ *			    and cu_qp_delta_sign_flag
++ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
++ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
++ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
++ *			     partitioning the picture
++ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
++ *			  the picture
++ * @column_width_minus1: this value plus 1 specifies the width of each tile column in
++ *			 units of coding tree blocks
++ * @row_height_minus1: this value plus 1 specifies the height of each tile row in
++ *		       units of coding tree blocks
++ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
++ *			  beta divided by 2
++ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
++ *			divided by 2
++ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
++ *                                    the variable Log2ParMrgLevel
++ * @reserved: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_PPS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_pps {
++	__u8	pic_parameter_set_id;
++	__u8	num_extra_slice_header_bits;
++	__u8	num_ref_idx_l0_default_active_minus1;
++	__u8	num_ref_idx_l1_default_active_minus1;
++	__s8	init_qp_minus26;
++	__u8	diff_cu_qp_delta_depth;
++	__s8	pps_cb_qp_offset;
++	__s8	pps_cr_qp_offset;
++	__u8	num_tile_columns_minus1;
++	__u8	num_tile_rows_minus1;
++	__u8	column_width_minus1[20];
++	__u8	row_height_minus1[22];
++	__s8	pps_beta_offset_div2;
++	__s8	pps_tc_offset_div2;
++	__u8	log2_parallel_merge_level_minus2;
++	__u8	reserved;
++	__u64	flags;
++};
++
++#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
++
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME				0
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD			1
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD			2
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM			3
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP			4
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP			5
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM		6
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING			7
++#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING			8
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM	9
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP	10
++#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM		11
++#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP		12
++
++#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
++
++/**
++ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
++ *
++ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
++ * @flags: long term flag for the reference frame
++ * @field_pic: whether the reference is a field picture or a frame.
++ * @reserved: padding field. Should be zeroed by applications.
++ * @pic_order_cnt_val: the picture order count of the current picture.
++ */
++struct v4l2_hevc_dpb_entry {
++	__u64	timestamp;
++	__u8	flags;
++	__u8	field_pic;
++	__u16	reserved;
++	__s32	pic_order_cnt_val;
++};
++
++/**
++ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
++ *
++ * @delta_luma_weight_l0: the difference of the weighting factor applied
++ *			  to the luma prediction value for list 0
++ * @luma_offset_l0: the additive offset applied to the luma prediction value
++ *		    for list 0
++ * @delta_chroma_weight_l0: the difference of the weighting factor applied
++ *			    to the chroma prediction values for list 0
++ * @chroma_offset_l0: the difference of the additive offset applied to
++ *		      the chroma prediction values for list 0
++ * @delta_luma_weight_l1: the difference of the weighting factor applied
++ *			  to the luma prediction value for list 1
++ * @luma_offset_l1: the additive offset applied to the luma prediction value
++ *		    for list 1
++ * @delta_chroma_weight_l1: the difference of the weighting factor applied
++ *			    to the chroma prediction values for list 1
++ * @chroma_offset_l1: the difference of the additive offset applied to
++ *		      the chroma prediction values for list 1
++ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
++ *			    all luma weighting factors
++ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
++ *				    of the denominator for all chroma
++ *				    weighting factors
++ */
++struct v4l2_hevc_pred_weight_table {
++	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
++
++	__u8	luma_log2_weight_denom;
++	__s8	delta_chroma_log2_weight_denom;
++};
++
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
++#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
++
++/**
++ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
++ *
++ * This control is a dynamically sized 1-dimensional array,
++ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
++ *
++ * @bit_size: size (in bits) of the current slice data
++ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
++ * @num_entry_point_offsets: specifies the number of entry point offset syntax
++ *			     elements in the slice header.
++ * @nal_unit_type: specifies the NAL unit type from the NAL unit header (not the B/P/I slice type)
++ * @nuh_temporal_id_plus1: this value minus 1 specifies a temporal identifier for the NAL unit
++ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
++ * @colour_plane_id: specifies the colour plane associated with the current slice
++ * @slice_pic_order_cnt: specifies the picture order count
++ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
++ *                                reference index for reference picture list 0
++ *                                that may be used to decode the slice
++ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
++ *                                reference index for reference picture list 1
++ *                                that may be used to decode the slice
++ * @collocated_ref_idx: specifies the reference index of the collocated picture used
++ *			for temporal motion vector prediction
++ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
++ *				   motion vector prediction candidates supported in
++ *				   the slice subtracted from 5
++ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
++ *		    blocks in the slice
++ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
++ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
++ * @slice_act_y_qp_offset: screen content extension parameters
++ * @slice_act_cb_qp_offset: screen content extension parameters
++ * @slice_act_cr_qp_offset: screen content extension parameters
++ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
++ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
++ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
++ *		more fields
++ * @reserved0: padding field. Should be zeroed by applications.
++ * @slice_segment_addr: specifies the address of the first coding tree block in
++ *			the slice segment
++ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
++ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ *				 pictures set included in the SPS
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ *				pictures set included in the SPS
++ * @pred_weight_table: the prediction weight coefficients for inter-picture
++ *		       prediction
++ * @reserved1: padding field. Should be zeroed by applications.
++ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_slice_params {
++	__u32	bit_size;
++	__u32	data_byte_offset;
++	__u32	num_entry_point_offsets;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++	__u8	nal_unit_type;
++	__u8	nuh_temporal_id_plus1;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u8	slice_type;
++	__u8	colour_plane_id;
++	__s32	slice_pic_order_cnt;
++	__u8	num_ref_idx_l0_active_minus1;
++	__u8	num_ref_idx_l1_active_minus1;
++	__u8	collocated_ref_idx;
++	__u8	five_minus_max_num_merge_cand;
++	__s8	slice_qp_delta;
++	__s8	slice_cb_qp_offset;
++	__s8	slice_cr_qp_offset;
++	__s8	slice_act_y_qp_offset;
++	__s8	slice_act_cb_qp_offset;
++	__s8	slice_act_cr_qp_offset;
++	__s8	slice_beta_offset_div2;
++	__s8	slice_tc_offset_div2;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++	__u8	pic_struct;
++
++	__u8	reserved0[3];
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++	__u32	slice_segment_addr;
++	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u16	short_term_ref_pic_set_size;
++	__u16	long_term_ref_pic_set_size;
++
++	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
++	struct v4l2_hevc_pred_weight_table pred_weight_table;
++
++	__u8	reserved1[2];
++	__u64	flags;
++};
++
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
++#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
++#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
++
++/**
++ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
++ *
++ * @pic_order_cnt_val: picture order count
++ * @short_term_ref_pic_set_size: specifies the size of short-term reference
++ *				 pictures set included in the SPS of the first slice
++ * @long_term_ref_pic_set_size: specifies the size of long-term reference
++ *				pictures set included in the SPS of the first slice
++ * @num_active_dpb_entries: the number of entries in dpb
++ * @num_poc_st_curr_before: the number of reference pictures in the short-term
++ *			    set that come before the current frame
++ * @num_poc_st_curr_after: the number of reference pictures in the short-term
++ *			   set that come after the current frame
++ * @num_poc_lt_curr: the number of reference pictures in the long-term set
++ * @poc_st_curr_before: provides the index of the short term before references
++ *			in DPB array
++ * @poc_st_curr_after: provides the index of the short term after references
++ *		       in DPB array
++ * @poc_lt_curr: provides the index of the long term references in DPB array
++ * @reserved: padding field. Should be zeroed by applications.
++ * @dpb: the decoded picture buffer, for meta-data about reference frames
++ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
++ */
++struct v4l2_ctrl_hevc_decode_params {
++	__s32	pic_order_cnt_val;
++	__u16	short_term_ref_pic_set_size;
++	__u16	long_term_ref_pic_set_size;
++	__u8	num_active_dpb_entries;
++	__u8	num_poc_st_curr_before;
++	__u8	num_poc_st_curr_after;
++	__u8	num_poc_lt_curr;
++	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u8	reserved[4];
++	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
++	__u64	flags;
++};
++
++/**
++ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
++ *
++ * @scaling_list_4x4: scaling list is used for the scaling process for
++ *		      transform coefficients. The values on each scaling
++ *		      list are expected in raster scan order
++ * @scaling_list_8x8: scaling list is used for the scaling process for
++ *		      transform coefficients. The values on each scaling
++ *		      list are expected in raster scan order
++ * @scaling_list_16x16:	scaling list is used for the scaling process for
++ *			transform coefficients. The values on each scaling
++ *			list are expected in raster scan order
++ * @scaling_list_32x32:	scaling list is used for the scaling process for
++ *			transform coefficients. The values on each scaling
++ *			list are expected in raster scan order
++ * @scaling_list_dc_coef_16x16:	scaling list is used for the scaling process
++ *				for transform coefficients. The values on each
++ *				scaling list are expected in raster scan order.
++ * @scaling_list_dc_coef_32x32:	scaling list is used for the scaling process
++ *				for transform coefficients. The values on each
++ *				scaling list are expected in raster scan order.
++ */
++struct v4l2_ctrl_hevc_scaling_matrix {
++	__u8	scaling_list_4x4[6][16];
++	__u8	scaling_list_8x8[6][64];
++	__u8	scaling_list_16x16[6][64];
++	__u8	scaling_list_32x32[2][64];
++	__u8	scaling_list_dc_coef_16x16[6];
++	__u8	scaling_list_dc_coef_32x32[2];
++};
++
++#endif
+diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
+index 463d352055..7feff43c28 100644
+--- a/libavcodec/hevc_parser.c
++++ b/libavcodec/hevc_parser.c
+@@ -98,6 +98,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal,
+     avctx->profile  = ps->sps->ptl.general_ptl.profile_idc;
+     avctx->level    = ps->sps->ptl.general_ptl.level_idc;
+ 
++    if (ps->sps->chroma_format_idc == 1) {
++        avctx->chroma_sample_location = ps->sps->vui.chroma_loc_info_present_flag ?
++            ps->sps->vui.chroma_sample_loc_type_top_field + 1 :
++            AVCHROMA_LOC_LEFT;
++    }
++    else if (ps->sps->chroma_format_idc == 2 ||
++             ps->sps->chroma_format_idc == 3) {
++        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
++    }
++    else {
++        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++    }
++
+     if (ps->vps->vps_timing_info_present_flag) {
+         num = ps->vps->vps_num_units_in_tick;
+         den = ps->vps->vps_time_scale;
+diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
+index 4f6d985ae6..eefae71275 100644
+--- a/libavcodec/hevc_refs.c
++++ b/libavcodec/hevc_refs.c
+@@ -96,18 +96,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
+         if (!frame->rpl_buf)
+             goto fail;
+ 
+-        frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
+-        if (!frame->tab_mvf_buf)
+-            goto fail;
+-        frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++        if (s->tab_mvf_pool) {
++            frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
++            if (!frame->tab_mvf_buf)
++                goto fail;
++            frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
++        }
+ 
+-        frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
+-        if (!frame->rpl_tab_buf)
+-            goto fail;
+-        frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
+-        frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
+-        for (j = 0; j < frame->ctb_count; j++)
+-            frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++        if (s->rpl_tab_pool) {
++            frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
++            if (!frame->rpl_tab_buf)
++                goto fail;
++            frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
++            frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
++            for (j = 0; j < frame->ctb_count; j++)
++                frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
++        }
+ 
+         frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
+         frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
+@@ -276,14 +280,17 @@ static int init_slice_rpl(HEVCContext *s)
+     int ctb_count    = frame->ctb_count;
+     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
+     int i;
++    RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
+ 
+     if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
+         return AVERROR_INVALIDDATA;
+ 
+-    for (i = ctb_addr_ts; i < ctb_count; i++)
+-        frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
++    if (frame->rpl_tab) {
++        for (i = ctb_addr_ts; i < ctb_count; i++)
++            frame->rpl_tab[i] = tab;
++    }
+ 
+-    frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
++    frame->refPicList = tab->refPicList;
+ 
+     return 0;
+ }
+diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
+index 2231aed259..7b05b41441 100644
+--- a/libavcodec/hevcdec.c
++++ b/libavcodec/hevcdec.c
+@@ -333,6 +333,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps)
+ 
+     ff_set_sar(avctx, sps->vui.sar);
+ 
++    if (sps->chroma_format_idc == 1) {
++        avctx->chroma_sample_location = sps->vui.chroma_loc_info_present_flag ?
++            sps->vui.chroma_sample_loc_type_top_field + 1 :
++            AVCHROMA_LOC_LEFT;
++    }
++    else if (sps->chroma_format_idc == 2 ||
++             sps->chroma_format_idc == 3) {
++        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
++    }
++    else {
++        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
++    }
++
+     if (sps->vui.video_signal_type_present_flag)
+         avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
+                                                             : AVCOL_RANGE_MPEG;
+@@ -392,14 +405,20 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
+ #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
+                      CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
+                      CONFIG_HEVC_NVDEC_HWACCEL + \
++                     CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
+                      CONFIG_HEVC_VAAPI_HWACCEL + \
+                      CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
++                     CONFIG_HEVC_RPI4_8_HWACCEL + \
++                     CONFIG_HEVC_RPI4_10_HWACCEL + \
+                      CONFIG_HEVC_VDPAU_HWACCEL)
+     enum AVPixelFormat pix_fmts[HWACCEL_MAX + 2], *fmt = pix_fmts;
+ 
+     switch (sps->pix_fmt) {
+     case AV_PIX_FMT_YUV420P:
+     case AV_PIX_FMT_YUVJ420P:
++#if CONFIG_HEVC_RPI4_8_HWACCEL
++        *fmt++ = AV_PIX_FMT_RPI4_8;
++#endif
+ #if CONFIG_HEVC_DXVA2_HWACCEL
+         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+ #endif
+@@ -418,9 +437,15 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
+ #endif
+ #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
+         *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
++#endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++        *fmt++ = AV_PIX_FMT_DRM_PRIME;
+ #endif
+         break;
+     case AV_PIX_FMT_YUV420P10:
++#if CONFIG_HEVC_RPI4_10_HWACCEL
++        *fmt++ = AV_PIX_FMT_RPI4_10;
++#endif
+ #if CONFIG_HEVC_DXVA2_HWACCEL
+         *fmt++ = AV_PIX_FMT_DXVA2_VLD;
+ #endif
+@@ -439,6 +464,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
+ #endif
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+         *fmt++ = AV_PIX_FMT_CUDA;
++#endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++        *fmt++ = AV_PIX_FMT_DRM_PRIME;
+ #endif
+         break;
+     case AV_PIX_FMT_YUV444P:
+@@ -485,6 +513,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps,
+     if (!sps)
+         return 0;
+ 
++    // If hwaccel then we don't need all the s/w decode helper arrays
++    if (s->avctx->hwaccel) {
++        export_stream_params(s, sps);
++
++        s->avctx->pix_fmt = pix_fmt;
++        s->ps.sps = sps;
++        s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++        return 0;
++    }
++
+     ret = pic_arrays_init(s, sps);
+     if (ret < 0)
+         goto fail;
+@@ -2901,11 +2939,13 @@ static int hevc_frame_start(HEVCContext *s)
+                            ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
+     int ret;
+ 
+-    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
+-    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
+-    memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
+-    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
+-    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++    if (s->horizontal_bs) {
++        memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
++        memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
++        memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
++        memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
++        memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
++    }
+ 
+     s->is_decoded        = 0;
+     s->first_nal_type    = s->nal_unit_type;
+@@ -3327,7 +3367,14 @@ static int hevc_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
+     s->ref = NULL;
+     ret    = decode_nal_units(s, avpkt->data, avpkt->size);
+     if (ret < 0)
++    {
++        // Ensure that hwaccel knows this frame is over
++        if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame) {
++            s->avctx->hwaccel->abort_frame(s->avctx);
++        }
++
+         return ret;
++    }
+ 
+     if (avctx->hwaccel) {
+         if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
+@@ -3370,15 +3417,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
+     if (ret < 0)
+         return ret;
+ 
+-    dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
+-    if (!dst->tab_mvf_buf)
+-        goto fail;
+-    dst->tab_mvf = src->tab_mvf;
++    if (src->tab_mvf_buf) {
++        dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
++        if (!dst->tab_mvf_buf)
++            goto fail;
++        dst->tab_mvf = src->tab_mvf;
++    }
+ 
+-    dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
+-    if (!dst->rpl_tab_buf)
+-        goto fail;
+-    dst->rpl_tab = src->rpl_tab;
++    if (src->rpl_tab_buf) {
++        dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
++        if (!dst->rpl_tab_buf)
++            goto fail;
++        dst->rpl_tab = src->rpl_tab;
++    }
+ 
+     dst->rpl_buf = av_buffer_ref(src->rpl_buf);
+     if (!dst->rpl_buf)
+@@ -3697,6 +3748,15 @@ AVCodec ff_hevc_decoder = {
+ #if CONFIG_HEVC_NVDEC_HWACCEL
+                                HWACCEL_NVDEC(hevc),
+ #endif
++#if CONFIG_HEVC_RPI4_8_HWACCEL
++                               HWACCEL_RPI4_8(hevc),
++#endif
++#if CONFIG_HEVC_RPI4_10_HWACCEL
++                               HWACCEL_RPI4_10(hevc),
++#endif
++#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
++                               HWACCEL_V4L2REQUEST(hevc),
++#endif
+ #if CONFIG_HEVC_VAAPI_HWACCEL
+                                HWACCEL_VAAPI(hevc),
+ #endif
+diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
+index 8e54cf73f9..2277aadf75 100644
+--- a/libavcodec/hwaccels.h
++++ b/libavcodec/hwaccels.h
+@@ -39,6 +39,9 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
+ extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
+ extern const AVHWAccel ff_hevc_dxva2_hwaccel;
+ extern const AVHWAccel ff_hevc_nvdec_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_8_hwaccel;
++extern const AVHWAccel ff_hevc_rpi4_10_hwaccel;
++extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
+ extern const AVHWAccel ff_hevc_vaapi_hwaccel;
+ extern const AVHWAccel ff_hevc_vdpau_hwaccel;
+ extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
+diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
+index f421dc909f..f93283b893 100644
+--- a/libavcodec/hwconfig.h
++++ b/libavcodec/hwconfig.h
+@@ -24,6 +24,7 @@
+ 
+ 
+ #define HWACCEL_CAP_ASYNC_SAFE      (1 << 0)
++#define HWACCEL_CAP_MT_SAFE         (1 << 1)
+ 
+ 
+ typedef struct AVCodecHWConfigInternal {
+@@ -70,6 +71,12 @@ typedef struct AVCodecHWConfigInternal {
+     HW_CONFIG_HWACCEL(1, 1, 0, D3D11,        D3D11VA,      ff_ ## codec ## _d3d11va2_hwaccel)
+ #define HWACCEL_NVDEC(codec) \
+     HW_CONFIG_HWACCEL(1, 1, 0, CUDA,         CUDA,         ff_ ## codec ## _nvdec_hwaccel)
++#define HWACCEL_RPI4_8(codec) \
++    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_8,       NONE,         ff_ ## codec ## _rpi4_8_hwaccel)
++#define HWACCEL_RPI4_10(codec) \
++    HW_CONFIG_HWACCEL(0, 0, 1, RPI4_10,      NONE,         ff_ ## codec ## _rpi4_10_hwaccel)
++#define HWACCEL_V4L2REQUEST(codec) \
++    HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME,    DRM,          ff_ ## codec ## _v4l2request_hwaccel)
+ #define HWACCEL_VAAPI(codec) \
+     HW_CONFIG_HWACCEL(1, 1, 1, VAAPI,        VAAPI,        ff_ ## codec ## _vaapi_hwaccel)
+ #define HWACCEL_VDPAU(codec) \
+diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
+index cb15ac072a..f6261db962 100644
+--- a/libavcodec/mmaldec.c
++++ b/libavcodec/mmaldec.c
+@@ -24,6 +24,9 @@
+  * MMAL Video Decoder
+  */
+ 
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
+ #include <bcm_host.h>
+ #include <interface/mmal/mmal.h>
+ #include <interface/mmal/mmal_parameters_video.h>
+@@ -31,6 +34,7 @@
+ #include <interface/mmal/util/mmal_util_params.h>
+ #include <interface/mmal/util/mmal_default_components.h>
+ #include <interface/mmal/vc/mmal_vc_api.h>
++#pragma GCC diagnostic pop
+ #include <stdatomic.h>
+ 
+ #include "avcodec.h"
+diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
+index 9176027f15..0b0ff03c18 100644
+--- a/libavcodec/pthread_frame.c
++++ b/libavcodec/pthread_frame.c
+@@ -209,7 +209,8 @@ FF_ENABLE_DEPRECATION_WARNINGS
+ 
+         /* if the previous thread uses hwaccel then we take the lock to ensure
+          * the threads don't run concurrently */
+-        if (avctx->hwaccel) {
++        if (avctx->hwaccel &&
++            !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
+             pthread_mutex_lock(&p->parent->hwaccel_mutex);
+             p->hwaccel_serializing = 1;
+         }
+@@ -636,7 +637,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
+ 
+     if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
+ 
+-    if (avctx->hwaccel && !p->hwaccel_serializing) {
++    if (avctx->hwaccel &&
++        !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
++        !p->hwaccel_serializing) {
+         pthread_mutex_lock(&p->parent->hwaccel_mutex);
+         p->hwaccel_serializing = 1;
+     }
+diff --git a/libavcodec/raw.c b/libavcodec/raw.c
+index 079d5c5d10..0781f28615 100644
+--- a/libavcodec/raw.c
++++ b/libavcodec/raw.c
+@@ -294,6 +294,12 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
+     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
+     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
+ 
++    /* RPI (Might as well define for everything) */
++    { AV_PIX_FMT_SAND128,     MKTAG('S', 'A', 'N', 'D') },
++    { AV_PIX_FMT_RPI4_8,      MKTAG('S', 'A', 'N', 'D') },
++    { AV_PIX_FMT_SAND64_10,   MKTAG('S', 'N', 'D', 'A') },
++    { AV_PIX_FMT_RPI4_10,     MKTAG('S', 'N', 'D', 'B') },
++
+     { AV_PIX_FMT_NONE, 0 },
+ };
+ 
+diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
+index d181b74570..b943dd0379 100644
+--- a/libavcodec/rawenc.c
++++ b/libavcodec/rawenc.c
+@@ -24,6 +24,7 @@
+  * Raw Video Encoder
+  */
+ 
++#include "config.h"
+ #include "avcodec.h"
+ #include "raw.h"
+ #include "internal.h"
+@@ -31,6 +32,10 @@
+ #include "libavutil/intreadwrite.h"
+ #include "libavutil/imgutils.h"
+ #include "libavutil/internal.h"
++#include "libavutil/avassert.h"
++#if CONFIG_SAND
++#include "libavutil/rpi_sand_fns.h"
++#endif
+ 
+ static av_cold int raw_encode_init(AVCodecContext *avctx)
+ {
+@@ -49,22 +54,114 @@ FF_ENABLE_DEPRECATION_WARNINGS
+     return 0;
+ }
+ 
++#if CONFIG_SAND
++// Write the cropped 8-bit SAND frame to pkt as planar YUV420: luma plane, then two quarter-size chroma planes
++static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++                      const AVFrame *frame)
++{
++    const int width = av_frame_cropped_width(frame);
++    const int height = av_frame_cropped_height(frame);
++    const int x0 = frame->crop_left;
++    const int y0 = frame->crop_top;
++    const int size = width * height * 3 / 2;
++    uint8_t * dst;
++    int ret;
++
++    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++        return ret;
++    dst = pkt->data;
++    av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++    dst += width * height;
++    av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
++                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
++    pkt->flags |= AV_PKT_FLAG_KEY; // every raw video packet is a keyframe, as in raw_encode()
++    return 0;
++}
++
++// Write the cropped 16-bit-per-sample SAND frame to pkt as planar YUV420 (2 bytes per output sample)
++static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++                      const AVFrame *frame)
++{
++    const int width = av_frame_cropped_width(frame);
++    const int height = av_frame_cropped_height(frame);
++    const int x0 = frame->crop_left;
++    const int y0 = frame->crop_top;
++    const int size = width * height * 3;
++    uint8_t * dst;
++    int ret;
++
++    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++        return ret;
++    dst = pkt->data;
++    av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
++    dst += width * height * 2;
++    av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
++                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
++    pkt->flags |= AV_PKT_FLAG_KEY; // every raw video packet is a keyframe, as in raw_encode()
++    return 0;
++}
++
++// Write the cropped SAND30 frame to pkt as planar YUV420 with 16-bit output samples
++static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
++                      const AVFrame *frame)
++{
++    const int width = av_frame_cropped_width(frame);
++    const int height = av_frame_cropped_height(frame);
++    const int x0 = frame->crop_left;
++    const int y0 = frame->crop_top;
++    const int size = width * height * 3;
++    uint8_t * dst;
++    int ret;
++
++    if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
++        return ret;
++    dst = pkt->data;
++    av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
++    dst += width * height * 2;
++    av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
++                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
++    pkt->flags |= AV_PKT_FLAG_KEY; // every raw video packet is a keyframe, as in raw_encode()
++    return 0;
++}
++#endif
++
++
+ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
+-                      const AVFrame *frame, int *got_packet)
++                      const AVFrame *src_frame, int *got_packet)
+ {
+-    int ret = av_image_get_buffer_size(frame->format,
+-                                       frame->width, frame->height, 1);
++    int ret;
++    AVFrame * frame = NULL;
+ 
+-    if (ret < 0)
++#if CONFIG_SAND
++    if (av_rpi_is_sand_frame(src_frame)) {
++        ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
++            av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
++            av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
++        *got_packet = (ret == 0);
+         return ret;
++    }
++#endif
++
++    if ((frame = av_frame_clone(src_frame)) == NULL) {
++        ret = AVERROR(ENOMEM);
++        goto fail;
++    }
++
++    if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
++        goto fail;
++
++    ret = av_image_get_buffer_size(frame->format,
++                                       frame->width, frame->height, 1);
++    if (ret < 0)
++        goto fail;
+ 
+     if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
+-        return ret;
++        goto fail;
+     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
+                                        (const uint8_t **)frame->data, frame->linesize,
+                                        frame->format,
+                                        frame->width, frame->height, 1)) < 0)
+-        return ret;
++        goto fail;
+ 
+     if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
+        frame->format   == AV_PIX_FMT_YUYV422) {
+@@ -81,8 +178,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
+         }
+     }
+     pkt->flags |= AV_PKT_FLAG_KEY;
++    av_frame_free(&frame);
+     *got_packet = 1;
+     return 0;
++
++fail:
++    av_frame_free(&frame);
++    *got_packet = 0;
++    return ret;
+ }
+ 
+ AVCodec ff_rawvideo_encoder = {
+diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c
+new file mode 100644
+index 0000000000..58c094c5f8
+--- /dev/null
++++ b/libavcodec/rpi_hevc_cabac.c
+@@ -0,0 +1,2257 @@
++/*
++ * HEVC CABAC decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#define UNCHECKED_BITSTREAM_READER 1
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++
++#include "cabac_functions.h"
++#include "rpi_hevc_data.h"
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "rpi_hevc_cabac_fns.h"
++
++#include "libavutil/rpi_sand_fns.h"
++
++// BY22 is probably faster than simple bypass if the processor has
++// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
++// x86 has fast int divide
++// Arm doesn't have divide or general fast 64 bit, but does have the multiply
++// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
++#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
++// Use native divide if we have a fast one - otherwise use mpy 1/x
++// x86 has a fast integer divide - arm doesn't - unsure about other
++// architectures
++#define USE_BY22_DIV  ARCH_X86
++
++// Special case blocks with a single significant coeff
++// Decreases the complexity of the code for a common case but increases the
++// code size.
++#define USE_N_END_1 1
++
++#if !USE_BY22_DIV
++// * 1/x @ 32 bits gets us 22 bits of accuracy
++#define CABAC_BY22_PEEK_BITS  22
++#else
++// A real 32-bit divide gets us another bit
++// If we have a 64 bit int & a unit time divider then we should get a lot
++// of bits (55)  but that is untested and it is unclear if it would give
++// us a large advantage
++#define CABAC_BY22_PEEK_BITS  23
++#endif
++
++#define CABAC_MAX_BIN 31
++
++
++#if USE_BY22 && !USE_BY22_DIV
++#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)  // 2^40 / x + 1, truncated to 32 bits
++// Reciprocal table for the multiply-by-1/x path: entry i (i >= 1) is I(256 + i); entry 0 is 0 as I(256) does not fit in 32 bits
++static const uint32_t cabac_by22_inv_range[256] = {
++                                                    0,      I(257), I(258), I(259),
++    I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
++    I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
++    I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
++    I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
++    I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
++    I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
++    I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
++    I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
++    I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
++    I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
++    I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
++    I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
++    I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
++    I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
++    I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
++    I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
++    I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
++    I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
++    I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
++    I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
++    I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
++    I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
++    I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
++    I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
++    I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
++    I(510), I(511)
++};
++#undef I
++#endif  // USE_BY22 && !USE_BY22_DIV
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_cabac.h"
++#endif
++
++/**
++ * Number of context slots (ctxIdx entries) per SyntaxElement; elem_offset[] is the running sum.
++ */
++static const int8_t num_bins_in_se[] = {
++     1, // sao_merge_flag
++     1, // sao_type_idx
++     0, // sao_eo_class
++     0, // sao_band_position
++     0, // sao_offset_abs
++     0, // sao_offset_sign
++     0, // end_of_slice_flag
++     3, // split_coding_unit_flag
++     1, // cu_transquant_bypass_flag
++     3, // skip_flag
++     3, // cu_qp_delta
++     1, // pred_mode
++     4, // part_mode
++     0, // pcm_flag
++     1, // prev_intra_luma_pred_mode
++     0, // mpm_idx
++     0, // rem_intra_luma_pred_mode
++     2, // intra_chroma_pred_mode
++     1, // merge_flag
++     1, // merge_idx
++     5, // inter_pred_idc
++     2, // ref_idx_l0
++     2, // ref_idx_l1
++     2, // abs_mvd_greater0_flag
++     2, // abs_mvd_greater1_flag
++     0, // abs_mvd_minus2
++     0, // mvd_sign_flag
++     1, // mvp_lx_flag
++     1, // no_residual_data_flag
++     3, // split_transform_flag
++     2, // cbf_luma
++     4, // cbf_cb, cbf_cr
++     2, // transform_skip_flag[][]
++     2, // explicit_rdpcm_flag[][]
++     2, // explicit_rdpcm_dir_flag[][]
++    18, // last_significant_coeff_x_prefix
++    18, // last_significant_coeff_y_prefix
++     0, // last_significant_coeff_x_suffix
++     0, // last_significant_coeff_y_suffix
++     4, // significant_coeff_group_flag
++    44, // significant_coeff_flag
++    24, // coeff_abs_level_greater1_flag
++     6, // coeff_abs_level_greater2_flag
++     0, // coeff_abs_level_remaining
++     0, // coeff_sign_flag
++     8, // log2_res_scale_abs
++     2, // res_scale_sign_flag
++     1, // cu_chroma_qp_offset_flag
++     1, // cu_chroma_qp_offset_idx
++};
++
++/**
++ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
++ */
++static const int elem_offset[sizeof(num_bins_in_se)] = {
++    0, // sao_merge_flag
++    1, // sao_type_idx
++    2, // sao_eo_class
++    2, // sao_band_position
++    2, // sao_offset_abs
++    2, // sao_offset_sign
++    2, // end_of_slice_flag
++    2, // split_coding_unit_flag
++    5, // cu_transquant_bypass_flag
++    6, // skip_flag
++    9, // cu_qp_delta
++    12, // pred_mode
++    13, // part_mode
++    17, // pcm_flag
++    17, // prev_intra_luma_pred_mode
++    18, // mpm_idx
++    18, // rem_intra_luma_pred_mode
++    18, // intra_chroma_pred_mode
++    20, // merge_flag
++    21, // merge_idx
++    22, // inter_pred_idc
++    27, // ref_idx_l0
++    29, // ref_idx_l1
++    31, // abs_mvd_greater0_flag
++    33, // abs_mvd_greater1_flag
++    35, // abs_mvd_minus2
++    35, // mvd_sign_flag
++    35, // mvp_lx_flag
++    36, // no_residual_data_flag
++    37, // split_transform_flag
++    40, // cbf_luma
++    42, // cbf_cb, cbf_cr
++    46, // transform_skip_flag[][]
++    48, // explicit_rdpcm_flag[][]
++    50, // explicit_rdpcm_dir_flag[][]
++    52, // last_significant_coeff_x_prefix
++    70, // last_significant_coeff_y_prefix
++    88, // last_significant_coeff_x_suffix
++    88, // last_significant_coeff_y_suffix
++    88, // significant_coeff_group_flag
++    92, // significant_coeff_flag
++    136, // coeff_abs_level_greater1_flag
++    160, // coeff_abs_level_greater2_flag
++    166, // coeff_abs_level_remaining
++    166, // coeff_sign_flag
++    166, // log2_res_scale_abs
++    174, // res_scale_sign_flag
++    176, // cu_chroma_qp_offset_flag
++    177, // cu_chroma_qp_offset_idx
++};
++
++#define CNU 154
++/**
++ * Indexed by init_type
++ */
++static const uint8_t init_values[3][HEVC_CONTEXTS] = {
++    { // sao_merge_flag
++      153,
++      // sao_type_idx
++      200,
++      // split_coding_unit_flag
++      139, 141, 157,
++      // cu_transquant_bypass_flag
++      154,
++      // skip_flag
++      CNU, CNU, CNU,
++      // cu_qp_delta
++      154, 154, 154,
++      // pred_mode
++      CNU,
++      // part_mode
++      184, CNU, CNU, CNU,
++      // prev_intra_luma_pred_mode
++      184,
++      // intra_chroma_pred_mode
++      63, 139,
++      // merge_flag
++      CNU,
++      // merge_idx
++      CNU,
++      // inter_pred_idc
++      CNU, CNU, CNU, CNU, CNU,
++      // ref_idx_l0
++      CNU, CNU,
++      // ref_idx_l1
++      CNU, CNU,
++      // abs_mvd_greater0_flag
++      CNU, CNU,
++      // abs_mvd_greater1_flag
++      CNU, CNU,
++      // mvp_lx_flag
++      CNU,
++      // no_residual_data_flag
++      CNU,
++      // split_transform_flag
++      153, 138, 138,
++      // cbf_luma
++      111, 141,
++      // cbf_cb, cbf_cr
++      94, 138, 182, 154,
++      // transform_skip_flag
++      139, 139,
++      // explicit_rdpcm_flag
++      139, 139,
++      // explicit_rdpcm_dir_flag
++      139, 139,
++      // last_significant_coeff_x_prefix
++      110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
++       79, 108, 123,  63,
++      // last_significant_coeff_y_prefix
++      110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
++       79, 108, 123,  63,
++      // significant_coeff_group_flag
++      91, 171, 134, 141,
++      // significant_coeff_flag
++      111, 111, 125, 110, 110,  94, 124, 108, 124, 107, 125, 141, 179, 153,
++      125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
++      139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
++      141, 111,
++      // coeff_abs_level_greater1_flag
++      140,  92, 137, 138, 140, 152, 138, 139, 153,  74, 149,  92, 139, 107,
++      122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
++      // coeff_abs_level_greater2_flag
++      138, 153, 136, 167, 152, 152,
++      // log2_res_scale_abs
++      154, 154, 154, 154, 154, 154, 154, 154,
++      // res_scale_sign_flag
++      154, 154,
++      // cu_chroma_qp_offset_flag
++      154,
++      // cu_chroma_qp_offset_idx
++      154,
++    },
++    { // sao_merge_flag
++      153,
++      // sao_type_idx
++      185,
++      // split_coding_unit_flag
++      107, 139, 126,
++      // cu_transquant_bypass_flag
++      154,
++      // skip_flag
++      197, 185, 201,
++      // cu_qp_delta
++      154, 154, 154,
++      // pred_mode
++      149,
++      // part_mode
++      154, 139, 154, 154,
++      // prev_intra_luma_pred_mode
++      154,
++      // intra_chroma_pred_mode
++      152, 139,
++      // merge_flag
++      110,
++      // merge_idx
++      122,
++      // inter_pred_idc
++      95, 79, 63, 31, 31,
++      // ref_idx_l0
++      153, 153,
++      // ref_idx_l1
++      153, 153,
++      // abs_mvd_greater0_flag
++      140, 198,
++      // abs_mvd_greater1_flag
++      140, 198,
++      // mvp_lx_flag
++      168,
++      // no_residual_data_flag
++      79,
++      // split_transform_flag
++      124, 138, 94,
++      // cbf_luma
++      153, 111,
++      // cbf_cb, cbf_cr
++      149, 107, 167, 154,
++      // transform_skip_flag
++      139, 139,
++      // explicit_rdpcm_flag
++      139, 139,
++      // explicit_rdpcm_dir_flag
++      139, 139,
++      // last_significant_coeff_x_prefix
++      125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
++       94, 108, 123, 108,
++      // last_significant_coeff_y_prefix
++      125, 110,  94, 110,  95,  79, 125, 111, 110,  78, 110, 111, 111,  95,
++       94, 108, 123, 108,
++      // significant_coeff_group_flag
++      121, 140, 61, 154,
++      // significant_coeff_flag
++      155, 154, 139, 153, 139, 123, 123,  63, 153, 166, 183, 140, 136, 153,
++      154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
++      153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
++      140, 140,
++      // coeff_abs_level_greater1_flag
++      154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
++      136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
++      // coeff_abs_level_greater2_flag
++      107, 167, 91, 122, 107, 167,
++      // log2_res_scale_abs
++      154, 154, 154, 154, 154, 154, 154, 154,
++      // res_scale_sign_flag
++      154, 154,
++      // cu_chroma_qp_offset_flag
++      154,
++      // cu_chroma_qp_offset_idx
++      154,
++    },
++    { // sao_merge_flag
++      153,
++      // sao_type_idx
++      160,
++      // split_coding_unit_flag
++      107, 139, 126,
++      // cu_transquant_bypass_flag
++      154,
++      // skip_flag
++      197, 185, 201,
++      // cu_qp_delta
++      154, 154, 154,
++      // pred_mode
++      134,
++      // part_mode
++      154, 139, 154, 154,
++      // prev_intra_luma_pred_mode
++      183,
++      // intra_chroma_pred_mode
++      152, 139,
++      // merge_flag
++      154,
++      // merge_idx
++      137,
++      // inter_pred_idc
++      95, 79, 63, 31, 31,
++      // ref_idx_l0
++      153, 153,
++      // ref_idx_l1
++      153, 153,
++      // abs_mvd_greater0_flag
++      169, 198,
++      // abs_mvd_greater1_flag
++      169, 198,
++      // mvp_lx_flag
++      168,
++      // no_residual_data_flag
++      79,
++      // split_transform_flag
++      224, 167, 122,
++      // cbf_luma
++      153, 111,
++      // cbf_cb, cbf_cr
++      149, 92, 167, 154,
++      // transform_skip_flag
++      139, 139,
++      // explicit_rdpcm_flag
++      139, 139,
++      // explicit_rdpcm_dir_flag
++      139, 139,
++      // last_significant_coeff_x_prefix
++      125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
++       79, 108, 123,  93,
++      // last_significant_coeff_y_prefix
++      125, 110, 124, 110,  95,  94, 125, 111, 111,  79, 125, 126, 111, 111,
++       79, 108, 123,  93,
++      // significant_coeff_group_flag
++      121, 140, 61, 154,
++      // significant_coeff_flag
++      170, 154, 139, 153, 139, 123, 123,  63, 124, 166, 183, 140, 136, 153,
++      154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
++      153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
++      140, 140,
++      // coeff_abs_level_greater1_flag
++      154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
++      136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
++      // coeff_abs_level_greater2_flag
++      107, 167, 91, 107, 107, 167,
++      // log2_res_scale_abs
++      154, 154, 154, 154, 154, 154, 154, 154,
++      // res_scale_sign_flag
++      154, 154,
++      // cu_chroma_qp_offset_flag
++      154,
++      // cu_chroma_qp_offset_idx
++      154,
++    },
++};
++
++static const uint8_t scan_1x1[1] = {  // Scan table with a single position: the only entry is 0
++    0,
++};
++
++static const uint8_t horiz_scan2x2_x[4] = {  // x coordinate of each position of a 2x2 row-by-row scan
++    0, 1, 0, 1,
++};
++
++static const uint8_t horiz_scan2x2_y[4] = {  // y coordinate of each position of a 2x2 row-by-row scan
++    0, 0, 1, 1
++};
++
++static const uint8_t horiz_scan4x4_x[16] = {  // x coordinate of each position of a 4x4 row-by-row scan
++    0, 1, 2, 3,
++    0, 1, 2, 3,
++    0, 1, 2, 3,
++    0, 1, 2, 3,
++};
++
++static const uint8_t horiz_scan4x4_y[16] = {  // y coordinate of each position of a 4x4 row-by-row scan
++    0, 0, 0, 0,
++    1, 1, 1, 1,
++    2, 2, 2, 2,
++    3, 3, 3, 3,
++};
++
++static const uint8_t horiz_scan8x8_inv[8][8] = {  // [y][x] -> scan index: four 4x4 quadrants in row order, each scanned row-by-row
++    {  0,  1,  2,  3, 16, 17, 18, 19, },
++    {  4,  5,  6,  7, 20, 21, 22, 23, },
++    {  8,  9, 10, 11, 24, 25, 26, 27, },
++    { 12, 13, 14, 15, 28, 29, 30, 31, },
++    { 32, 33, 34, 35, 48, 49, 50, 51, },
++    { 36, 37, 38, 39, 52, 53, 54, 55, },
++    { 40, 41, 42, 43, 56, 57, 58, 59, },
++    { 44, 45, 46, 47, 60, 61, 62, 63, },
++};
++
++static const uint8_t diag_scan2x2_x[4] = {  // x coordinate of each position of a 2x2 diagonal scan
++    0, 0, 1, 1,
++};
++
++static const uint8_t diag_scan2x2_y[4] = {  // y coordinate of each position of a 2x2 diagonal scan
++    0, 1, 0, 1,
++};
++
++static const uint8_t diag_scan2x2_inv[2][2] = {  // [y][x] -> index in the 2x2 diagonal scan (inverse of diag_scan2x2_x/_y)
++    { 0, 2, },
++    { 1, 3, },
++};
++
++static const uint8_t diag_scan4x4_inv[4][4] = {  // [y][x] -> index in the 4x4 diagonal scan
++    { 0,  2,  5,  9, },
++    { 1,  4,  8, 12, },
++    { 3,  7, 11, 14, },
++    { 6, 10, 13, 15, },
++};
++
++static const uint8_t diag_scan8x8_inv[8][8] = {  // [y][x] -> index in a diagonal scan over an 8x8 grid
++    {  0,  2,  5,  9, 14, 20, 27, 35, },
++    {  1,  4,  8, 13, 19, 26, 34, 42, },
++    {  3,  7, 12, 18, 25, 33, 41, 48, },
++    {  6, 11, 17, 24, 32, 40, 47, 53, },
++    { 10, 16, 23, 31, 39, 46, 52, 57, },
++    { 15, 22, 30, 38, 45, 51, 56, 60, },
++    { 21, 29, 37, 44, 50, 55, 59, 62, },
++    { 28, 36, 43, 49, 54, 58, 61, 63, },
++};
++
++
++typedef struct  // One scan position expressed as two pre-computed array offsets (see XYT())
++{
++    uint16_t coeff;  // XYT_C(x,y,t): x + (y << t)
++    uint16_t scale;  // XYT_S(x,y,t): same position in an array clamped to at most 8 wide
++} xy_off_t;
++
++#define XYT_C(x,y,t) ((x) + ((y) << (t)))  // offset of (x,y) in an array whose row stride is (1 << t)
++#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))  // t clamped to a maximum of 3
++#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))  // how far t exceeds 3 (0 when t <= 3)
++#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))  // as XYT_C, but x and y are shifted down first so the row stride never exceeds 8
++
++#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}  // xy_off_t initialiser: {coeff, scale}
++
++#define OFF_DIAG(t) { /* the 16 (x,y) positions of a 4x4 block in diagonal scan order, for stride shift t */ \
++    XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
++    XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
++    XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
++    XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++#define OFF_HORIZ(t) { /* the 16 (x,y) positions of a 4x4 block row by row (x varies fastest), for stride shift t */ \
++    XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
++    XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
++    XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
++    XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
++}
++
++#define OFF_VERT(t) { /* the 16 (x,y) positions of a 4x4 block column by column (y varies fastest), for stride shift t */ \
++    XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
++    XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
++    XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
++    XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
++}
++
++static const xy_off_t off_xys[3][4][16] =  // indexed [scan type][log2 size - 2][position within the 4x4 scan]
++{
++    {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},  // row 0: diagonal
++    {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},  // row 1: horizontal
++    {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}  // row 2: vertical
++};
++
++
++// Helper fns - each is skipped if a macro of the same name is already defined
++#ifndef hevc_mem_bits32
++static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)  // offset is in bits from buf
++{
++    return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);  // 32-bit read at the containing byte, shifted so bit 'offset' is the MSB (low bits are zero filled)
++}
++#endif
++
++#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
++#define hevc_clz32 hevc_clz32_builtin
++static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)  // x must be non-zero: __builtin_clz(0) is undefined
++{
++    // __builtin_clz says it works on ints - so adjust if int is >32 bits long
++    return __builtin_clz(x) - (sizeof(int) * 8 - 32);
++}
++#endif
++
++// It is unlikely that we will ever need this but include for completeness
++#ifndef hevc_clz32
++static inline unsigned int hevc_clz32(unsigned int x)  // portable count of leading zero bits in a 32-bit value
++{
++    unsigned int n = 1;  // starts one too high; corrected by the top bit on the last line
++    if ((x & 0xffff0000) == 0) {  // top 16 bits clear: count them and normalise
++        n += 16;
++        x <<= 16;
++    }
++    if ((x & 0xff000000) == 0) {  // top 8 bits clear
++        n += 8;
++        x <<= 8;
++    }
++    if ((x & 0xf0000000) == 0) {  // top 4 bits clear
++        n += 4;
++        x <<= 4;
++    }
++    if ((x & 0xc0000000) == 0) {  // top 2 bits clear
++        n += 2;
++        x <<= 2;
++    }
++    return n - ((x >> 31) & 1);  // NOTE: gives 31 (not 32) when x == 0
++}
++#endif
++
++static inline int cabac_overflow(const CABACContext * const cc)  // non-zero if the read pointer has run well past the end of the data
++{
++    av_assert0(cc->bytestream >= cc->bytestream_start);  // the pointer must never be before the start
++    return cc->bytestream >= cc->bytestream_end + 4;  // 4 bytes of slack - presumably allows for decoder read-ahead (TODO confirm)
++}
++
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)  // exported wrapper: overflow test on lc's CABAC context
++{
++    return cabac_overflow(&lc->cc);
++}
++
++#if !USE_BY22
++// If no by22 then _by22 functions will revert to normal and so _peek/_flush
++// will no longer be called but the setup calls will still exist and we want
++// to null them out
++#define bypass_start(s)  // expands to nothing
++#define bypass_finish(s)  // expands to nothing
++#else
++// Use BY22 for residual bypass block
++
++#define bypass_start(cc) get_cabac_by22_start(cc)
++#define bypass_finish(cc) get_cabac_by22_finish(cc)
++
++// BY22 notes that bypass is simply a divide into the bitstream and so we
++// can peek out large quantities of bits at once and treat the result as if
++// it was VLC.  In many cases this will lead to O(1) processing rather than
++// O(n) though the setup and teardown is sufficiently expensive that it is
++// only worth using if we expect to be dealing with more than a few bits
++// The definition of "a few bits" will vary from platform to platform but
++// tests on ARM show that it probably isn't worth it for a single coded
++// residual, but is for >1 - it also seems likely that if there are
++// more residuals then they are likely to be bigger and this will make the
++// O(1) nature of the code more worthwhile.
++
++
++// Bypass block start
++// Must be called before _by22_peek is used as it sets the CABAC environment
++// into the correct state.  _by22_finish must be called to return to 'normal'
++// (i.e. non-bypass) cabac decoding
++#ifndef get_cabac_by22_start
++static inline void get_cabac_by22_start(CABACContext * const c)
++{
++    const unsigned int bits = __builtin_ctz(c->low);  // index of the lowest set bit of low
++    const uint32_t m = hevc_mem_bits32(c->bytestream, 0);  // the 32 bits at the current read pointer
++    uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
++#if !USE_BY22_DIV
++    const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];  // table entry selected by the low 8 bits of range
++#endif
++
++    c->bytestream -= (CABAC_BITS / 8);  // step back one refill unit; _by22_finish adds it back
++    c->by22.bits = bits;  // running bit count, advanced by _by22_flush
++#if !USE_BY22_DIV
++    c->by22.range = c->range;  // park the real range; restored by _by22_finish
++    c->range = inv;  // while in bypass mode c->range holds the table value used by _by22_peek
++#endif
++    c->low = x;
++}
++#endif
++
++// Bypass block finish
++// Must be called at the end of the bypass block to return to normal operation
++static inline void get_cabac_by22_finish(CABACContext * const c)
++{
++    unsigned int used = c->by22.bits;  // count set by _by22_start and advanced by each _by22_flush
++    unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);  // whole refill units, in bytes
++    unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);  // remainder within a refill unit
++
++    c->bytestream += bytes_used + (CABAC_BITS / 8);  // also undoes the step back made by _by22_start
++    c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;  // lowest set bit of low ends up at index bits_used
++#if !USE_BY22_DIV
++    c->range = c->by22.range;  // restore the range parked by _by22_start
++#endif
++}
++
++// Peek bypass bits
++// _by22_start must be called before _by22_peek is called and _by22_flush
++// must be called afterwards to flush any used bits
++// The actual number of valid bits returned is
++// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
++// will be at least 22 which should be long enough for any prefix or suffix
++// though probably not long enough for the worst case combination
++#ifndef get_cabac_by22_peek
++static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)  // does not modify *c
++{
++#if USE_BY22_DIV
++    return ((unsigned int)c->low / (unsigned int)c->range) << 9;  // direct division, result moved to the top of the word
++#else
++    uint32_t x = c->low & ~1U;  // bit 0 is dropped before scaling
++    const uint32_t inv = c->range;  // table value stored in c->range by _by22_start
++
++    if (inv != 0)  // inv == 0 leaves x unscaled - presumably the case where no scaling is needed (TODO confirm against cabac_by22_inv_range)
++        x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);  // high 32 bits of the 64-bit product
++
++    return x << 1;
++#endif
++}
++#endif
++
++// Flush bypass bits peeked by _by22_peek
++// Flush n bypass bits. n must be >= 1 to guarantee correct operation
++// val is an unmodified copy of whatever _by22_peek returned
++#ifndef get_cabac_by22_flush
++static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)  // n == 0 would make the (32 - n) shift undefined
++{
++    // Subtract the bits used & reshift up to the top of the word
++#if USE_BY22_DIV
++    const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
++#else
++    const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));  // uses the real range parked by _by22_start
++#endif
++
++    // and refill lower bits
++    // We will probably OR over some existing bits but that doesn't matter
++    c->by22.bits += n;  // total consumed so far; read back by _by22_finish
++    c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
++}
++#endif
++
++#endif  // USE_BY22
++
++
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)  // snapshot lc's CABAC state into s->cabac_save
++{
++    memcpy(s->cabac_save->rice, lc->stat_coeff, 4);  // the 4 stat_coeff entries
++    memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);  // HEVC_CONTEXTS bytes of context state
++}
++
++static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)  // inverse of ff_hevc_rpi_save_states()
++{
++    memcpy(lc->stat_coeff, s->cabac_save->rice, 4);  // the 4 stat_coeff entries
++    memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);  // HEVC_CONTEXTS bytes of context state
++}
++
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)  // returns whatever ff_init_cabac_decoder() returns
++{
++    GetBitContext * const gb = &lc->gb;
++    skip_bits(gb, 1);  // consume one bit, then realign the bit reader
++    align_get_bits(gb);
++    return ff_init_cabac_decoder(&lc->cc,
++                          gb->buffer + get_bits_count(gb) / 8,  // current byte position in the buffer
++                          (get_bits_left(gb) + 7) / 8);  // remaining size in bytes, rounded up
++}
++
++static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)  // fill lc->cabac_state from init_values[] and zero lc->stat_coeff
++{
++    int init_type = 2 - s->sh.slice_type;  // row of init_values[] to use
++    int i;
++
++    if (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I)
++        init_type ^= 3;  // swaps rows 1 and 2
++
++    for (i = 0; i < HEVC_CONTEXTS; i++) {
++        int init_value = init_values[init_type][i];
++        int m = (init_value >> 4) * 5 - 45;  // slope from the high nibble
++        int n = ((init_value & 15) << 3) - 16;  // offset from the low nibble
++        int pre = 2 * (((m * av_clip(s->sh.slice_qp, 0, 51)) >> 4) + n) - 127;  // slice_qp is clipped to 0..51 first
++
++        pre ^= pre >> 31;  // negative values become ~pre (assumes arithmetic right shift)
++        if (pre > 124)
++            pre = 124 + (pre & 1);  // saturate to 124/125, keeping the low bit
++        lc->cabac_state[i] = pre;
++    }
++
++    for (i = 0; i < 4; i++)
++        lc->stat_coeff[i] = 0;
++}
++
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)  // (re)initialise or reload CABAC state as requested
++{
++    if (lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0)  // fresh initialisation takes priority over a reload
++    {
++        lc->qPy_pred = s->sh.slice_qp;
++        cabac_init_state(s, lc);
++    }
++    else if ((ctb_flags & CTB_TS_FLAGS_CLOAD) != 0)  // reload the state stored by ff_hevc_rpi_save_states()
++    {
++        lc->qPy_pred = s->sh.slice_qp;
++        load_states(s, lc);
++    }
++    lc->cabac_init_req = 0;  // always cleared, whichever path was taken
++}
++
++#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))  // decode one bin with context index ctx; requires a variable named lc in scope
++
++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state)  // exported (non-inline) wrapper around get_cabac_inline()
++{
++    return get_cabac_inline(c, state);
++}
++
++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c)  // exported wrapper around get_cabac_terminate()
++{
++    return get_cabac_terminate(c);
++}
++
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc)
++{
++    // First bin is context coded: 0 means return 0 with nothing more read
++    if (GET_CABAC_LC(elem_offset[SAO_TYPE_IDX]) == 0)
++        return 0;
++
++    // Second bin is bypass coded: 0 selects band, 1 selects edge
++    return get_cabac_bypass(&lc->cc) ? SAO_EDGE : SAO_BAND;
++}
++
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc)
++{
++    // 5-bit fixed-length value built MSB first from bypass bins
++    unsigned int remaining = 5;
++    int band = 0;
++    while (remaining-- != 0)
++        band = (band << 1) | get_cabac_bypass(&lc->cc);
++    return band;
++}
++
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    // Truncated unary in bypass bins; the limit depends on bit depth (capped at 10)
++    const int c_max = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
++    int n;
++    for (n = 0; n < c_max && get_cabac_bypass(&lc->cc); n++)
++        ;
++    return n;
++}
++
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc)  // a single bypass bin, returned as-is
++{
++    return get_cabac_bypass(&lc->cc);
++}
++
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc)
++{
++    const int hi = get_cabac_bypass(&lc->cc);  // first bin read is the MSB
++    const int lo = get_cabac_bypass(&lc->cc);
++    return (hi << 1) | lo;
++}
++
++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc)  // returns the signed delta (0 if the first bin is 0)
++{
++    int val = 1;
++
++    if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0)  // first bin has its own context
++        return 0;
++
++    while (val < 5 &&
++           get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0)  // remaining prefix bins share the next context
++        val++;
++
++    if (val >= 5) {  // prefix saturated: an escape suffix follows in bypass bins
++        unsigned int k = 0;
++        while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {  // unary part: each 1 bin adds 2^k
++            val += 1 << k;
++            k++;
++        }
++//        if (k == CABAC_MAX_BIN)
++//            av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
++
++        while (k--)  // then k fixed bits, MSB first. NOTE(review): k == CABAC_MAX_BIN is not rejected here
++            val += get_cabac_bypass(&lc->cc) << k;
++    }
++    return get_cabac_bypass(&lc->cc) ? -val : val;  // final bypass bin is the sign (1 = negative)
++}
++
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    // Truncated unary with cMax = chroma_qp_offset_list_len_minus1; 5 is the largest legal value so clamp to it
++    int c_max= FFMIN(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
++    int i = 0;
++    while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
++        i++;
++
++    return i;
++}
++
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size)  // trailing comments give the bin string decoded so far
++{
++    if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1
++        return PART_2Nx2N;
++    if (log2_cb_size == s->ps.sps->log2_min_cb_size) {  // block is at the minimum coding block size
++        if (lc->cu.pred_mode == MODE_INTRA) // 0
++            return PART_NxN;
++        if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
++            return PART_2NxN;
++        if (log2_cb_size == 3) // 00
++            return PART_Nx2N;
++        if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001
++            return PART_Nx2N;
++        return PART_NxN; // 000
++    }
++
++    if (!s->ps.sps->amp_enabled_flag) {  // amp_enabled_flag clear: one more bin picks 2NxN or Nx2N
++        if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
++            return PART_2NxN;
++        return PART_Nx2N;
++    }
++
++    if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
++        if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011
++            return PART_2NxN;
++        if (get_cabac_bypass(&lc->cc)) // 0101
++            return PART_2NxnD;
++        return PART_2NxnU; // 0100
++    }
++
++    if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001
++        return PART_Nx2N;
++    if (get_cabac_bypass(&lc->cc)) // 0001
++        return PART_nRx2N;
++    return PART_nLx2N;  // 0000
++}
++
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc)
++{
++    // Truncated unary code of at most two bypass bins: 0 -> 0, 10 -> 1, 11 -> 2
++    if (!get_cabac_bypass(&lc->cc))
++        return 0;
++    return get_cabac_bypass(&lc->cc) ? 2 : 1;
++}
++
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++    // 5-bit fixed-length value built MSB first from bypass bins
++    unsigned int remaining = 5;
++    int mode = 0;
++    while (remaining-- != 0)
++        mode = (mode << 1) | get_cabac_bypass(&lc->cc);
++    return mode;
++}
++
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{
++    // Context-coded prefix bin; when set, a 2-bit bypass suffix (MSB first) follows
++    if (GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE]) != 0) {
++        const int msb = get_cabac_bypass(&lc->cc);
++        const int lsb = get_cabac_bypass(&lc->cc);
++        return (msb << 1) | lsb;
++    }
++    return 4;
++}
++
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)  // truncated unary, limit max_num_merge_cand - 1
++{
++    int i = GET_CABAC_LC(elem_offset[MERGE_IDX]);  // only the first bin is context coded
++
++    if (i != 0) {
++        while (i < s->sh.max_num_merge_cand-1 && get_cabac_bypass(&lc->cc))  // further bins are bypass coded
++            i++;
++    }
++    return i;
++}
++
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH)  // returns PRED_BI or the value of the bin using context +4
++{
++    if (nPbW + nPbH == 12)  // for this size only the context +4 bin is read, so PRED_BI cannot be returned
++        return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
++    if (GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + lc->ct_depth))  // first bin: context chosen by ct_depth
++        return PRED_BI;
++
++    return GET_CABAC_LC(elem_offset[INTER_PRED_IDC] + 4);
++}
++
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx)  // truncated unary, limit num_ref_idx_lx - 1
++{
++    int i = 0;
++    int max = num_ref_idx_lx - 1;  // largest value that can be returned
++    int max_ctx = FFMIN(max, 2);  // at most the first two bins are context coded
++
++    while (i < max_ctx && GET_CABAC_LC(elem_offset[REF_IDX_L0] + i))  // bin i uses context REF_IDX_L0 + i
++        i++;
++    if (i == 2) {  // both context-coded bins were 1: continue with bypass bins
++        while (i < max && get_cabac_bypass(&lc->cc))
++            i++;
++    }
++
++    return i;
++}
++
++static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc)  // one bin, first context of ABS_MVD_GREATER0_FLAG
++{
++    return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]);
++}
++
++static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc)  // one bin; note it uses the second (+1) context of ABS_MVD_GREATER1_FLAG
++{
++    return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
++}
++
++#if !USE_BY22
++static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc)  // bit-at-a-time version, compiled only when USE_BY22 is off
++{
++    int ret = 2;  // smallest magnitude this function can return
++    int k = 1;
++
++    while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {  // unary prefix: each 1 bin adds 2^k
++        ret += 1U << k;
++        k++;
++    }
++    if (k == CABAC_MAX_BIN) {  // prefix too long: log and return 0
++        av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
++        return 0;
++    }
++
++    while (k--)  // k suffix bits, MSB first
++        ret += get_cabac_bypass(&lc->cc) << k;
++    return get_cabac_bypass_sign(&lc->cc, -ret);  // final bypass bin applies the sign
++}
++#endif
++
++static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc)  // result of get_cabac_bypass_sign() applied to -1
++{
++    return get_cabac_bypass_sign(&lc->cc, -1);
++}
++
++static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)  // one bin; c_idx_nz (0 or 1) picks the context
++{
++    return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
++}
++
++static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)  // one bin; c_idx_nz (0 or 1) picks the context
++{
++    return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
++}
++
++static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)  // one bin; c_idx_nz (0 or 1) picks the context
++{
++    return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
++}
++
++
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) {
++    // Truncated unary (limit 4); every bin has its own context, 4 contexts per idx
++    const int ctx_base = elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx;
++    int n;
++    for (n = 0; n < 4 && GET_CABAC_LC(ctx_base + n); n++)
++        ;
++    return n;
++}
++
++static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz,
++                                                   int log2_size, int *last_scx_prefix, int *last_scy_prefix)  // outputs: the x and y prefixes
++{
++    int i = 0;
++    int max = (log2_size << 1) - 1;  // limit of each truncated-unary prefix
++    int ctx_offset, ctx_shift;
++
++    if (!c_idx_nz) {  // c_idx_nz == 0: offset and shift both depend on the size
++        ctx_offset = 3 * (log2_size - 2)  + ((log2_size - 1) >> 2);
++        ctx_shift = (log2_size + 1) >> 2;
++    } else {  // c_idx_nz != 0: fixed offset of 15
++        ctx_offset = 15;
++        ctx_shift = log2_size - 2;
++    }
++    while (i < max &&
++           GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))  // groups of (1 << ctx_shift) bins share a context
++        i++;
++    *last_scx_prefix = i;
++
++    i = 0;  // same again for y, using the Y_PREFIX contexts
++    while (i < max &&
++           GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
++        i++;
++    *last_scy_prefix = i;
++}
++
++static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc,
++                                                 int last_significant_coeff_prefix)  // prefix value decoded earlier
++{
++    int i;
++    int length = (last_significant_coeff_prefix >> 1) - 1;  // suffix length in bits
++    int value = get_cabac_bypass(&lc->cc);  // the first bin is always read, so at least one bin is consumed
++
++    for (i = 1; i < length; i++)  // remaining bins, MSB first
++        value = (value << 1) | get_cabac_bypass(&lc->cc);
++    return value;
++}
++
++static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg)
++{
++    // Context increment: bit 0 is set when ctx_cg is non-zero,
++    // bit 1 carries c_idx_nz
++    const int ctx_inc = (c_idx_nz << 1) + (ctx_cg != 0);
++
++    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + ctx_inc);
++}
++
++static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset)  // one bin; offset is relative to the SIGNIFICANT_COEFF_FLAG contexts
++{
++    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
++}
++
++#if !USE_BY22
++#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)  // without BY22 the bypass variant is just the plain decoder
++#endif
++
++
++#ifndef coeff_abs_level_remaining_decode_bypass
++static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param)  // BY22 variant: works on peeked bits rather than bin by bin
++{
++    uint32_t y;
++    unsigned int prefix;
++    unsigned int last_coeff_abs_level_remaining;
++    unsigned int n;  // number of bits to flush at the end
++
++    y = get_cabac_by22_peek(c);
++    prefix = hevc_clz32(~y);  // number of leading 1 bits in y
++    // y << prefix will always have top bit 0
++
++    if (prefix < 3) {  // short prefix: suffix is rice_param bits
++        const unsigned int suffix = (y << prefix) >> (31 - rice_param);
++        last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
++        n = prefix + 1 + rice_param;
++    }
++    else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)  // long prefix, but the whole code fits in one peek
++    {
++        const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
++
++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++        n = prefix * 2 + rice_param - 2;
++    }
++    else {  // too long for one peek: flush the prefix, then peek again for the suffix
++        unsigned int suffix;
++
++        get_cabac_by22_flush(c, prefix, y);
++        y = get_cabac_by22_peek(c);
++
++        suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
++        last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
++        n = prefix + rice_param - 2;
++    }
++
++    get_cabac_by22_flush(c, n, y);  // consume the bits used by whichever branch ran
++
++    return last_coeff_abs_level_remaining;
++}
++#endif
++
++static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param)  // bin-by-bin decoder; returns 0 if the prefix is too long
++{
++    int prefix = 0;
++    int suffix = 0;
++    int last_coeff_abs_level_remaining;
++    int i;
++
++    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))  // unary prefix: count 1 bins
++        prefix++;
++    if (prefix == CABAC_MAX_BIN) {  // over-long prefix: give up (logging is disabled here)
++//        av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", prefix);
++        return 0;
++    }
++
++    if (prefix < 3) {  // short prefix: suffix is rc_rice_param bits
++        for (i = 0; i < rc_rice_param; i++)
++            suffix = (suffix << 1) | get_cabac_bypass(c);
++        last_coeff_abs_level_remaining = (prefix << rc_rice_param) + suffix;
++    } else {  // long prefix: suffix grows by one bit per extra prefix bin
++        int prefix_minus3 = prefix - 3;
++        for (i = 0; i < prefix_minus3 + rc_rice_param; i++)
++            suffix = (suffix << 1) | get_cabac_bypass(c);
++        last_coeff_abs_level_remaining = (((1 << prefix_minus3) + 3 - 1)
++                                              << rc_rice_param) + suffix;
++    }
++
++    return last_coeff_abs_level_remaining;
++}
++
++#if !USE_BY22
++#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
++static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb)  // nb must be 1..32: nb == 0 makes the final shift undefined
++{
++    unsigned int i;
++    uint32_t ret = 0;
++
++    for (i = 0; i < nb; i++)  // collect nb bypass bins, first bin most significant
++        ret = (ret << 1) | get_cabac_bypass(c);
++
++    return ret << (32 - nb);  // left-align: the first flag ends up in bit 31
++}
++#endif
++
++#ifndef coeff_sign_flag_decode_bypass
++static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb)  // BY22 variant: take nb bits from one peek
++{
++    uint32_t y;
++    y = get_cabac_by22_peek(c);
++    get_cabac_by22_flush(c, nb, y);
++    return y & ~(0xffffffffU >> nb);  // keep only the top nb bits, left-aligned
++}
++#endif
++
++
++#ifndef get_cabac_greater1_bits
++static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
++    uint8_t * const state0)  // state0: first of the four contexts used
++{
++    unsigned int i;
++    unsigned int rv = 0;  // bins packed with the first bin most significant (bit n-1)
++    for (i = 0; i != n; ++i) {
++        const unsigned int idx = rv != 0 ? 0 : i < 3 ? i + 1 : 3;  // context 0 once any 1 has been seen, else 1, 2, 3, 3, ...
++        const unsigned int b = get_cabac(c, state0 + idx);
++        rv = (rv << 1) | b;
++    }
++    return rv;
++}
++#endif
++
++
++// N.B. levels returned are the values assuming coeff_abs_level_remaining
++// is uncoded, so 1 must be added if it is coded.  sum_abs also reflects
++// this version of events.
++static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
++    int * const pprev_subset_coded, int * const psum,
++    const unsigned int idx0_gt1, const unsigned int idx_gt2)  // idx0_gt1 / idx_gt2: offsets into lc->cabac_state
++{
++    CABACContext * const c = &lc->cc;
++    uint8_t * const state0 = lc->cabac_state + idx0_gt1;
++    uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
++    unsigned int rv;
++    unsigned int i;
++    const unsigned int n = FFMIN(n_end, 8);  // at most 8 greater1 flags are decoded
++
++    // Really this is i != n but the simple unconditional loop is cheaper
++    // and faster
++    for (i = 0; i != 8; ++i)
++        levels[i] = 1;
++
++    rv = get_cabac_greater1_bits(c, n, state0);
++
++    *pprev_subset_coded = 0;
++    *psum = n;
++
++    rv <<= (32 - n);  // left-align so the first flag is bit 31
++    if (rv != 0)
++    {
++        *pprev_subset_coded = 1;
++        *psum = n + 1;
++        i = hevc_clz32(rv);  // index of the first coefficient whose greater1 flag is set
++        levels[i] = 2;
++        if (get_cabac(c, state_gt2) == 0)  // a greater2 flag is decoded for that coefficient only
++        {
++            // Unset first coded bit
++            rv &= ~(0x80000000U >> i);
++        }
++    }
++
++    if (n_end > 8) {  // coefficients beyond the first 8 have no flags decoded here
++        const unsigned int g8 = n_end - 8;
++        rv |= ((1 << g8) - 1) << (24 - g8);  // set the g8 bits immediately below the top 8
++        for (i = 0; i != g8; ++i) {
++            levels[i + 8] = 0;
++        }
++    }
++
++    return rv;
++}
++
++// extended_precision_processing_flag must be false given we are
++// putting the result into a 16-bit array
++// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
++// scale_m is uint8_t
++//
++// scale is [40 - 72] << [0..12] based on qp - worst case is (45 << 12)
++//   or it can be 2 (if we have transquant_bypass)
++// shift is set to one less than we really want but would normally be
++//   s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
++// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
++// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
++// to achieve it
++
++#ifndef trans_scale_sat
++static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
++{
++    return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);  // the "+ 1 ... >> 1" rounds and supplies the last bit of the shift; result clipped to int16
++}
++#endif
++
++
++#ifndef update_rice
++static inline void update_rice(uint8_t * const stat_coeff,
++    const unsigned int last_coeff_abs_level_remaining,
++    const unsigned int c_rice_param)  // adapts the counter at *stat_coeff from the value just decoded
++{
++    const unsigned int x = (last_coeff_abs_level_remaining << 1) >> c_rice_param;  // twice the value, scaled down by the rice parameter
++    if (x >= 6)  // large relative to the parameter: step the counter up
++        (*stat_coeff)++;
++    else if (x == 0 && *stat_coeff > 0)  // zero: step it down, but never below 0
++        (*stat_coeff)--;
++}
++#endif
++
++
++// n must be > 0 on entry: bins are decoded for n, n-1, ... 1 (never for 0)
++#ifndef get_cabac_sig_coeff_flag_idxs
++static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * const ctx_map,  // ctx_map[n] is the context offset (from state0) for position n
++    uint8_t * p)  // output buffer; the return value points one past the last index written
++{
++    do {
++        if (get_cabac(c, state0 + ctx_map[n]))
++            *p++ = n;  // record each position whose bin decoded as 1
++    } while (--n != 0);
++    return p;
++}
++#endif
++
++
++static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
++    unsigned int n,
++    const uint8_t * ctx_map,  // const ptr here but not in asm
++    uint8_t * const flag_idx)
++{
++    // The worker returns a pointer one past the last index it stored in
++    // flag_idx[]; its distance from the start of the buffer is the number
++    // of indices that were written.
++    uint8_t * const end = get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx);
++    return end - flag_idx;
++}
++
++#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) { /* 16-entry initialiser: arguments kept in the order given */ \
++     x0,  x1,  x2,  x3,\
++     x4,  x5,  x6,  x7,\
++     x8,  x9, x10, x11,\
++    x12, x13, x14, x15}
++
++#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) { /* 16-entry initialiser: arguments read as a 4x4 grid and transposed */ \
++     x0,  x4,  x8, x12,\
++     x1,  x5,  x9, x13,\
++     x2,  x6, x10, x14,\
++     x3,  x7, x11, x15}
++
++#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) { /* 16-entry initialiser: arguments read as a 4x4 grid and emitted in the same order as OFF_DIAG */ \
++     x0,  x4,  x1,  x8,\
++     x5,  x2, x12,  x9,\
++     x6,  x3, x13, x10,\
++     x7, x14, x11, x15}
++
++
++static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz,
++    uint8_t * const significant_coeff_group_flag,
++    const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
++    int * const pPrev_sig)  // returns the index of the next coded group below i, or -1 if there is none
++{
++    while (--i >= 0) {
++        uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;  // flag byte for this group's row (one bit per column)
++        const unsigned int x_cg = scan_x_cg[i];
++
++        // For the flag decode we only care about Z/NZ but
++        // we use the full Right * 2 + Down when calculating
++        // significant coeff flags so we obtain it here.
++        //
++        // The group flag array is one longer than it needs to
++        // be so we don't need to check for y_cg limits
++        const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
++
++        if (i == 0 ||
++            significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig))  // group 0 is taken as coded without reading a flag
++        {
++            gf_y[0] |= (1 << x_cg);  // mark this group as coded
++            *pPrev_sig = prev_sig;  // only written when a group is found
++            break;
++        }
++    }
++
++    return i;
++}
++
++static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++    const unsigned int log2_trafo_size, const unsigned int c_idx,
++    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)  // queue (or merge) an add-residual command in jb->intra
++{
++    const AVFrame * const frame = s->frame;
++    const unsigned int stride = frame_stride1(s->frame, c_idx);
++    const unsigned int x = x0 >> ctx_hshift(s, c_idx);  // x0, y0 scaled by the shifts for this plane
++    const unsigned int y = y0 >> ctx_vshift(s, c_idx);
++    const int is_sliced = 1;  // av_rpi_is_sand_frame(frame);
++    uint8_t * const dst = !is_sliced ?
++            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
++        c_idx == 0 ?
++            av_rpi_sand_frame_pos_y(frame, x, y) :
++            av_rpi_sand_frame_pos_c(frame, x, y);
++
++    const unsigned int i = jb->intra.n;  // number of commands already queued
++    HEVCPredCmd *const pc = jb->intra.cmds + i - 1;  // previous command; only dereferenced when i != 0
++
++    if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
++        pc->ta.dst == dst)  // previous command is a U residual for the same destination
++    {
++        av_assert1(pc->size == log2_trafo_size &&
++                   pc->c_idx == 1 &&
++                   pc->ta.stride == stride);
++
++        pc->type = RPI_PRED_ADD_RESIDUAL_C;  // retag the existing command rather than queue a new one
++    }
++    else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
++        pc->dc.dst == dst)  // previous command is a U DC add for the same destination
++    {
++        const int16_t dc = (int16_t)pc->dc.dc;  // Discard top bits
++        av_assert1(pc->size == log2_trafo_size &&
++                   pc->c_idx == 1 &&
++                   pc->dc.stride == stride);
++
++        // Rewrite as add residual - must rewrite all fields as different union member
++        pc->type = RPI_PRED_ADD_RESIDUAL_V;
++        pc->ta.buf = coeffs;
++        pc->ta.dst = dst;
++        pc->ta.stride = stride;
++        pc->ta.dc = dc;  // carries over the DC value read from the old command
++    }
++    else  // nothing to merge with: append a new command
++    {
++        HEVCPredCmd * const cmd = pc + 1;
++        jb->intra.n = i + 1;
++
++        cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);  // command type is offset by the plane index
++        cmd->size = log2_trafo_size;
++        cmd->ta.buf = coeffs;
++        cmd->ta.dst = dst;
++        cmd->ta.stride = stride;
++        cmd->ta.dc = 0;
++    }
++}
++
++
++static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++    const unsigned int log2_trafo_size, const unsigned int c_idx,
++    const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)  // queue (or merge) a DC-only add; only coeffs[0] is read
++{
++    const AVFrame * const frame = s->frame;
++    const unsigned int stride = frame_stride1(s->frame, c_idx);
++    const unsigned int x = x0 >> ctx_hshift(s, c_idx);  // x0, y0 scaled by the shifts for this plane
++    const unsigned int y = y0 >> ctx_vshift(s, c_idx);
++    const int is_sliced = 1;
++    uint8_t * const dst = !is_sliced ?
++            s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
++        c_idx == 0 ?
++            av_rpi_sand_frame_pos_y(frame, x, y) :
++            av_rpi_sand_frame_pos_c(frame, x, y);
++
++    const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
++    const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);  // rounding term is (1 << shift) | 1
++
++    const unsigned int i = jb->intra.n;  // number of commands already queued
++    HEVCPredCmd *const pc = jb->intra.cmds + i - 1;  // previous command; only dereferenced when i != 0
++
++    if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
++        pc->ta.dst == dst)  // previous command is a U residual for the same destination
++    {
++        av_assert1(pc->size == log2_trafo_size &&
++                   pc->c_idx == 1 &&
++                   pc->ta.stride == stride);
++
++        pc->ta.dc = (int16_t)coeff;  // attach this DC to the existing command
++    }
++    else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
++        pc->dc.dst == dst)  // previous command is a U DC add for the same destination
++    {
++        av_assert1(pc->size == log2_trafo_size &&
++                   pc->c_idx == 1 &&
++                   pc->dc.stride == stride &&
++                   (pc->dc.dc & ~0xffff) == 0);
++
++        pc->dc.dc |= (coeff << 16);  // this DC goes in the top 16 bits alongside the existing low 16
++    }
++    else  // nothing to merge with: append a new command
++    {
++        HEVCPredCmd * const cmd = pc + 1;
++        jb->intra.n = i + 1;
++
++        cmd->type = RPI_PRED_ADD_DC + c_idx;  // command type is offset by the plane index
++        cmd->size = log2_trafo_size;
++        cmd->dc.dst = dst;
++        cmd->dc.stride = stride;
++        cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;  // c_idx 0: whole value; 1: low 16 bits; 2: high 16 bits
++    }
++}
++
++// Decode the residual of one transform block: CABAC-decode and scale the coefficients, run any hevcdsp transform needed here, then queue via rpi_add_dc()/rpi_add_residual().
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                const int x0, const int y0,
++                                const int log2_trafo_size, const enum ScanType scan_idx,
++                                const int c_idx)
++{
++    int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
++
++    int last_significant_coeff_x, last_significant_coeff_y;
++    int num_coeff = 0;
++    int prev_subset_coded = 0;
++
++    int num_last_subset;
++    int x_cg_last_sig, y_cg_last_sig;
++
++    const uint8_t *scan_x_cg, *scan_y_cg;
++    const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
++
++    int use_vpu;
++#if RPI_COMPRESS_COEFFS                                
++    int num_nonzero = 0;
++    int use_compress = 0;
++    int *coeffs32;
++#endif
++    int use_dc = 0;  // Set when num_coeff == 1 and the block can go through rpi_add_dc() using dummy_coeffs
++    int16_t *coeffs;
++    uint8_t significant_coeff_group_flag[9] = {0};  // Allow 1 final byte that is always zero
++    int explicit_rdpcm_flag = 0;
++    int explicit_rdpcm_dir_flag;
++
++    int i;
++    int shift,scale;
++    const uint8_t *scale_matrix = NULL;
++    uint8_t dc_scale;
++    const int c_idx_nz = (c_idx != 0);
++    const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
++    int prev_sig = 0;
++    int may_hide_sign;
++
++    int16_t dummy_coeffs[16];
++
++    // Derive QP for dequant
++    if (!lc->cu.cu_transquant_bypass_flag) {
++        may_hide_sign = s->ps.pps->sign_data_hiding_flag;
++
++        if (s->ps.pps->transform_skip_enabled_flag &&
++            log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
++            int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz);
++            if (transform_skip_flag) {
++                trans_skip_or_bypass = 1;
++                if (lc->cu.pred_mode ==  MODE_INTRA  &&
++                    s->ps.sps->implicit_rdpcm_enabled_flag &&
++                    (pred_mode_intra == 10 || pred_mode_intra == 26)) {
++                    may_hide_sign = 0;
++                }
++            }
++        }
++
++        {
++            static const uint8_t level_scale[8] = {
++                40, 45, 51, 57, 64, 72, 0, 0  // Pad to 8
++            };
++            const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y];
++
++            // Shift is set to one less than will actually occur as the scale
++            // and saturate step adds 1 and then shifts right again
++            scale = level_scale[qp6 & 7];
++//            shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3);
++            shift = log2_trafo_size - (qp6 >> 3);
++
++            if (shift < 0) {
++                scale <<= -shift;
++                shift = 0;
++            }
++        }
++
++        if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
++            const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ?
++                &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
++            const unsigned int matrix_id =
++                lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx;
++
++            scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
++            dc_scale = scale_matrix[0];
++            if (log2_trafo_size >= 4)
++                dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
++        }
++        else
++        {
++            static const uint8_t sixteen_scale[64] = {
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16,
++                16, 16, 16, 16, 16, 16, 16, 16
++            };
++            scale_matrix = sixteen_scale;
++            dc_scale = 16;
++        }
++    } else {
++        static const uint8_t unit_scale[64] = {
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++            1, 1, 1, 1, 1, 1, 1, 1,
++        };
++        scale_matrix = unit_scale;
++        shift        = 0;
++        scale        = 2;  // We will shift right to kill this
++        dc_scale     = 1;
++
++        may_hide_sign = 0;
++    }
++
++
++
++
++    if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
++        trans_skip_or_bypass) {
++        explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz);
++        if (explicit_rdpcm_flag) {
++            may_hide_sign = 0;
++            explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz);
++        }
++    }
++
++    last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size,
++                                           &last_significant_coeff_x, &last_significant_coeff_y);
++
++    if (last_significant_coeff_x > 3) {
++        int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x);
++        last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
++        (2 + (last_significant_coeff_x & 1)) +
++        suffix;
++    }
++
++    if (last_significant_coeff_y > 3) {
++        int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y);
++        last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
++        (2 + (last_significant_coeff_y & 1)) +
++        suffix;
++    }
++
++    if (scan_idx == SCAN_VERT)
++        FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
++
++    x_cg_last_sig = last_significant_coeff_x >> 2;
++    y_cg_last_sig = last_significant_coeff_y >> 2;
++
++    switch (scan_idx) {
++    case SCAN_DIAG: {
++        int last_x_c = last_significant_coeff_x & 3;
++        int last_y_c = last_significant_coeff_y & 3;
++
++        num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
++
++        switch (log2_trafo_size) {
++        case 2:
++            scan_x_cg = scan_1x1;
++            scan_y_cg = scan_1x1;
++            break;
++        case 3:
++            num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++            scan_x_cg = diag_scan2x2_x;
++            scan_y_cg = diag_scan2x2_y;
++            break;
++        case 4:
++            num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++            scan_x_cg = ff_hevc_rpi_diag_scan4x4_x;
++            scan_y_cg = ff_hevc_rpi_diag_scan4x4_y;
++            break;
++        case 5:
++        default:
++            num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
++            scan_x_cg = ff_hevc_rpi_diag_scan8x8_x;
++            scan_y_cg = ff_hevc_rpi_diag_scan8x8_y;
++            break;
++        }
++        break;
++    }
++    case SCAN_HORIZ:
++        scan_x_cg = horiz_scan2x2_x;
++        scan_y_cg = horiz_scan2x2_y;
++        num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
++        break;
++    default: //SCAN_VERT
++        scan_x_cg = horiz_scan2x2_y;
++        scan_y_cg = horiz_scan2x2_x;
++        num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
++        break;
++    }
++    num_coeff++;
++    num_last_subset = (num_coeff - 1) >> 4;
++
++    significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
++
++    {
++        const unsigned int ccount = 1 << (log2_trafo_size * 2);
++        const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */;  // These need special processing
++        use_vpu = 0;
++        use_dc = (num_coeff == 1) && !special &&
++            !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
++
++        if (use_dc) {
++            // Just need a little empty space
++            coeffs = dummy_coeffs;
++            // No need to clear
++        }
++        else
++        {
++            use_vpu = !special && log2_trafo_size >= 4;  // When set, none of the hevcdsp transforms further down are run in this function
++#if RPI_COMPRESS_COEFFS
++            use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed;
++#endif
++            coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
++#if RPI_COMPRESS_COEFFS
++            coeffs32 = (int*)coeffs;
++            if (!use_compress)
++#endif
++#if HAVE_NEON
++            rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
++#else
++            memset(coeffs, 0, ccount * sizeof(int16_t));
++#endif
++        }
++    }
++
++    i = num_last_subset;
++    do {
++        int implicit_non_zero_coeff = 0;
++        int n_end;
++
++        uint8_t significant_coeff_flag_idx[16];
++        unsigned int nb_significant_coeff_flag = 0;
++
++        if (i == num_last_subset) {
++            // First time through
++            int last_scan_pos = num_coeff - (i << 4) - 1;
++            n_end = last_scan_pos - 1;
++            significant_coeff_flag_idx[0] = last_scan_pos;
++            nb_significant_coeff_flag = 1;
++        } else {
++            n_end = 15;
++            implicit_non_zero_coeff = (i != 0);
++        }
++
++        if (n_end >= 0) {
++            static const uint8_t ctx_idx_maps_ts2[3][16] = {
++                D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++                H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
++                V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8)  // log2_trafo_size == 2
++            };
++            // N.B. prev_sig = Right * 2 + Down
++            static const uint8_t ctx_idx_maps[3][4][16] = {
++                {
++                    D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++                    D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++                    D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++                    D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
++                },
++                {
++                    H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++                    H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++                    H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++                    H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
++                },
++                {
++                    V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
++                    V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
++                    V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
++                    V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)  // prev_sig == 3, default
++                }
++            };
++            const uint8_t *ctx_idx_map_p;
++            int scf_offset = 0;
++
++            if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++                ctx_idx_map_p = ctx_idx_maps[0][3];
++                scf_offset = 40 + c_idx_nz;
++            } else {
++                if (c_idx_nz != 0)
++                    scf_offset = 27;
++
++                if (log2_trafo_size == 2) {
++                    ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
++                } else {
++                    ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
++                    if (!c_idx_nz) {
++                        if (i != 0)
++                            scf_offset += 3;
++
++                        if (log2_trafo_size == 3) {
++                            scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
++                        } else {
++                            scf_offset += 21;
++                        }
++                    } else {
++                        if (log2_trafo_size == 3)
++                            scf_offset += 9;
++                        else
++                            scf_offset += 12;
++                    }
++                }
++            }
++
++            if (n_end > 0) {
++                int cnt = get_sig_coeff_flag_idxs(&lc->cc,
++                    lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
++                    n_end, ctx_idx_map_p,
++                    significant_coeff_flag_idx + nb_significant_coeff_flag);
++
++                nb_significant_coeff_flag += cnt;
++                if (cnt != 0) {
++                    implicit_non_zero_coeff = 0;
++                }
++            }
++
++            if (implicit_non_zero_coeff == 0) {
++                if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
++                    scf_offset = 42 + c_idx_nz;
++                } else {
++                    if (i == 0) {
++                        scf_offset = c_idx_nz ? 27 : 0;
++                    } else {
++                        scf_offset = 2 + scf_offset;
++                    }
++                }
++                if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) {
++                    significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++                    nb_significant_coeff_flag++;
++                }
++            } else {
++                significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
++                nb_significant_coeff_flag++;
++            }
++        }
++#if RPI_COMPRESS_COEFFS
++        if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full!
++          int16_t temp[32*32];
++          const unsigned int ccount = 1 << (log2_trafo_size * 2);
++          lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0;
++          lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer
++          memcpy(temp, coeffs, sizeof(int)*num_nonzero);
++          coeffs32 = (int *)temp;
++          memset(coeffs, 0, ccount * sizeof(int16_t));
++          num_nonzero--;
++          while (num_nonzero >= 0) {
++            const unsigned int res = coeffs32[num_nonzero];  // Packed entry: value in the top 16 bits, coeff index in the low 16
++            const unsigned int offset = res & 0xffff;
++            coeffs[ offset ] = res >> 16;
++            num_nonzero--;
++          }
++          use_compress = 0;
++        }
++#endif            
++
++        if (nb_significant_coeff_flag != 0) {
++            const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
++                ((i != 0 && !c_idx_nz) ? 2 : 0) |
++                prev_subset_coded;
++            const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
++                (gt1_idx_delta << 2);
++            const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
++                gt1_idx_delta;
++
++            const unsigned int x_cg = scan_x_cg[i];
++            const unsigned int y_cg = scan_y_cg[i];
++            int16_t * const blk_coeffs = coeffs +
++                ((x_cg + (y_cg << log2_trafo_size)) << 2);
++            // This calculation is 'wrong' for log2_trafo_size == 2
++            // but that doesn't matter as in this case x_cg & y_cg
++            // are always 0 so result is correct (0) anyway
++            const uint8_t * const blk_scale = scale_matrix +
++                (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
++
++            // * The following code block doesn't deal with these flags:
++            //   (nor did the one it replaces)
++            //
++            // cabac_bypass_alignment_enabled_flag
++            //    This should be easy but I can't find a test case
++            // extended_precision_processing_flag
++            //    This can extend the required precision past 16bits
++            //    so is probably tricky - also no example found yet
++
++#if USE_N_END_1
++            if (nb_significant_coeff_flag == 1) {
++                // There is a small gain to be had from special casing the single
++                // transform coefficient case.  The reduction in complexity
++                // makes up for the code duplication.
++
++                int trans_coeff_level = 1;
++                int coeff_sign_flag;
++                int coded_val = 0;
++
++                // initialize first elem of coeff_abs_level_greater1_flag
++                prev_subset_coded = 0;
++
++                if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
++                    trans_coeff_level = 2;
++                    prev_subset_coded = 1;
++                    coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
++                }
++
++                // Probably not worth the overhead of starting by22 for just one value
++                coeff_sign_flag = get_cabac_bypass(&lc->cc);
++
++                if (coded_val)
++                {
++                    if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
++                        trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
++                    } else {
++                        uint8_t * const stat_coeff =
++                            lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++                        const unsigned int c_rice_param = *stat_coeff >> 2;
++                        const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
++
++                        trans_coeff_level = 3 + last_coeff_abs_level_remaining;
++                        update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++                    }
++                }
++
++                {
++                    const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
++                    const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
++                    const unsigned int scale_m = blk_scale[xy_off->scale];
++                    const int res = trans_scale_sat(
++                        (trans_coeff_level ^ k) - k,  // Apply sign
++                        scale,
++                        i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
++                        shift);
++#if RPI_COMPRESS_COEFFS                                
++                      if (use_compress)
++                        coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
++                      else
++#endif
++                      blk_coeffs[xy_off->coeff] = res;
++                }
++            }
++            else
++#endif
++            {
++                int sign_hidden = may_hide_sign;
++                int levels[16]; // Should be able to get away with int16_t but that fails some tests
++                uint32_t coeff_sign_flags;
++                uint32_t coded_vals = 0;
++                // Sum(abs(level[]))
++                // In fact we only need the bottom bit and in some future
++                // version that may be all we calculate
++                unsigned int sum_abs;
++
++                coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels,
++                    &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
++
++                if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
++                    sign_hidden = 0;
++
++                // -- Start bypass block
++
++                bypass_start(&lc->cc);
++
++                coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden);
++
++                if (coded_vals != 0)
++                {
++                    const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
++                    uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
++                        lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
++                    int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
++                    int * level = levels - 1;
++
++                    do {
++                        {
++                            const unsigned int z = hevc_clz32(coded_vals) + 1;
++                            level += z;
++                            coded_vals <<= z;
++                        }
++
++                        {
++                            const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param);
++                            const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
++
++                            sum_abs += last_coeff_abs_level_remaining + 1;
++                            *level = trans_coeff_level;
++
++                            if (stat_coeff != NULL)
++                                update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
++                            stat_coeff = NULL;
++
++                            if (trans_coeff_level > (3 << c_rice_param) &&
++                                (c_rice_param < 4 || rice_adaptation_enabled))
++                                ++c_rice_param;
++                        }
++                    } while (coded_vals != 0);
++                }
++
++                // sign_hidden = 0 or 1 so we can combine the tests
++                if ((sign_hidden & sum_abs) != 0) {
++                    levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
++                }
++
++                bypass_finish(&lc->cc);
++
++                // -- Finish bypass block
++
++                // Scale loop
++                {
++                    int m = nb_significant_coeff_flag - 1;
++
++                    // Deal with DC component (if any) first
++                    if (i == 0 && significant_coeff_flag_idx[m] == 0)
++                    {
++                        const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++                        const int res = trans_scale_sat(
++                            (levels[m] ^ k) - k, scale, dc_scale, shift);
++#if RPI_COMPRESS_COEFFS
++                        if (use_compress)
++                        {
++                            coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
++                        }
++                        else
++#endif
++                        {
++                            blk_coeffs[0] = res;
++                        }
++                        --m;
++                    }
++
++#if !USE_N_END_1
++                    // If N_END_1 set then m was at least 1 initially
++                    if (m >= 0)
++#endif
++                    {
++                        do {
++                            const xy_off_t * const xy_off = scan_xy_off +
++                                significant_coeff_flag_idx[m];
++                            const int k = (int32_t)(coeff_sign_flags << m) >> 31;
++                            const int res = trans_scale_sat(
++                                (levels[m] ^ k) - k,
++                                scale,
++                                blk_scale[xy_off->scale],
++                                shift);
++#if RPI_COMPRESS_COEFFS
++                            if (use_compress) {
++                              coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
++                            } else
++#endif
++                              blk_coeffs[xy_off->coeff] = res;
++                        } while (--m >= 0);
++                    }
++                }
++
++            }
++        }
++    } while ((i = next_subset(lc, i, c_idx_nz,
++                              significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
++             !cabac_overflow(&lc->cc));
++
++    if (lc->cu.cu_transquant_bypass_flag) {
++        if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++                                    (pred_mode_intra == 10 || pred_mode_intra == 26))) {
++            int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
++
++            s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++        }
++    } else {
++        if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
++            int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
++                      log2_trafo_size == 2 &&
++                      lc->cu.pred_mode == MODE_INTRA;
++            if (rot) {
++                for (i = 0; i < 8; i++)
++                    FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
++            }
++
++            s->hevcdsp.dequant(coeffs, log2_trafo_size);
++
++            if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
++                                        lc->cu.pred_mode == MODE_INTRA &&
++                                        (pred_mode_intra == 10 || pred_mode_intra == 26))) {
++                int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
++
++                s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
++            }
++        } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
++            s->hevcdsp.transform_4x4_luma(coeffs);
++        }
++        else if (!use_vpu)
++        {
++            int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
++            if (max_xy == 0)
++            {
++                if (use_dc)
++                    rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
++                else
++                    s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
++            }
++            else {
++                int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
++                if (max_xy < 4)
++                    col_limit = FFMIN(4, col_limit);
++                else if (max_xy < 8)
++                    col_limit = FFMIN(8, col_limit);
++                else if (max_xy < 12)
++                    col_limit = FFMIN(24, col_limit);
++                s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
++            }
++        }
++    }
++
++#if 0
++    // Mildly rotted - we support no mode where cross is valid
++    if (lc->tu.cross_pf) {
++        int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
++        const int ccount = 1 << (log2_trafo_size * 2);
++
++        for (i = 0; i < ccount; i++) {
++            coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
++        }
++    }
++#endif
++
++    if (!use_dc) {
++#if RPI_COMPRESS_COEFFS                                
++        if (use_compress) {
++          coeffs32[num_nonzero] = 0;  // Zero entry after the last packed coeff (presumably a list terminator - confirm with the consumer)
++        }
++#endif      
++        rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
++    }
++}
++
++#if !USE_BY22
++// Decodes one MVD (x flags/values before y) and returns it packed with MV_XY(); this function itself writes nothing to lc
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
++{
++    int x = abs_mvd_greater0_flag_decode(lc);
++    int y = abs_mvd_greater0_flag_decode(lc);
++
++    if (x)
++        x += abs_mvd_greater1_flag_decode(lc);
++    if (y)
++        y += abs_mvd_greater1_flag_decode(lc);
++
++    switch (x) {
++    case 2: x = mvd_decode(lc);           break;
++    case 1: x = mvd_sign_flag_decode(lc); break;
++    case 0: x = 0;                       break;
++    }
++
++    switch (y) {
++    case 2: y = mvd_decode(lc);           break;
++    case 1: y = mvd_sign_flag_decode(lc); break;
++    case 0: y = 0;                       break;
++    }
++    return MV_XY(x,y);
++}
++#else // USE_BY22
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
++{
++    int x = abs_mvd_greater0_flag_decode(lc);
++    int y = abs_mvd_greater0_flag_decode(lc);
++
++    if ((x | y) == 0)
++        return 0;  // Neither greater0 flag set: nothing more to decode
++
++    if (x != 0)
++        x += abs_mvd_greater1_flag_decode(lc);
++    if (y != 0)
++        y += abs_mvd_greater1_flag_decode(lc);
++
++    if ((x | y) == 1)
++    {
++        // Not worth starting BY22
++        if (x != 0)
++            x = mvd_sign_flag_decode(lc);
++        if (y != 0)
++            y = mvd_sign_flag_decode(lc);
++    }
++    else
++    {
++        CABACContext * const cc = &lc->cc;
++        uint32_t val;
++        uint32_t b;
++        unsigned int n = 0;
++
++        bypass_start(cc);
++        b = val = get_cabac_by22_peek(cc);
++
++        if (x == 1) {
++            x = ((int32_t)b >> 31) | 1;
++            n = 1;
++            b <<= 1;
++        }
++        else if (x == 2) {
++            // EG1 so we have (leading one bits + 1) of suffix
++            // This makes prefix & suffix lengths the same
++            const unsigned int k = hevc_clz32(~b) + 1;
++            int s;
++
++            av_assert2(k <= 15);
++
++            b <<= k;
++            n = 2 * k + 1; // Includes suffix & sign
++
++            // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked
++            // if we are going to do this without a flush
++            if (k > CABAC_BY22_PEEK_BITS / 2 - 1)
++            {
++                // Need too many bits - flush
++                // Consume the k prefix bits; suffix + sign (k + 1 bits) then come from the fresh peek
++                get_cabac_by22_flush(cc, k, val);
++                b = val = get_cabac_by22_peek(cc);
++                n = k + 1;
++            }
++
++            x = (b >> (32 - k)) + (1 << k);
++            b <<= k;
++            s = (int32_t)b >> 31;
++            x = (x ^ s) - s;
++            b <<= 1;
++
++            // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
++            if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15)
++            {
++                get_cabac_by22_flush(cc, n, val);
++                b = val = get_cabac_by22_peek(cc);
++                n = 0;
++            }
++        }
++
++        if (y == 1) {
++            y = ((int32_t)b >> 31) | 1;
++            ++n;
++            // don't care about b anymore
++        }
++        else if (y == 2) {
++            const unsigned int k = hevc_clz32(~b) + 1;
++            int s;
++
++            av_assert2(k <= 15);
++
++            // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked
++            // if we are going to do this without a flush
++            b <<= k;
++            n += 2 * k + 1;
++
++            if (n > CABAC_BY22_PEEK_BITS)
++            {
++                // Need too many bits - flush
++                get_cabac_by22_flush(cc, n - (k + 1), val);
++                b = val = get_cabac_by22_peek(cc);
++                n = k + 1;
++            }
++
++            y = (b >> (32 - k)) + (1 << k);
++            s = (int32_t)(b << k) >> 31;
++            y = (y ^ s) - s;
++            // don't care about b anymore
++        }
++
++        get_cabac_by22_flush(cc, n, val);
++        bypass_finish(cc);
++    }
++
++    return MV_XY(x, y);
++}
++#endif // USE_BY22
+diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h
+new file mode 100644
+index 0000000000..ca191f00d9
+--- /dev/null
++++ b/libavcodec/rpi_hevc_cabac_fns.h
+@@ -0,0 +1,217 @@
++/*
++ * HEVC CABAC decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2018 John Cox
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H
++#define AVCODEC_RPI_HEVC_CABAC_FNS_H
++
++#include "config.h"
++#include "rpi_hevcdec.h"
++
++void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
++int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
++int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
++int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
++int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
++
++//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
++void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                const int x0, const int y0,
++                                const int log2_trafo_size, const enum ScanType scan_idx,
++                                const int c_idx);
++
++MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
++int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
++
++#define HEVC_BIN_SAO_MERGE_FLAG                         0   // HEVC_BIN_*: indices into lc->cabac_state[] used by the decoders in this header; several names share one value
++#define HEVC_BIN_SAO_TYPE_IDX                           1
++#define HEVC_BIN_SAO_EO_CLASS                           2
++#define HEVC_BIN_SAO_BAND_POSITION                      2
++#define HEVC_BIN_SAO_OFFSET_ABS                         2
++#define HEVC_BIN_SAO_OFFSET_SIGN                        2
++#define HEVC_BIN_END_OF_SLICE_FLAG                      2
++#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG                 2
++#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG              5
++#define HEVC_BIN_SKIP_FLAG                              6
++#define HEVC_BIN_CU_QP_DELTA                            9
++#define HEVC_BIN_PRED_MODE                              12
++#define HEVC_BIN_PART_MODE                              13
++#define HEVC_BIN_PCM_FLAG                               17
++#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE              17
++#define HEVC_BIN_MPM_IDX                                18
++#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE               18
++#define HEVC_BIN_INTRA_CHROMA_PRED_MODE                 18
++#define HEVC_BIN_MERGE_FLAG                             20
++#define HEVC_BIN_MERGE_IDX                              21
++#define HEVC_BIN_INTER_PRED_IDC                         22
++#define HEVC_BIN_REF_IDX_L0                             27
++#define HEVC_BIN_REF_IDX_L1                             29
++#define HEVC_BIN_ABS_MVD_GREATER0_FLAG                  31
++#define HEVC_BIN_ABS_MVD_GREATER1_FLAG                  33
++#define HEVC_BIN_ABS_MVD_MINUS2                         35
++#define HEVC_BIN_MVD_SIGN_FLAG                          35
++#define HEVC_BIN_MVP_LX_FLAG                            35
++#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG                  36
++#define HEVC_BIN_SPLIT_TRANSFORM_FLAG                   37
++#define HEVC_BIN_CBF_LUMA                               40
++#define HEVC_BIN_CBF_CB_CR                              42
++#define HEVC_BIN_TRANSFORM_SKIP_FLAG                    46
++#define HEVC_BIN_EXPLICIT_RDPCM_FLAG                    48
++#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG                50
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX        52
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX        70
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX        88
++#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX        88
++#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG           88
++#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG                 92
++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG          136
++#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG          160
++#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING              166
++#define HEVC_BIN_COEFF_SIGN_FLAG                        166
++#define HEVC_BIN_LOG2_RES_SCALE_ABS                     166
++#define HEVC_BIN_RES_SCALE_SIGN_FLAG                    174
++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG               176
++#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX                177
++
++
++int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state);
++int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c);
++
++static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) {  // Skips n bytes of input and re-initialises the decoder after them; returns the start of the skipped bytes, or NULL on failure
++    const uint8_t *ptr = c->bytestream;  // Start from the decoder's current read pointer
++
++    if (c->low & 0x1)  // Bit 0 of low set: step back one byte (presumably a byte fetched but not yet consumed - TODO confirm)
++        ptr--;
++#if CABAC_BITS == 16
++    if (c->low & 0x1FF)  // Only when CABAC_BITS == 16: step back one more byte if any of the low 9 bits is set
++        ptr--;
++#endif
++    if ((int) (c->bytestream_end - ptr) < n)  // Fewer than n bytes left after ptr
++        return NULL;
++    if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)  // Restart decoding at ptr + n over the remaining bytes
++        return NULL;
++
++    return ptr;  // First of the n skipped bytes
++}
++
++static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_SAO_MERGE_FLAG context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_SAO_MERGE_FLAG]);
++}
++
++static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG]);
++}
++
++static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG]);
++}
++
++static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                                            const unsigned int ct_depth,
++                                                            const unsigned int x0, const unsigned int y0)
++{
++    // Context increment (0..2): +1 for each of the left / up stash entries whose value >> 1 exceeds ct_depth
++    const int ctx_inc = ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) + ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth);
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_SPLIT_CODING_UNIT_FLAG + ctx_inc]);
++}
++
++static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                             const int x0, const int y0, const int x_cb, const int y_cb)
++{
++    // Context increment (0..2): bit 0 of the left stash entry plus bit 0 of the up stash entry; x_cb / y_cb are not used
++    const int ctx_inc = (s->cabac_stash_left[y0 >> 3] & 1) + (s->cabac_stash_up[x0 >> 3] & 1);
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_SKIP_FLAG + ctx_inc]);
++}
++
++static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_PRED_MODE context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_PRED_MODE]);
++}
++
++static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc)
++{   // Unlike the neighbouring wrappers this goes through ff_hevc_rpi_get_cabac_terminate() and uses no entry of lc->cabac_state
++    return ff_hevc_rpi_get_cabac_terminate(&lc->cc);
++}
++
++static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE]);
++}
++
++static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_MERGE_FLAG context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_MERGE_FLAG]);
++}
++
++static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_MVP_LX_FLAG context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_MVP_LX_FLAG]);
++}
++
++static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
++{   // Result of ff_hevc_rpi_get_cabac() on the HEVC_BIN_NO_RESIDUAL_DATA_FLAG context state
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_NO_RESIDUAL_DATA_FLAG]);
++}
++
++static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{   // Context state is selected by transform depth: HEVC_BIN_CBF_CB_CR + trafo_depth
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_CBF_CB_CR + trafo_depth]);
++}
++
++static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
++{   // Two context states: HEVC_BIN_CBF_LUMA + 1 at depth 0, HEVC_BIN_CBF_LUMA at any other depth
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_CBF_LUMA + (trafo_depth == 0)]);
++}
++
++static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
++{   // Context state is selected by transform size: offset 5 - log2_trafo_size from HEVC_BIN_SPLIT_TRANSFORM_FLAG
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size]);
++}
++
++static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
++{   // Context state is selected by the caller-supplied idx: HEVC_BIN_RES_SCALE_SIGN_FLAG + idx
++    return ff_hevc_rpi_get_cabac(&lc->cc, &lc->cabac_state[HEVC_BIN_RES_SCALE_SIGN_FLAG + idx]);
++}
++
++
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c
+new file mode 100644
+index 0000000000..341bb77d9d
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.c
+@@ -0,0 +1,75 @@
++/*
++ * HEVC shared tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include <stdint.h>
++
++#include "rpi_hevc_data.h"
++
++const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {  // x coordinate of each of the 16 positions of a 4x4 block in diagonal scan order; pairs with ff_hevc_rpi_diag_scan4x4_y
++    0, 0, 1, 0,
++    1, 2, 0, 1,
++    2, 3, 1, 2,
++    3, 2, 3, 3,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {  // y coordinate of each of the 16 positions of a 4x4 block in diagonal scan order; pairs with ff_hevc_rpi_diag_scan4x4_x
++    0, 1, 0, 2,
++    1, 0, 3, 2,
++    1, 0, 3, 2,
++    1, 3, 2, 3,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {  // x coordinate of each of the 64 positions of an 8x8 block in diagonal scan order; pairs with ff_hevc_rpi_diag_scan8x8_y
++    0, 0, 1, 0,
++    1, 2, 0, 1,
++    2, 3, 0, 1,
++    2, 3, 4, 0,
++    1, 2, 3, 4,
++    5, 0, 1, 2,
++    3, 4, 5, 6,
++    0, 1, 2, 3,
++    4, 5, 6, 7,
++    1, 2, 3, 4,
++    5, 6, 7, 2,
++    3, 4, 5, 6,
++    7, 3, 4, 5,
++    6, 7, 4, 5,
++    6, 7, 5, 6,
++    7, 6, 7, 7,
++};
++
++const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {  // y coordinate of each of the 64 positions of an 8x8 block in diagonal scan order; pairs with ff_hevc_rpi_diag_scan8x8_x
++    0, 1, 0, 2,
++    1, 0, 3, 2,
++    1, 0, 4, 3,
++    2, 1, 0, 5,
++    4, 3, 2, 1,
++    0, 6, 5, 4,
++    3, 2, 1, 0,
++    7, 6, 5, 4,
++    3, 2, 1, 0,
++    7, 6, 5, 4,
++    3, 2, 1, 7,
++    6, 5, 4, 3,
++    2, 7, 6, 5,
++    4, 3, 7, 6,
++    5, 4, 7, 6,
++    5, 7, 6, 7,
++};
+diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h
+new file mode 100644
+index 0000000000..0aee673d8b
+--- /dev/null
++++ b/libavcodec/rpi_hevc_data.h
+@@ -0,0 +1,31 @@
++/*
++ * HEVC shared data tables
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_DATA_H
++#define AVCODEC_RPI_HEVC_DATA_H
++
++#include <stdint.h>
++
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
++extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
++extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
++
++#endif /* AVCODEC_RPI_HEVC_DATA_H */
+diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
+new file mode 100644
+index 0000000000..5125d1eb6b
+--- /dev/null
++++ b/libavcodec/rpi_hevc_filter.c
+@@ -0,0 +1,1210 @@
++/*
++ * HEVC video decoder
++ *
++ * Originally by:
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Seppo Tomperi
++ * Copyright (C) 2013 Wassim Hamidouche
++ *
++ * Substantially rewritten:
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++//#define DISABLE_SAO
++//#define DISABLE_DEBLOCK
++//#define DISABLE_STRENGTHS
++// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
++//#define DISABLE_DEBLOCK_NONREF
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++
++#include "rpi_hevcdec.h"
++
++#include "bit_depth_template.c"
++
++#include "rpi_qpu.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#define LUMA 0
++#define CB 1
++#define CR 2
++
++// tcoffset: -12,12; qp: 0,51; (bs-1)*2: 0,2
++// so -12,65 overall
++static const uint8_t tctablex[] = {
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  // -ve quant padding (4 rows of 12 = 48 entries, the 6*8 term in tctable)
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
++
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,                          // -12..-1
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 1, // QP  0...18
++    1, 1, 1, 1, 1, 1, 1,  1,  2,  2,  2,  2,  3,  3,  3,  3, 4, 4, 4, // QP 19...37
++    5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24,          // QP 38...53
++    24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24                    // 54..65 (12 saturated entries)
++};
++#define tctable (tctablex + 12 + 6*8)  // tctable[0] is the QP 0 entry; valid indices run from -60 to 65
++
++static const uint8_t betatablex[] = {
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,  // -ve quant padding (4 rows of 12 = 48 entries, the 6*8 term in betatable)
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,
++
++    0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  0,                          // -12..-1
++     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  7,  8, // QP 0...18
++     9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37
++    38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64,                      // QP 38...51
++    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64                    // 52..63 (12 saturated entries)
++};
++#define betatable (betatablex + 12 + 6*8)  // betatable[0] is the QP 0 entry; valid indices run from -60 to 63
++
++static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y, const int c_idx, const int tc_offset)
++{
++    const int qp_c = (int)s->ps.pps->qp_dblk_x[c_idx][qp_y];  // Plane-specific QP looked up from the PPS qp_dblk_x table
++    return tctable[qp_c + tc_offset + 2];                     // tc entry for that QP, shifted by tc_offset and a fixed +2
++}
++
++static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int xBase, const unsigned int yBase)
++{
++    // Round (xBase, yBase) down to a multiple of 1 << log2_min_cu_qp_delta_size
++    const unsigned int qg_mask  = ~0U << s->ps.pps->log2_min_cu_qp_delta_size;
++    const unsigned int ctb_mask = (1 << s->ps.sps->log2_ctb_size) - 1;
++    const unsigned int xQg = xBase & qg_mask, yQg = yBase & qg_mask;
++    // The same origin in min-CB units, which is how qp_y_tab is indexed
++    const unsigned int xcb = xQg >> s->ps.sps->log2_min_cb_size;
++    const unsigned int ycb = yQg >> s->ps.sps->log2_min_cb_size;
++    const unsigned int stride = s->ps.sps->min_cb_width;
++    int qp_left = lc->qPy_pred, qp_up = lc->qPy_pred;  // Used as-is when the origin sits on the CTB's left / top edge
++
++    if ((xQg & ctb_mask) != 0) qp_left = s->qp_y_tab[(xcb - 1) + ycb * stride];
++    if ((yQg & ctb_mask) != 0) qp_up   = s->qp_y_tab[xcb + (ycb - 1) * stride];
++    return (qp_left + qp_up + 1) >> 1;  // Rounded mean of the two predictors
++}
++
++// * Only called from bitstream decode in foreground
++//   so should be safe
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase)
++{
++    const int qp_pred = get_qPy_pred(s, lc, xBase, yBase);
++
++    if (lc->tu.cu_qp_delta == 0) {
++        lc->qp_y = qp_pred;  // No delta: the predicted QP is stored unchanged
++    } else {
++        // ?? I suspect that the -bd_offset here leads to us adding it elsewhere
++        const int bd_off = s->ps.sps->qp_bd_offset;  // Result below is wrapped into [-bd_off, 51]
++        lc->qp_y = FFUMOD(qp_pred + lc->tu.cu_qp_delta + 52 + 2 * bd_off, 52 + bd_off) - bd_off;
++    }
++}
++
++static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
++{   // The SPS pixel_shift for plane 0, one more than that for any other plane
++    return s->ps.sps->pixel_shift + (c_idx != 0 ? 1 : 0);
++}
++
++// "DSP" these?
++static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
++{
++    // Copies a single pel from src to dst: 4 bytes when pixel_shift is 2,
++    // 2 bytes when it is 1 and one byte for every other value.
++    // The wide copies go through uint32_t / uint16_t pointer casts.
++    if (pixel_shift == 2) {
++        *(uint32_t *)dst = *(uint32_t *)src;
++    }
++    else if (pixel_shift == 1) {
++        *(uint16_t *)dst = *(uint16_t *)src;
++    }
++    else {
++        *dst = *src;
++    }
++}
++
++static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,  // Saves the four border lines of one CTB plane into the SAO edge buffers
++                           ptrdiff_t stride_src, int x, int y, int width, int height,
++                           int c_idx, int x_ctb, int y_ctb)
++{
++    const unsigned int sh = pixel_shift(s, c_idx);  // log2 of bytes per pel for this plane
++    const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx);  // Plane width: the stride (in pels) of each saved row
++    const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx);  // Plane height: the stride (in pels) of each saved column
++
++    /* copy horizontal edges */
++    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),  // Slot 2 * y_ctb: first row of the CTB
++        src, width << sh);
++    memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),  // Slot 2 * y_ctb + 1: last row of the CTB
++        src + stride_src * (height - 1), width << sh);
++
++    /* copy vertical edges */
++    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);  // Slot 2 * x_ctb: first column
++
++    ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);  // Slot 2 * x_ctb + 1: last column
++}
++
++// N.B. Src & dst are swapped as this is a restore!
++// x0 & y0 are in luma coords
++// Width & height are in Y/C pels as appropriate
++// * Clear scope for optimisation here but not used enough to be worth it
++static void restore_tqb_pixels(const HEVCRpiContext * const s,
++                               uint8_t *src1, const uint8_t *dst1,
++                               const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int width, const int height,
++                               const int c_idx)
++{
++    if (s->ps.pps->transquant_bypass_enable_flag ||  // Does nothing unless one of these two stream flags is set
++        s->ps.sps->pcm.loop_filter_disable_flag)
++    {
++        const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width;  // is_pcm byte for (x0, y0): one byte per 64 luma pels across, one row per 8 luma lines
++        int blks_y = height >> (c_idx == 0 ? 3 : 2);  // Number of block rows: 8 lines each for plane 0, 4 lines otherwise
++        const unsigned int bwidth = 8 << s->ps.sps->pixel_shift;  // Y & C have the same width in sand
++        const unsigned int bheight = (c_idx == 0) ? 8 : 4;
++        const unsigned int sh = ((x0 >> 3) & 7);  // Bit index of x0's 8-pel block within its is_pcm byte
++        const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1;  // One bit per block across the requested width
++
++        do {
++            unsigned int m = (*pcm >> sh) & mask;  // Flag bits for the blocks of this block row
++            uint8_t * bd = src1;
++            const uint8_t * bs = dst1;
++            while (m != 0) {  // Stops as soon as no flagged blocks remain in the row
++                if ((m & 1) != 0) {
++                    s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight);  // Flagged block: copied from the dst1 side into the src1 side
++                }
++                m >>= 1;
++                bs += bwidth;
++                bd += bwidth;
++            }
++            src1 += stride_src * bheight;
++            dst1 += stride_dst * bheight;
++            pcm += s->ps.sps->pcm_width;  // Next row of is_pcm flags
++        } while (--blks_y > 0);
++    }
++}
++
++#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)])  // Entry (x, y) of a table holding ctb_width entries per row; needs a variable named s in scope where it is expanded
++
++static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y)
++{
++#if SAO_FILTER_N == 5
++    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++#elif SAO_FILTER_N == 6
++    static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
++#else
++#error Confused by size of sao fn array
++#endif
++    int c_idx;
++    int edges[4];  // 0 left 1 top 2 right 3 bottom
++    int x_ctb                = x >> s->ps.sps->log2_ctb_size;
++    int y_ctb                = y >> s->ps.sps->log2_ctb_size;
++    int ctb_addr_rs          = y_ctb * s->ps.sps->ctb_width + x_ctb;
++    int ctb_addr_ts          = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
++    RpiSAOParams *sao           = &CTB(s->sao, x_ctb, y_ctb);
++    // flags indicating unfilterable edges
++    uint8_t vert_edge[]      = { 0, 0 };
++    uint8_t horiz_edge[]     = { 0, 0 };
++    uint8_t diag_edge[]      = { 0, 0, 0, 0 };
++    uint8_t lfase            = CTB(s->filter_slice_edges, x_ctb, y_ctb);
++    uint8_t no_tile_filter   = s->ps.pps->tiles_enabled_flag &&
++                               !s->ps.pps->loop_filter_across_tiles_enabled_flag;
++    uint8_t restore          = no_tile_filter || !lfase;
++    uint8_t left_tile_edge   = 0;
++    uint8_t right_tile_edge  = 0;
++    uint8_t up_tile_edge     = 0;
++    uint8_t bottom_tile_edge = 0;
++    const int sliced = 1;
++    const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1);
++
++    edges[0]   = x_ctb == 0;
++    edges[1]   = y_ctb == 0;
++    edges[2]   = x_ctb == s->ps.sps->ctb_width  - 1;
++    edges[3]   = y_ctb == s->ps.sps->ctb_height - 1;
++
++#ifdef DISABLE_SAO
++    return;
++#endif
++
++    if (restore) {
++        if (!edges[0]) {
++            left_tile_edge  = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
++            vert_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
++        }
++        if (!edges[2]) {
++            right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
++            vert_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
++        }
++        if (!edges[1]) {
++            up_tile_edge     = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
++            horiz_edge[0]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
++        }
++        if (!edges[3]) {
++            bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
++            horiz_edge[1]    = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
++        }
++        if (!edges[0] && !edges[1]) {
++            diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
++        }
++        if (!edges[1] && !edges[2]) {
++            diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
++        }
++        if (!edges[2] && !edges[3]) {
++            diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
++        }
++        if (!edges[0] && !edges[3]) {
++            diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
++        }
++    }
++
++    for (c_idx = 0; c_idx < plane_count; c_idx++) {
++        const unsigned int vshift = ctx_vshift(s, c_idx);
++        const unsigned int hshift = ctx_hshift(s, c_idx);
++        const int x0 = x >> hshift;
++        const int y0 = y >> vshift;
++        const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx);
++        const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift;
++        const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift;
++        const int width    = FFMIN(ctb_size_h, (s->ps.sps->width  >> hshift) - x0);
++        const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0);
++        int tab      = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
++        ptrdiff_t stride_dst;
++        uint8_t *dst;
++
++        const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
++        const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
++        uint8_t * const src = !sliced ?
++                &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
++            c_idx == 0 ?
++                av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
++                av_rpi_sand_frame_pos_c(s->frame, x0, y0);
++        const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
++            !sliced ? src - (1 << sh) :
++            c_idx == 0 ?
++                av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
++                av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
++        const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
++            !sliced ? src + (width << sh) :
++            c_idx == 0 ?
++                av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
++                av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
++
++        if (sliced && c_idx > 1) {
++            break;
++        }
++
++//        if (c_idx == 1)
++//            printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr);
++
++        switch (sao->type_idx[c_idx]) {
++        case SAO_BAND:
++            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++                           x_ctb, y_ctb);
++            if (s->ps.pps->transquant_bypass_enable_flag ||
++                s->ps.sps->pcm.loop_filter_disable_flag)
++            {
++                // Can't use the edge buffer here as it may be in use by the foreground
++                DECLARE_ALIGNED(64, uint8_t, dstbuf)
++                    [2*MAX_PB_SIZE*MAX_PB_SIZE];
++                dst = dstbuf;
++                stride_dst = 2*MAX_PB_SIZE;
++                s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
++                if (sliced && c_idx != 0)
++                {
++                    s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
++                                                    sao->offset_val[1], sao->band_position[1],
++                                                    sao->offset_val[2], sao->band_position[2],
++                                                    width, height);
++                }
++                else
++                {
++                    s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
++                                                    width, height);
++                }
++                restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++                                   x, y, width, height, c_idx);
++            } else {
++                if (sliced && c_idx != 0)
++                {
++                    s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
++                                                    sao->offset_val[1], sao->band_position[1],
++                                                    sao->offset_val[2], sao->band_position[2],
++                                                    width, height);
++                }
++                else
++                {
++                    s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
++                                                    sao->offset_val[c_idx], sao->band_position[c_idx],
++                                                    width, height);
++                }
++            }
++            sao->type_idx[c_idx] = SAO_APPLIED;
++            break;
++        case SAO_EDGE:
++        {
++            const int w = s->ps.sps->width >> hshift;
++            const int h = s->ps.sps->height >> vshift;
++            int top_edge = edges[1];
++            int bottom_edge = edges[3];
++            // Can't use the edge buffer here as it may be in use by the foreground
++            DECLARE_ALIGNED(64, uint8_t, dstbuf)
++                [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64];
++
++            stride_dst = RPI_HEVC_SAO_BUF_STRIDE;
++            dst = dstbuf + stride_dst + 32;
++
++            if (!top_edge) {
++                uint8_t *dst1;
++                int src_idx;
++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
++
++                dst1 = dst - stride_dst;
++
++                if (src_l != NULL) {
++                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
++                               SAO_APPLIED);
++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
++                }
++
++                src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
++                           SAO_APPLIED);
++                memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
++
++                if (src_r != NULL) {
++                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
++                               SAO_APPLIED);
++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
++                }
++            }
++            if (!bottom_edge) {
++                uint8_t * const dst1 = dst + height * stride_dst;
++                int src_idx;
++                const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
++                const unsigned int hoff = height * stride_src;
++
++                if (src_l != NULL) {
++                    src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
++                               SAO_APPLIED);
++                    copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
++                }
++
++                src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
++                           SAO_APPLIED);
++                memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
++
++                if (src_r != NULL) {
++                    src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
++                               SAO_APPLIED);
++                    copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
++                }
++            }
++            if (src_l != NULL) {
++                if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
++                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
++                              sh, height, stride_dst, 1 << sh);
++                } else {
++                    ff_hevc_rpi_copy_vert(dst - (1 << sh),
++                              src_l,
++                              sh, height, stride_dst, stride_src);
++                }
++            }
++            if (src_r != NULL) {
++                if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
++                    ff_hevc_rpi_copy_vert(dst + (width << sh),
++                              s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
++                              sh, height, stride_dst, 1 << sh);
++                } else {
++                    ff_hevc_rpi_copy_vert(dst + (width << sh),
++                              src_r,
++                              sh, height, stride_dst, stride_src);
++                }
++            }
++
++            s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
++
++            copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
++                           x_ctb, y_ctb);
++            if (sliced && c_idx != 0)
++            {
++                // Class always the same for both U & V (which is just as well :-))
++                s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
++                                                sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
++                                                width, height);
++                s->hevcdsp.sao_edge_restore_c[restore](src, dst,
++                                                    stride_src, stride_dst,
++                                                    sao,
++                                                    edges, width,
++                                                    height, c_idx,
++                                                    vert_edge,
++                                                    horiz_edge,
++                                                    diag_edge);
++            }
++            else
++            {
++                s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
++                                                sao->eo_class[c_idx], width, height);
++                s->hevcdsp.sao_edge_restore[restore](src, dst,
++                                                    stride_src, stride_dst,
++                                                    sao,
++                                                    edges, width,
++                                                    height, c_idx,
++                                                    vert_edge,
++                                                    horiz_edge,
++                                                    diag_edge);
++            }
++            restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
++                               x, y, width, height, c_idx);
++            sao->type_idx[c_idx] = SAO_APPLIED;
++            break;
++        }
++        }
++    }
++
++#if RPI_ZC_SAND_8_IN_10_BUF
++    if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
++        (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
++    {
++        const unsigned int stride1 = frame_stride1(s->frame, 1);
++        const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
++        const unsigned int xoff = (x >> 8) * stride2 * stride1;
++        const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++        const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
++        uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
++        const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
++        uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
++        const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
++        const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
++
++//        printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
++        av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
++        av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
++    }
++#endif
++}
++
++// When bits are delivered to deblock we want them
++//#define TL 1
++//#define TR 2
++//#define BL 4
++//#define BR 8
++
++// pcm4 returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
++// so we need to rearrange before passing on.  Each byte is widened to uint32_t
++// before shifting: a promoted int shifted left by 24 overflows when bit 7 is set.
++static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{
++    const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;  // 1 bit per 8 pels in x, 1 row per 8 in y
++    return ((uint32_t)pcm[0] |
++        ((uint32_t)pcm[1] << 8) |
++        ((uint32_t)pcm[s->ps.sps->pcm_width] << 16) |
++        ((uint32_t)pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7);
++}
++
++static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{
++    const uint8_t * const row = s->is_pcm + (y >> 3) * s->ps.sps->pcm_width + (x >> 6);  // byte holding the bit selected by x
++    return (row[0] | (row[1] << 8)) >> ((x >> 3) & 7);  // 16-bit little-endian window, shifted so that bit lands in b0
++}
++
++// Pointer to the 32-bit word of bs addressed by (x, y); const is cast away so this works for both get and set
++static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
++{
++    return (uint32_t *)(bs +
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#warning Unexpected masks
++        // Word-aligned byte offset of x within stride1 - as it happens stride1 = sizeof(uint32_t) so this expr vanishes
++        ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
++            (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++        ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +  // y term: (y >> Y_SHR) rows of (1 << STRIDE1_BYTE_SHIFT) bytes
++        (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);                 // x term: stride2 bytes per (1 << STRIDE1_PEL_SHIFT) pels
++}
++
++static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
++{   // Pointer to the single byte of bs addressed by (x, y); the cast drops const so the result is writable
++    return (uint8_t *)(bs +
++        ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &                                // byte offset of x ...
++            (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) + // ... kept within stride1
++        ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +           // y term: (y >> Y_SHR) rows of (1 << STRIDE1_BYTE_SHIFT) bytes
++        (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);                         // x term: stride2 bytes per (1 << STRIDE1_PEL_SHIFT) pels
++}
++
++
++// Get block strength bits for pels [xl, xr) on row y; returns 0 if the range is empty
++// Given how we call we will always stay within the 32-bit boundaries
++static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
++                                unsigned int xl, unsigned int xr, const unsigned int y)
++{
++    if (xr <= xl) {
++        return 0;
++    }
++    else
++    {
++#if HAVE_ARMV6T2_INLINE
++#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
++#error This case not yet handled in bs_get32
++#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
++#error Stride1 < return size
++#endif
++        uint32_t tmp;
++        __asm__ (
++            "lsr         %[tmp], %[xl], %[xl_shift]                  \n\t"
++            "rsb         %[xr], %[xl], %[xr]                         \n\t"
++            "mla         %[stride2], %[stride2], %[tmp], %[bs]       \n\t"
++            "add         %[xr], %[xr], #7                            \n\t"
++            "lsr         %[bs], %[y], %[y_shift1]                    \n\t"
++            "bic         %[xr], %[xr], #7                            \n\t"
++            "ubfx        %[xl], %[xl], #1, #5                        \n\t"
++            "lsr         %[xr], %[xr], #1                            \n\t"
++            "cmp         %[xr], #32                                  \n\t"
++            "mvn         %[tmp], #0                                  \n\t"
++            "ldr         %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t"
++            "lsl         %[tmp], %[tmp], %[xr]                       \n\t"
++            "lsr         %[xl], %[bs], %[xl]                         \n\t"
++            "it ne                                                   \n\t"
++            "bicne       %[bs], %[xl], %[tmp]                        \n\t"
++            :  // Outputs
++                      [bs]"+r"(bs),
++                 [stride2]"+r"(stride2),
++                      [xl]"+r"(xl),
++                      [xr]"+r"(xr),
++                     [tmp]"=&r"(tmp)
++            :  // Inputs
++                       [y]"r"(y),
++                [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
++                [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
++                [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++            :  // Clobbers
++                "cc"
++        );
++        return (uint32_t) bs;  // the asm leaves the result in the bs operand
++#else
++        const uint32_t a = *bs_ptr32(bs, stride2, xl, y);  // whole 32-bit word addressed by (xl, y)
++        const unsigned int n = ((xr - xl + 7) & ~7) >> 1;  // bits wanted: span rounded up to 8 pels, 4 bits per 8 pels
++
++        return n == 32 ? a :
++            (a >> ((xl >> 1) & 31)) & ~(~0U << n);  // shift xl's bits down to b0, keep the low n
++#endif
++    }
++}
++
++static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++    av_assert2((xl >> s->ps.sps->log2_ctb_size) == ((xr - 1) >> s->ps.sps->log2_ctb_size));  // xl and xr - 1 must share a CTB column
++    return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);  // read from the horizontal bs array
++}
++
++static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
++{
++    av_assert2((xl >> s->ps.sps->log2_ctb_size) == ((xr - 1) >> s->ps.sps->log2_ctb_size));  // xl and xr - 1 must share a CTB column
++    return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);  // read from the vertical bs array
++}
++
++// Luma deblock of bounds in CTB-wide steps: per 8-line row, V edges (hevc_v_loop_filter_luma2) then H edges (hevc_h_loop_filter_luma2)
++static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
++{
++    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
++    const unsigned int ctb_size = (1 << log2_ctb_size);
++    const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 :  1);  // right limit; 1 less unless end_x
++    const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
++    const DBParams * cb_dbp = s->deblock + ctb_n;  // deblock params for the first CTB; stepped with cb_x
++    const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);  // bottom limit; 8 less unless end_y
++
++    unsigned int cb_x;
++
++    // Do in CTB-shaped blocks
++    for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp)
++    {
++        const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
++        const unsigned int bv_l = FFMAX(cb_x, 8);  // V edges start no further left than x = 8
++        const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9;
++        const unsigned int bh_l = bv_l - 8;  // H edges start 8 pels left of the V range
++        unsigned int y;
++
++        // Main body - starts one 8-line row above bounds.y unless bounds.y == 0
++        for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
++        {
++            uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
++
++            const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;  // row above bounds uses the CTB above
++            const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;  // only read when y != 0
++            const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
++
++            if (vbs != 0)
++            {
++                const uint8_t * const tcv = tctable + dbp->tc_offset;
++                const uint8_t * const betav = betatable + dbp->beta_offset;
++                unsigned int pcmfa = pcm2(s, bv_l - 1, y);
++                unsigned int x;
++
++                for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)  // 4 bs bits and 1 pcm bit per 8 pels
++                {
++                    if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)  // skip if no bs set or both sides flagged in pcmfa
++                    {
++                        const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++                        s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++                                                         frame_stride1(s->frame, LUMA),
++                                                         betav[qp],
++                                                         ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
++                                                          (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
++                                                         pcmfa & 3,
++                                                         av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
++                    }
++                }
++            }
++
++            if (y != 0)
++            {
++                uint32_t hbs;
++
++                // H left - mostly separated out so we only need a uint32_t hbs
++                if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
++                {
++                    const unsigned int x = bh_l;
++                    const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++                    const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++                    const DBParams * const dbph = dbp - 1;  // params of the CTB to the left
++                    const uint8_t * const tc = tctable + dbph->tc_offset + qp;
++
++                    av_assert2(cb_x - bh_l == 8);
++
++                    s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++                                                         frame_stride1(s->frame, LUMA),
++                                                         betatable[qp + dbph->beta_offset],
++                                                         ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++                                                            (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++                                                         (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++                }
++
++                // H main - edges from cb_x onwards
++                if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0)  // Will give (x <= bh_r) in for loop
++                {
++                    unsigned int x;
++                    unsigned int pcmfa = pcm4(s, cb_x, y - 1);
++
++                    for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
++                    {
++                        if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)  // skip if no bs set or both pcm4 rows flagged
++                        {
++                            const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++                            const uint8_t * const tc = tctable + dbp->tc_offset + qp;
++                            s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
++                                                                frame_stride1(s->frame, LUMA),
++                                                                betatable[qp + dbp->beta_offset],
++                                                                ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
++                                                                   (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
++                                                                (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
++                        }
++                    }
++                }
++            }
++
++        }
++    }
++}
++
++static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
++{
++    const unsigned int cb_shift = s->ps.sps->log2_min_cb_size;
++    const int8_t * const qp_row = s->qp_y_tab + (y >> cb_shift) * s->ps.sps->min_cb_width;  // qp_y_tab row selected by y
++    return (qp_row[x >> cb_shift] + qp_row[(x - 1) >> cb_shift] + 1) >> 1;  // rounded mean of the entries at x - 1 and x
++}
++// Chroma deblock of bounds in CTB-wide steps; works in 16 luma pel units and only uses bs bit b1 (masks 0x02../0x22..)
++static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
++{
++    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++    const unsigned int log2_min_cb_size  = s->ps.sps->log2_min_cb_size;
++    const unsigned int ctb_size = (1 << log2_ctb_size);
++    const unsigned int cb_r = bounds.x + bounds.w - (end_x ? 0 :  8);  // right limit; 8 less unless end_x
++    const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
++    const DBParams * dbp = s->deblock + ctb_n;  // deblock params for the first CTB; stepped with cb_x
++    const unsigned int b_b = bounds.y + bounds.h - (end_y ? 0 : 8);  // bottom limit; 8 less unless end_y
++    const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1];  // luma QP -> tc index lookup used for U
++    const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2];  // luma QP -> tc index lookup used for V
++
++    unsigned int cb_x;
++
++    av_assert1((bounds.x & (ctb_size - 1)) == 0);
++    av_assert1((bounds.y & (ctb_size - 1)) == 0);
++    av_assert1(bounds.h <= ctb_size);
++
++    // Do in CTB-shaped blocks
++    for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) {
++        const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
++        const unsigned int bv_l = FFMAX(cb_x, 16);  // V edges start no further left than x = 16
++        unsigned int y;
++
++        // V above
++        if (bounds.y != 0) {
++            // Deblock V up 8
++            // CTB above current
++            // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
++            const unsigned int y = bounds.y - 8;
++            uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
++
++            if (vbs != 0)
++            {
++                unsigned int pcmfa = pcm2(s, bv_l - 1, y);
++                const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset;  // tc_offset of the CTB above
++                unsigned int x;
++
++                for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)  // 8 bs bits and 2 pcm bits per 16 pels
++                {
++                    if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
++                    {
++                        const int qp0 = q2h(s, x, y);
++                        s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++                                                       frame_stride1(s->frame, 1),
++                                                       tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
++                                                       av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++                                                       pcmfa & 3);
++                    }
++                }
++            }
++        }
++
++        for (y = bounds.y; y < b_b; y += 16)
++        {
++            uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
++                (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4);  // row y + 8 goes in the upper nibbles, if in range
++
++            // V
++            if (vbs != 0)
++            {
++                unsigned int x;
++                unsigned int pcmfa =
++                    (y + 16 > b_b ?
++                        pcm2(s, bv_l - 1, y) | 0xffff0000 :
++                        pcm4(s, bv_l - 1, y));
++                const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++                for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
++                {
++                    if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++                    {
++                        const int qp0 = q2h(s, x, y);
++                        const int qp1 = q2h(s, x, y + 8);
++                        s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++                            frame_stride1(s->frame, 1),
++                            ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++                                ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++                            av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
++                            (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++                    }
++                }
++            }
++
++            // H
++            if (y != 0)
++            {
++                uint32_t hbs;
++                const unsigned int bh_l = bv_l - 16;
++                const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
++                const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
++                const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
++
++                // H left - mostly separated out so we only need a uint32_t hbs
++                // Stub is width 8 to the left of bounds, but width 16 internally
++                if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
++                {
++                    unsigned int pcmfa = pcm4(s, bh_l, y - 1);
++
++                    // Chop off bits we don't want...
++                    if (bh_l < bounds.x) {
++                        pcmfa |= 0x10001; // TL|BL pre rearrangement
++                        hbs &= ~3;  // Make BS 0
++                    }
++
++                    // Double check we still want this
++                    if (hbs != 0 && (~pcmfa & 0x30003) != 0)
++                    {
++                        const unsigned int x = bh_l;
++                        const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++                        const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++                        const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset;  // tc_offset of the CTB to the left
++
++                        s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++                            frame_stride1(s->frame, 1),
++                            ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++                                ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++                            (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++                    }
++                }
++
++                // H main
++                if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
++                {
++                    unsigned int x;
++                    unsigned int pcmfa = pcm4(s, cb_x, y - 1);  // Might like to mask out far right writes but probably not worth it
++
++                    for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
++                    {
++                        if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
++                        {
++                            const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
++                            const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
++                            const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
++
++                            s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
++                                frame_stride1(s->frame, 1),
++                                ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
++                                    ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
++                                (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
++                        }
++                    }
++                }
++            }
++        }
++    }
++}
++
++static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
++{
++    return x & ((1U << log2_n) - 1);  // x modulo 2^log2_n: non-zero iff x is not on a 2^log2_n boundary
++}
++
++static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++{
++    av_assert2((y & 7) == 0);
++    // Unlike bsf_stash there is no simultaneous-update issue here (other threads
++    // will have a different y) so a plain 32-bit read-modify-write will do
++    bsf &= mask;
++    if (bsf != 0)
++        *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31);
++}
++
++
++static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
++{
++    // The vertical bs bits are stored in a slightly odd order: it matches the
++    // way the deblock code consumes them and the contortions are cheaper
++    // here than there
++    //
++    // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},...
++    uint8_t *dst;
++    unsigned int shift;
++
++    av_assert2((x & 7) == 0);
++
++    if ((bsf &= mask) == 0)
++        return;
++
++    dst = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
++    shift = ((x & 8) | (y & 4)) >> 1;
++
++    // Narrow mask: the bits all go into the one byte
++    if (mask <= 0xf) {
++        *dst |= (bsf << shift);
++        return;
++    }
++
++    // Wide mask: one nibble per byte, each HEVC_RPI_BS_STRIDE1_BYTES further on
++    for (; bsf != 0; bsf >>= 4, dst += HEVC_RPI_BS_STRIDE1_BYTES)
++        *dst |= (bsf & 0xf) << shift;
++}
++
++static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
++                              const unsigned int rep, const unsigned int dup,
++                              const unsigned int mvf_stride0,
++                              const unsigned int mvf_stride1,
++                              const RefPicList * const rpl_p, const RefPicList * const rpl_q,
++                              const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
++{
++    const size_t step0 = sizeof(HEVCRpiMvField) * mvf_stride0;  // strides come in as element counts, go out in bytes
++    const size_t step1 = sizeof(HEVCRpiMvField) * mvf_stride1;
++    return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup, mvf_p, mvf_q,
++            rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, step0, step1);
++}
++
++
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
++                                               const HEVCRpiLocalContext * const lc,
++                                               const unsigned int x0, const unsigned int y0,
++                                               const unsigned int log2_trafo_size,
++                                               const int is_coded_block)
++{
++    const HEVCRpiMvField * const mvf_curr      = mvf_stash_ptr(s, lc, x0, y0);
++    const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
++    const RefPicList * const rpl        = s->refPicList;
++    // Rep count for bsf_mv when running with min_pu chunks
++    const unsigned int log2_rep_min_pu  = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
++    const unsigned int boundary_flags   = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
++    const unsigned int trafo_size       = (1U << log2_trafo_size);
++    const uint32_t bsf_mask             = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1;
++    const uint32_t bsf_cbf              = (bsf_mask & 0x55555555);
++
++    // Do we cover a pred split line?
++    const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
++    const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
++
++    uint32_t bsf_h;
++    uint32_t bsf_v;
++
++#ifdef DISABLE_STRENGTHS
++    return;
++#endif
++
++    // We are always on a size boundary
++    av_assert2((x0 & (trafo_size - 1)) == 0);
++    av_assert2((y0 & (trafo_size - 1)) == 0);
++    // log2_trafo_size not really a transform size; we can have to deal
++    // with size 2^6 blocks
++    av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
++
++    // Retrieve and update coded (b0), intra (b1) bs flags
++    //
++    // Store on min width (rather than uint32_t) to avoid possible issues
++    // with another thread on another core running wpp using the same
++    // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
++    //
++    // In bsf BS=2 is represented by 3 as it is much easier to test & set
++    // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
++    // 3 will work the same
++    {
++        // Given where we are called from is_cbf_luma & is_intra will be constant over the block
++        const uint32_t bsf0 =  (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? bsf_cbf : 0;
++        uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
++        uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
++
++        switch (log2_trafo_size)
++        {
++            case 2:
++            case 3:
++            {
++                const unsigned int sh_h = (x0 >> 1) & 7;
++                const unsigned int sh_v = (y0 >> 1) & 7;
++                bsf_h = *p;
++                bsf_v = *q;
++                *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h);
++                *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
++                bsf_h >>= sh_h;
++                bsf_v >>= sh_v;
++                break;
++            }
++            case 4:
++                bsf_h = *p;
++                bsf_v = *q;
++                *p = bsf0;
++                *q = bsf0;
++                break;
++            case 5:
++                bsf_h = *(uint16_t *)p;
++                bsf_v = *(uint16_t *)q;
++                *(uint16_t *)p = bsf0;
++                *(uint16_t *)q = bsf0;
++                break;
++            case 6:
++            default:
++                bsf_h = *(uint32_t *)p;
++                bsf_v = *(uint32_t *)q;
++                *(uint32_t *)p = bsf0;
++                *(uint32_t *)q = bsf0;
++                break;
++        }
++
++        bsf_h |= bsf0;
++        bsf_v |= bsf0;
++    }
++
++    // Do Horizontal
++    if ((y0 & 7) == 0)
++    {
++        // Boundary upper
++        if (y0 != 0 &&
++            (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
++             (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
++        {
++            // Look at MVs (BS=1) if we don't already have a full set of bs bits
++            if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
++            {
++                // If we aren't on the top boundary we must be in the middle
++                // and in that case we know where mvf can change
++                const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
++                const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
++                      s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
++                      rpl;
++
++                bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                    trafo_size >> (log2_min_pu_size + log2_rep),
++                    trafo_size >> (log2_min_pu_size + log2_rep),
++                    rpl, rpl_top,
++                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
++            }
++
++            // Finally put the results into bs
++            hbs_set(s, x0, y0, bsf_mask, bsf_h);
++        }
++
++        // Max of 1 pu internal split - ignore if not on 8pel boundary
++        if (has_y_split && !off_boundary(lc->cu.y_split, 3))
++        {
++            const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
++            // If we have the x split as well then it must be in the middle
++            const unsigned int log2_rep = has_x_split ? 1 : 0;
++
++            hbs_set(s, x0, lc->cu.y_split, bsf_mask,
++                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                   trafo_size >> (log2_min_pu_size + log2_rep),
++                   trafo_size >> (log2_min_pu_size + log2_rep),
++                   rpl, rpl,
++                   mvf, mvf - MVF_STASH_WIDTH_PU));
++        }
++    }
++
++    // And again for vertical - same logic as horizontal just in the other direction
++    if ((x0 & 7) == 0)
++    {
++        // Boundary left
++        if (x0 != 0 &&
++            (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
++             (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
++        {
++            if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
++            {
++                const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
++                const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
++                    s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
++                    rpl;
++
++                bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                    (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                    rpl, rpl_left,
++                    mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
++            }
++
++            vbs_set(s, x0, y0, bsf_mask, bsf_v);
++        }
++
++        if (has_x_split && !off_boundary(lc->cu.x_split, 3))
++        {
++            const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
++            const unsigned int log2_rep = has_y_split ? 1 : 0;
++
++            vbs_set(s, lc->cu.x_split, y0, bsf_mask,
++                bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
++                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                   (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
++                   rpl, rpl,
++                   mvf, mvf - 1));
++        }
++    }
++}
++
++#undef LUMA
++#undef CB
++#undef CR
++
++static inline unsigned int ussub(const unsigned int a, const unsigned int b)
++{
++    return a > b ? a - b : 0; // saturating unsigned subtract: yields 0 rather than wrapping
++}
++
++static inline int cache_boundry(const AVFrame * const frame, const unsigned int x)
++{
++    return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; // NOTE(review): non-zero only while the shifted x is < 64 - confirm ~63 (rather than 63) is intended
++}
++
++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
++{
++    const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
++    int x, y;
++
++    const unsigned int br = bounds.x + bounds.w;
++    const unsigned int bb = bounds.y + bounds.h;
++
++    const int x_end = (br >= s->ps.sps->width);
++    const int y_end = (bb >= s->ps.sps->height);
++
++    // Deblock may not touch the edges of the bound as they are still needed
++    // for Intra pred
++    //
++    // Deblock is disabled with a per-slice flag
++    // Given that bounds may cover multiple slices & we deblock outside bounds
++    // anyway we can't avoid deblock using that flag - about the only thing we
++    // could do is have a "no deblock seen yet" flag but it doesn't really
++    // seem worth the effort
++
++    deblock_y_blk(s, bounds, x_end, y_end);
++    deblock_uv_blk(s, bounds, x_end, y_end);
++
++    // SAO needs
++    // (a) CTB alignment
++    // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel
++    {
++        const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1));
++        const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
++        const unsigned int yt = ussub(bounds.y, yo);
++        const unsigned int yb = y_end ? bb : ussub(bb, yo);
++        const unsigned int xl = ussub(bounds.x, xo);
++        const unsigned int xr = x_end ? br : ussub(br, xo);
++
++        if (s->ps.sps->sao_enabled)
++        {
++            for (y = yt; y < yb; y += ctb_size) {
++                for (x = xl; x < xr; x += ctb_size) {
++                    sao_filter_CTB(s, x, y);
++                }
++            }
++        }
++
++        // Cache invalidate
++        y = 0;
++        if (xr != 0 && yb != 0)
++        {
++            const unsigned int llen =
++                (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
++            const unsigned int mask = ~(llen - 1);
++            const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
++            const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask;
++            const unsigned int it = ussub(yt, 1);
++            const unsigned int ib = y_end ? bb : yb - 1;
++
++            if (il < ir) {
++                rpi_cache_buf_t cbuf;
++                rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
++                rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++                  il, it, ir - il, ib - it,
++                  ctx_vshift(s, 1), 1, 1);
++
++                // If we have to commit the right hand tile boundary due to
++                // cache boundary considerations then at EoTile we must commit
++                // that boundary to bottom of tile (bounds)
++                if (ib != bb && ir == br && eot) {
++                    rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
++                      br - 1, ib, 1, bb - ib,
++                      ctx_vshift(s, 1), 1, 1);
++                }
++
++                rpi_cache_flush_finish(rfe);
++
++                if (x_end)
++                    y = y_end ? INT_MAX : ib;
++
++//                printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
++            }
++        }
++    }
++
++    return y;
++}
++
+diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h
+new file mode 100644
+index 0000000000..6b36f5e737
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mv.h
+@@ -0,0 +1,71 @@
++#ifndef AVCODEC_RPI_HEVC_MV_H
++#define AVCODEC_RPI_HEVC_MV_H
++
++#include "config.h"
++
++typedef int32_t MvXY;
++
++typedef struct HEVCRpiMvField {
++    MvXY xy[2];        // one packed motion vector per list ([0] / [1])
++    int8_t ref_idx[2]; // one reference index per list, indexed like xy[]
++    int8_t pred_flag;  // PF_* value saying which of the two lists are in use
++    int8_t dummy; // Padding: brings the struct up to 12 bytes
++} HEVCRpiMvField;
++
++
++#define MV_X(xy) (((xy) << 16) >> 16)               // x: low 16 bits of the packed value, shifted up and back down
++#define MV_Y(xy) ((xy) >> 16)                       // y: high 16 bits of the packed value
++#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))  // pack x into bits 0-15 and y into bits 16-31; both args parenthesised
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_mv_arm.h"
++#endif
++
++#ifndef mvxy_add
++static inline MvXY mvxy_add(const MvXY a, const MvXY b)
++{
++    return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); // unpack both, add per component, repack (x is masked to 16 bits by MV_XY)
++}
++#endif
++
++
++#ifndef mv_scale_xy
++static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
++{
++    int tx, scale_factor;
++
++    td = td == 0 ? 1 : av_clip_int8(td); // td is the divisor below, so 0 is replaced by 1
++    tb = av_clip_int8(tb);
++    tx = (0x4000 + (abs(td) >> 1)) / td;
++    scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12);
++    return MV_XY( // x and y are scaled and clipped independently, then repacked
++        av_clip_int16((scale_factor * MV_X(src) + 127 +
++                           (scale_factor * MV_X(src) < 0)) >> 8),
++        av_clip_int16((scale_factor * MV_Y(src) + 127 +
++                           (scale_factor * MV_Y(src) < 0)) >> 8));
++}
++#endif
++
++// 8.3.1 states that the bitstream may not contain poc diffs that do not
++// fit in 16 bits, so given that we don't care about the high bits we only
++// store the low 16 + LT & Inter flags
++
++#define COL_POC_INTRA   0
++#define COL_POC_INTER   (1 << 16)
++#define COL_POC_LT      (1 << 17)
++#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y)))
++#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
++#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
++
++typedef struct ColMv_s {
++    int32_t poc;
++    int32_t xy;
++} ColMv;
++
++typedef struct ColMvField_s {
++    ColMv L[2];
++} ColMvField;
++
++
++
++#endif // AVCODEC_RPI_HEVC_MV_H
+diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
+new file mode 100644
+index 0000000000..27a9f69525
+--- /dev/null
++++ b/libavcodec/rpi_hevc_mvs.c
+@@ -0,0 +1,487 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 Anand Meher Kotra
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++
++static av_always_inline int
++is_eq_mer(const unsigned int plevel,
++    const unsigned int xN, const unsigned int yN,
++    const unsigned int xP, const unsigned int yP)
++{
++    return (xN >> plevel) == (xP >> plevel) && (yN >> plevel) == (yP >> plevel); // N and P agree on every bit at or above plevel, in x and in y
++}
++
++// Non-zero iff A and B have the same pred_flag and, for every list that flag marks as used, the same ref_idx and mv
++static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
++{
++    if (a->pred_flag != b->pred_flag)
++        return 0;
++    return ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
++        ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
++}
++
++/*
++ * 8.5.3.1.7  temporal luma motion vector prediction
++ */
++static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
++                                       const HEVCRpiLocalContext * const lc, const int x0, const int y0,
++                                       const int nPbW, const int nPbH, const int refIdxLx,
++                                       MvXY * const mvLXCol, const int X)
++{
++    int x, y;
++    const ColMv * cmv = NULL;
++
++    HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
++    const RefPicList * const refPicList = s->refPicList + X;
++    const int cur_lt = refPicList->isLongTerm[refIdxLx];
++
++    *mvLXCol = 0;
++    // Unlikely but we might have a col_ref IDR frame!
++    if (col_ref->col_mvf == NULL)
++        return 0;
++
++    ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
++
++    //bottom right collocated motion vector
++    x = x0 + nPbW;
++    y = y0 + nPbH;
++
++    if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) &&
++        y < s->ps.sps->height &&
++        x < s->ps.sps->width)
++    {
++        const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++            (y >> 4) * s->col_mvf_stride;
++
++        if (col->L[0].poc != COL_POC_INTRA &&
++            (col->L[1].poc == COL_POC_INTRA ||
++             (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
++        {
++            cmv = col->L + 0;
++        }
++        else if (col->L[1].poc != COL_POC_INTRA)
++        {
++            cmv = col->L + 1;
++        }
++    }
++
++    // derive center collocated motion vector
++    if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
++    {
++        cmv = NULL;
++        x                  = x0 + (nPbW >> 1);
++        y                  = y0 + (nPbH >> 1);
++
++        {
++            const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
++              (y >> 4) * s->col_mvf_stride;
++
++            if (col->L[0].poc != COL_POC_INTRA &&
++              (col->L[1].poc == COL_POC_INTRA ||
++               (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
++            {
++              cmv = col->L + 0;
++            }
++            else if (col->L[1].poc != COL_POC_INTRA)
++            {
++              cmv = col->L + 1;
++            }
++        }
++    }
++
++    if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
++        return 0;
++
++    {
++        const int col_poc  = col_ref->poc;
++        const int ref_poc  = refPicList->list[refIdxLx];
++
++        *mvLXCol = (cur_lt ||
++                        cmv->poc == col_poc ||
++                        COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ?
++                    cmv->xy :
++                    mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
++    }
++
++    return cmv != NULL;
++}
++
++static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
++{
++    return (b == NULL) ? 0 : compare_mv_ref_idx(a, b); // a NULL b never compares equal
++}
++
++
++
++/*
++ * 8.5.3.1.2  Derivation process for spatial merging candidates
++ */
++static inline const HEVCRpiMvField *
++derive_spatial_merge_candidates(
++    const HEVCRpiContext * const s,
++    const HEVCRpiLocalContext * const lc,
++    const unsigned int x0, const unsigned int y0,
++    const unsigned int nPbW, const unsigned int nPbH,
++    const unsigned int avail,
++    const unsigned int part_idx,
++    const unsigned int merge_idx,
++    HEVCRpiMvField * const mvf_t)
++{
++    const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N);
++    const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD);
++
++    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
++    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
++    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
++    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
++    const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
++    const unsigned int part_mode = lc->cu.part_mode;
++
++    const HEVCRpiMvField * perm[4];
++    unsigned int nb_merge_cand = 0;
++
++    // singleMCLFlag => part_idx == 0 so no need to test for it
++    if ((avail & AVAIL_L) == 0 ||
++        (part_idx == 1 &&
++            ((parts_a1 >> part_mode) & 1) != 0) || // explicit grouping: the part_idx test guards only the part_mode test
++        is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0) ||
++        mvf_a1->pred_flag == PF_INTRA)
++    {
++        mvf_a1 = NULL;
++    }
++    else
++    {
++        if (merge_idx == nb_merge_cand)
++            return mvf_a1;
++        perm[nb_merge_cand++] = mvf_a1;
++    }
++
++    if ((avail & AVAIL_U) == 0 ||
++            (part_idx == 1 &&
++               ((parts_b1 >> part_mode) & 1) != 0) || // explicit grouping: the part_idx test guards only the part_mode test
++            is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0) ||
++            mvf_b1->pred_flag == PF_INTRA)
++    {
++        mvf_b1 = NULL;
++    }
++    else if (!mvf_eq(mvf_b1, mvf_a1))
++    {
++        if (merge_idx == nb_merge_cand)
++            return mvf_b1;
++        perm[nb_merge_cand++] = mvf_b1;
++    }
++
++    // above right spatial merge candidate
++    // Never need mvf_b0 again so don't bother zeroing if navail
++    if ((avail & AVAIL_UR) != 0 &&
++        !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
++        mvf_b0->pred_flag != PF_INTRA &&
++        !mvf_eq(mvf_b0, mvf_b1))
++    {
++        if (merge_idx == nb_merge_cand)
++            return mvf_b0;
++        perm[nb_merge_cand++] = mvf_b0;
++    }
++
++    // left bottom spatial merge candidate
++    // Never need mvf_a0 again so don't bother zeroing if navail
++    if ((avail & AVAIL_DL) != 0 &&
++        !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
++        mvf_a0->pred_flag != PF_INTRA &&
++        !mvf_eq(mvf_a0, mvf_a1))
++    {
++        if (merge_idx == nb_merge_cand)
++            return mvf_a0;
++        perm[nb_merge_cand++] = mvf_a0;
++    }
++
++    // above left spatial merge candidate
++    if (nb_merge_cand != 4 &&
++        (avail & AVAIL_UL) != 0 &&
++        !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0))
++    {
++        const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
++
++        if (mvf_b2->pred_flag != PF_INTRA &&
++            !mvf_eq(mvf_b2, mvf_a1) &&
++            !mvf_eq(mvf_b2, mvf_b1))
++        {
++            if (merge_idx == nb_merge_cand)
++                return mvf_b2;
++            perm[nb_merge_cand++] = mvf_b2;
++        }
++    }
++
++    // temporal motion vector candidate
++    if (s->sh.slice_temporal_mvp_enabled_flag)
++    {
++        static const HEVCRpiMvField mvf_z = {{0}};
++
++        *mvf_t = mvf_z;
++
++        if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++                                        0, mvf_t->xy + 0, 0))
++            mvf_t->pred_flag = PF_L0;
++
++        if (s->sh.slice_type == HEVC_SLICE_B &&
++                temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
++                                            0, mvf_t->xy + 1, 1))
++            mvf_t->pred_flag |= PF_L1;
++
++        if (mvf_t->pred_flag != 0)
++        {
++            if (merge_idx == nb_merge_cand)
++                return mvf_t;
++            perm[nb_merge_cand++] = mvf_t;
++        }
++    }
++
++    // combined bi-predictive merge candidates  (applies for B slices)
++    if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
++    {
++        unsigned int comb_idx = 0;
++        const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1);
++        const RefPicList * const refPicList = s->refPicList;
++
++        for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
++        {
++            static const uint8_t l0_l1_cand_idx[12][2] = {
++                { 0, 1, },
++                { 1, 0, },
++                { 0, 2, },
++                { 2, 0, },
++                { 1, 2, },
++                { 2, 1, },
++                { 0, 3, },
++                { 3, 0, },
++                { 1, 3, },
++                { 3, 1, },
++                { 2, 3, },
++                { 3, 2, },
++            };
++
++            const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
++            const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
++            const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
++            const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
++
++            if ((mvf_c0->pred_flag & PF_L0) != 0 &&
++                (mvf_c1->pred_flag & PF_L1) != 0 &&
++                (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
++                 mvf_c0->xy[0] != mvf_c1->xy[1]))
++            {
++                if (merge_idx == nb_merge_cand++)
++                {
++                    // Need to be a bit careful as we will construct mvf_t and we
++                    // may already be using that as one of our candidates
++                    // so build & copy rather than build in place
++                    const HEVCRpiMvField mvf_m = {
++                        .xy = {
++                            mvf_c0->xy[0],
++                            mvf_c1->xy[1]},
++                        .ref_idx = {
++                            mvf_c0->ref_idx[0],
++                            mvf_c1->ref_idx[1]},
++                        .pred_flag = PF_BI
++                    };
++                    *mvf_t = mvf_m;
++                    return mvf_t;
++                }
++            }
++        }
++    }
++
++    // "append" Zero motion vector candidates
++    {
++        const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
++                            FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
++        const unsigned int zero_idx = merge_idx - nb_merge_cand;
++
++        const HEVCRpiMvField mvf_m = {
++            .xy = {0, 0},
++            .ref_idx = {
++                zero_idx < nb_refs ? zero_idx : 0,
++                (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
++            .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
++        };
++
++        *mvf_t = mvf_m;
++        return mvf_t;
++    }
++}
++
++
++// 8.5.3.1.1 Derivation process of luma Mvs for merge mode
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++                                int nPbH, int log2_cb_size, int part_idx,
++                                int merge_idx, HEVCRpiMvField * const mv)
++{
++    const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? // merge level > 2 and log2_cb_size == 3: derive from the CU origin as an 8x8 block with part_idx 0
++        derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
++                                        ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
++                                        0, merge_idx, mv) :
++        derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
++                                        ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
++                                        part_idx, merge_idx, mv);
++
++    if (mvf_m != mv) // the chosen candidate is some other field, not the caller's buffer: copy it across
++        *mv = *mvf_m;
++
++    if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) // width + height == 12 (presumably 8x4 / 4x8): bi-pred is reduced to L0 only
++        mv->pred_flag = PF_L0;
++}
++
++
++static av_always_inline const MvXY *
++mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
++{
++    if (mvf != NULL) // a NULL mvf yields NULL
++    {
++        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) // list pfi0 is tried first: used and its list entry equals poc0
++            return mvf->xy + pfi0;
++        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0) // otherwise the same test on list pfi1
++            return mvf->xy + pfi1;
++    }
++    return NULL; // neither list of mvf matches poc0
++}
++
++static av_always_inline const MvXY *
++mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
++              const int islt0, const int poc0, const int poc_cur,
++              MvXY * const mv_t, const HEVCRpiMvField * const mvf)
++{
++    if (mvf != NULL)
++    {
++        if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0)
++        {
++            const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]];
++            if (islt0 || poc1 == poc0) {
++                return mvf->xy + pfi0;
++            }
++            *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0);
++            return mv_t;
++        }
++        if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0)
++        {
++            const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]];
++            if (islt0 || poc1 == poc0) {
++                return mvf->xy + pfi1;
++            }
++            *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0);
++            return mv_t;
++        }
++    }
++    return NULL;
++}
++
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++    const unsigned int x0, const unsigned int y0,
++    const unsigned int nPbW, const unsigned int nPbH,
++    const unsigned int avail,
++    HEVCRpiMvField * const mv,
++    const unsigned int mvp_lx_flag, const unsigned int LX)
++{
++    const unsigned int pfi0 = LX;
++    const unsigned int pfi1 = LX == 0 ? 1 : 0;
++    const RefPicList * const rpl = s->refPicList;
++    const int poc0 = rpl[LX].list[mv->ref_idx[LX]];
++    const int poc_cur = s->poc;
++    const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
++
++    const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
++    const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
++    const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1);  // UL
++    const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
++    const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
++    const MvXY * mva = NULL;
++    const MvXY * mvb;
++    MvXY * const mv_rv = mv->xy + LX;
++    MvXY mvt_a, mvt_b;
++
++    *mv_rv = 0;
++
++    if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
++        mvf_a0 = NULL;
++    else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
++        goto use_mva;
++
++    if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
++        mvf_a1 = NULL;
++
++    if (mva == NULL &&
++        (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
++        (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
++        mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1);
++
++    if (mvp_lx_flag == 0 && mva != NULL)
++        goto use_mva;
++
++    if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
++        mvf_b0 = NULL;
++    if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
++        mvf_b1 = NULL;
++    if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
++        mvf_b2 = NULL;
++
++    if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
++        (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
++        mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
++
++    if (mvf_a0 == NULL && mvf_a1 == NULL) {
++        mva = mvb;
++        if (mvp_lx_flag == 0 && mva != NULL)
++            goto use_mva;
++
++        if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
++            (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
++            mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
++    }
++
++    if (mva == NULL) {
++        mva = mvb;
++        mvb = NULL;
++    }
++
++    if (mvb != NULL && *mva == *mvb)  // If A == B then ignore B
++        mvb = NULL;
++
++    if (mvp_lx_flag == 0 && mva != NULL) {
++        goto use_mva;
++    }
++    else if (mvp_lx_flag != 0 && mvb != NULL) {
++        *mv_rv = *mvb;
++    }
++    else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) {
++        temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
++                                    nPbH, mv->ref_idx[LX],
++                                    mv_rv, LX);
++    }
++    return;
++
++use_mva:
++    *mv_rv = *mva;
++    return;
++}
++
+diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c
+new file mode 100644
+index 0000000000..e58a59ce5e
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.c
+@@ -0,0 +1,143 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "bytestream.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_parse.h"
++
++static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
++                                 HEVCSEIContext *sei, int is_nalff, int nal_length_size,
++                                 int err_recognition, int apply_defdispwin, void *logctx)
++{
++    int i;
++    int ret = 0;
++    H2645Packet pkt = { 0 };
++
++    ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff,
++                                nal_length_size, AV_CODEC_ID_HEVC, 1, 0);
++    if (ret < 0) {
++        goto done;
++    }
++
++    for (i = 0; i < pkt.nb_nals; i++) {
++        H2645NAL *nal = &pkt.nals[i];
++
++        /* ignore everything except parameter sets and VCL NALUs */
++        switch (nal->type) {
++        case HEVC_NAL_VPS:
++            ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps);
++            if (ret < 0)
++                goto done;
++            break;
++        case HEVC_NAL_SPS:
++            ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
++            if (ret < 0)
++                goto done;
++            break;
++        case HEVC_NAL_PPS:
++            ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps);
++            if (ret < 0)
++                goto done;
++            break;
++        case HEVC_NAL_SEI_PREFIX:
++        case HEVC_NAL_SEI_SUFFIX:
++            ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
++            if (ret < 0)
++                goto done;
++            break;
++        default:
++            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
++            break;
++        }
++    }
++
++done:
++    ff_h2645_packet_uninit(&pkt);
++    if (err_recognition & AV_EF_EXPLODE)
++        return ret;
++
++    return 0;
++}
++/* Decode the NAL units carried in extradata (hvcC-framed or not) into ps / sei; <0 on error. */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++                             HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++                             int err_recognition, int apply_defdispwin, void *logctx)
++{
++    int ret = 0;
++    GetByteContext gb;
++
++    bytestream2_init(&gb, data, size);
++
++    if (size > 3 && (data[0] || data[1] || data[2] > 1)) { /* i.e. not 00 00 00 / 00 00 01 */
++        /* It seems the extradata is encoded as hvcC format.
++         * Temporarily, we support configurationVersion==0 until 14496-15 3rd
++         * is finalized. When finalized, configurationVersion will be 1 and we
++         * can recognize hvcC by checking if avctx->extradata[0]==1 or not. */
++        int i, j, num_arrays, nal_len_size;
++
++        *is_nalff = 1;
++
++        bytestream2_skip(&gb, 21); /* step over the first 21 bytes of the hvcC data */
++        nal_len_size = (bytestream2_get_byte(&gb) & 3) + 1; /* low 2 bits store size - 1 */
++        num_arrays   = bytestream2_get_byte(&gb);
++
++        /* nal units in the hvcC always have length coded with 2 bytes,
++         * so put a fake nal_length_size = 2 while parsing them */
++        *nal_length_size = 2;
++
++        /* Decode nal units from hvcC. */
++        for (i = 0; i < num_arrays; i++) {
++            int type = bytestream2_get_byte(&gb) & 0x3f; /* only used in the error log below */
++            int cnt  = bytestream2_get_be16(&gb); /* number of NAL units in this array */
++
++            for (j = 0; j < cnt; j++) {
++                // +2 for the nal size field
++                int nalsize = bytestream2_peek_be16(&gb) + 2;
++                if (bytestream2_get_bytes_left(&gb) < nalsize) {
++                    av_log(logctx, AV_LOG_ERROR,
++                           "Invalid NAL unit size in extradata.\n");
++                    return AVERROR_INVALIDDATA; /* NB: *nal_length_size is left at 2 */
++                }
++
++                ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
++                                            *nal_length_size, err_recognition, apply_defdispwin,
++                                            logctx);
++                if (ret < 0) {
++                    av_log(logctx, AV_LOG_ERROR,
++                           "Decoding nal unit %d %d from hvcC failed\n",
++                           type, i);
++                    return ret; /* NB: *nal_length_size is left at 2 */
++                }
++                bytestream2_skip(&gb, nalsize); /* advance past length field + payload */
++            }
++        }
++
++        /* Now store right nal length size, that will be used to parse
++         * all other nals */
++        *nal_length_size = nal_len_size;
++    } else {
++        *is_nalff = 0; /* whole buffer is handed on; *nal_length_size is passed as the caller set it */
++        ret = hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
++                                    err_recognition, apply_defdispwin, logctx);
++        if (ret < 0)
++            return ret;
++    }
++
++    return ret;
++}
+diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h
+new file mode 100644
+index 0000000000..4b4d032a16
+--- /dev/null
++++ b/libavcodec/rpi_hevc_parse.h
+@@ -0,0 +1,36 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * H.265 parser code
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PARSE_H
++#define AVCODEC_RPI_HEVC_PARSE_H
++
++#include <stdint.h>
++
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++/* Decode HEVC extradata into ps / sei; sets *is_nalff (1 = hvcC, else 0); negative return on error */
++int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
++                             HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
++                             int err_recognition, int apply_defdispwin, void *logctx);
++
++#endif /* AVCODEC_RPI_HEVC_PARSE_H */
+diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
+new file mode 100644
+index 0000000000..f4e31f7d1d
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.c
+@@ -0,0 +1,1938 @@
++/*
++ * HEVC Parameter Set decoding
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/imgutils.h"
++#include "golomb.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevcdec.h"
++
++static const uint8_t default_scaling_list_intra[] = { /* 64 values, laid out as 8 rows of 8 */
++    16, 16, 16, 16, 17, 18, 21, 24,
++    16, 16, 16, 16, 17, 19, 22, 25,
++    16, 16, 17, 18, 20, 22, 25, 29,
++    16, 16, 18, 21, 24, 27, 31, 36,
++    17, 17, 20, 24, 30, 35, 41, 47,
++    18, 19, 22, 27, 35, 44, 54, 65,
++    21, 22, 25, 31, 41, 54, 70, 88,
++    24, 25, 29, 36, 47, 65, 88, 115
++};
++
++static const uint8_t default_scaling_list_inter[] = { /* 64 values, laid out as 8 rows of 8 */
++    16, 16, 16, 16, 17, 18, 20, 24,
++    16, 16, 16, 17, 18, 20, 24, 25,
++    16, 16, 17, 18, 20, 24, 25, 28,
++    16, 17, 18, 20, 24, 25, 28, 33,
++    17, 18, 20, 24, 25, 28, 33, 41,
++    18, 20, 24, 25, 28, 33, 41, 54,
++    20, 24, 25, 28, 33, 41, 54, 71,
++    24, 25, 28, 33, 41, 54, 71, 91
++};
++
++static const AVRational vui_sar[] = { /* 17 entries; decode_vui() indexes this with the 8-bit sar_idx */
++    {  0,   1 },
++    {  1,   1 },
++    { 12,  11 },
++    { 10,  11 },
++    { 16,  11 },
++    { 40,  33 },
++    { 24,  11 },
++    { 20,  11 },
++    { 32,  11 },
++    { 80,  33 },
++    { 18,  11 },
++    { 15,  11 },
++    { 64,  33 },
++    { 160, 99 },
++    {  4,   3 },
++    {  3,   2 },
++    {  2,   1 },
++};
++
++
++// pps_cb_qp_offset: -12,+12
++// slice_cb_qp_offset: -12,+12 also
++//   "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive."
++// cr_qp_offset_list[n]: -12,+12
++// So worst case total offset: -24,+24
++// Table-building macros; B selects the row of qp_c_bd_0 / qp_c_bd_1 below (0..7).
++#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6) // (48+n)/6 - 10 in bits 3 and up, (48+n)%6 in the low 3 bits
++#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n)) // T() of n clamped to [-6*B, 51]
++#define M(B,n) C(B,(-n)) // C() of the negated n
++
++// Entry count of QP_START(): 24 copies of M(B,48), then M(B,48) down to M(B,1)
++#define QP_OFFSET_0 (8*6 + 12*2)
++#define QP_START(B) \
++    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++    M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
++\
++    M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\
++    M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\
++    M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\
++    M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\
++    M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\
++    M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\
++    M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\
++    M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1)
++#define QP_END(B) \
++    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51)
++
++#define T1(B)\
++{\
++    QP_START(B),\
++    C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
++    C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
++    C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
++    C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\
++    C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\
++    C(B,44), C(B,45),\
++    C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\
++    QP_END(B)\
++}
++#define T0(B)\
++{\
++    QP_START(B),\
++    C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
++    C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
++    C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
++    C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\
++    C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\
++    C(B,50), C(B,51),\
++    C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
++    QP_END(B)\
++}
++
++#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2) // 148: QP_START (72) + 58 middle entries + QP_END (18)
++// One row per B (0..7); each T0()/T1() initialiser supplies exactly QP_TABLE_SIZE values.
++static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)};
++static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)};
++
++#undef T
++#undef C
++#undef QP_END
++// C() and QP_END are redefined below, so T0()/T1() now expand to plain clamped values.
++#define C(B,n) ((n)<0?0:(n)>51?51:(n)) // clamp n to [0, 51]; B is ignored
++// We do need a lot of -ve padding to cope with high bit depths that give -ve qps
++#define QP_DBLK_OFFSET_0 QP_OFFSET_0
++#define QP_END(B)\
++ 51, 51, 51, 51, 51, 51
++
++// These don't need all the padding we have here (12 top/bottom would be enough)
++static const uint8_t qp_c_dblk_0[] = T0(0); // 72 + 58 + 6 entries; the QP_START part is all 0 here
++static const uint8_t qp_c_dblk_1[] = T1(0); // same layout, built from T1()
++
++#undef T // no-op: T was already undefined above
++#undef M
++#undef C
++#undef QP_END
++#undef QP_START
++
++/* Unref PPS slot id; s->pps is cleared first when it points at that buffer's data. */
++static void remove_pps(HEVCRpiParamSets * const s, const int id)
++{
++    if (s->pps_list[id] && s->pps == (const HEVCRpiPPS*)s->pps_list[id]->data)
++        s->pps = NULL;
++    av_buffer_unref(&s->pps_list[id]);
++}
++/* Unref SPS slot id, first removing every PPS whose sps_id refers to it. */
++static void remove_sps(HEVCRpiParamSets * const s, const int id)
++{
++    int i;
++    if (s->sps_list[id]) {
++        if (s->sps == (const HEVCRpiSPS*)s->sps_list[id]->data)
++            s->sps = NULL; /* active SPS is the one being removed */
++
++        /* drop all PPS that depend on this SPS */
++        for (i = 0; i < FF_ARRAY_ELEMS(s->pps_list); i++)
++            if (s->pps_list[i] && ((HEVCRpiPPS*)s->pps_list[i]->data)->sps_id == id)
++                remove_pps(s, i);
++
++        av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data)); /* s->sps was cleared above */
++    }
++    av_buffer_unref(&s->sps_list[id]);
++}
++/* Unref VPS slot id, first removing every SPS whose vps_id refers to it. */
++static void remove_vps(HEVCRpiParamSets * const s, const int id)
++{
++    int i;
++    if (s->vps_list[id]) {
++        if (s->vps == (const HEVCRpiVPS*)s->vps_list[id]->data)
++            s->vps = NULL; /* active VPS is the one being removed */
++
++        for (i = 0; i < FF_ARRAY_ELEMS(s->sps_list); i++)
++            if (s->sps_list[i] && ((HEVCRpiSPS*)s->sps_list[i]->data)->vps_id == id)
++                remove_sps(s, i); /* cascades: remove_sps() also drops the dependent PPS */
++    }
++    av_buffer_unref(&s->vps_list[id]);
++}
++/* Parse one short-term reference picture set from gb into rps; returns 0 or AVERROR_INVALIDDATA. */
++int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx,
++                                  ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header)
++{
++    uint8_t rps_predict = 0;
++    int delta_poc;
++    int k0 = 0;
++    int k1 = 0;
++    int k  = 0;
++    int i;
++
++    if (rps != sps->st_rps && sps->nb_st_rps) /* no flag for the first SPS set or when the SPS has none */
++        rps_predict = get_bits1(gb);
++
++    if (rps_predict) {
++        const ShortTermRPS *rps_ridx;
++        int delta_rps;
++        unsigned abs_delta_rps;
++        uint8_t use_delta_flag = 0;
++        uint8_t delta_rps_sign;
++
++        if (is_slice_header) {
++            unsigned int delta_idx = get_ue_golomb_long(gb) + 1;
++            if (delta_idx > sps->nb_st_rps) {
++                av_log(avctx, AV_LOG_ERROR,
++                       "Invalid value of delta_idx in slice header RPS: %u > %d.\n",
++                       delta_idx, sps->nb_st_rps);
++                return AVERROR_INVALIDDATA;
++            }
++            rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
++            rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
++        } else
++            rps_ridx = &sps->st_rps[rps - sps->st_rps - 1]; /* the set just before rps in the SPS array */
++
++        delta_rps_sign = get_bits1(gb);
++        abs_delta_rps  = get_ue_golomb_long(gb) + 1;
++        if (abs_delta_rps < 1 || abs_delta_rps > 32768) { /* < 1 catches wrap-around of the + 1 */
++            av_log(avctx, AV_LOG_ERROR,
++                   "Invalid value of abs_delta_rps: %u\n",
++                   abs_delta_rps);
++            return AVERROR_INVALIDDATA;
++        }
++        delta_rps      = (1 - (delta_rps_sign << 1)) * abs_delta_rps; /* sign bit set => negative */
++        for (i = 0; i <= rps_ridx->num_delta_pocs; i++) { /* note <=: the final pass uses delta_rps alone */
++            int used = rps->used[k] = get_bits1(gb);
++
++            if (!used)
++                use_delta_flag = get_bits1(gb);
++
++            if (used || use_delta_flag) {
++                if (i < rps_ridx->num_delta_pocs)
++                    delta_poc = delta_rps + rps_ridx->delta_poc[i];
++                else
++                    delta_poc = delta_rps;
++                rps->delta_poc[k] = delta_poc;
++                if (delta_poc < 0)
++                    k0++;
++                else
++                    k1++;
++                k++;
++            }
++        }
++
++        if (k >= FF_ARRAY_ELEMS(rps->used)) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "Invalid num_delta_pocs: %d\n", k);
++            return AVERROR_INVALIDDATA;
++        }
++
++        rps->num_delta_pocs    = k;
++        rps->num_negative_pics = k0;
++        // sort in increasing order (smallest first)
++        if (rps->num_delta_pocs != 0) {
++            int used, tmp;
++            for (i = 1; i < rps->num_delta_pocs; i++) {
++                delta_poc = rps->delta_poc[i];
++                used      = rps->used[i];
++                for (k = i - 1; k >= 0; k--) { /* sink entry i down through the sorted prefix */
++                    tmp = rps->delta_poc[k];
++                    if (delta_poc < tmp) {
++                        rps->delta_poc[k + 1] = tmp;
++                        rps->used[k + 1]      = rps->used[k];
++                        rps->delta_poc[k]     = delta_poc;
++                        rps->used[k]          = used;
++                    }
++                }
++            }
++        }
++        if ((rps->num_negative_pics >> 1) != 0) { /* i.e. at least two negative entries */
++            int used;
++            k = rps->num_negative_pics - 1;
++            // flip the negative values to largest first
++            for (i = 0; i < rps->num_negative_pics >> 1; i++) {
++                delta_poc         = rps->delta_poc[i];
++                used              = rps->used[i];
++                rps->delta_poc[i] = rps->delta_poc[k];
++                rps->used[i]      = rps->used[k];
++                rps->delta_poc[k] = delta_poc;
++                rps->used[k]      = used;
++                k--;
++            }
++        }
++    } else {
++        unsigned int prev, nb_positive_pics;
++        rps->num_negative_pics = get_ue_golomb_long(gb);
++        nb_positive_pics       = get_ue_golomb_long(gb);
++
++        if (rps->num_negative_pics >= HEVC_MAX_REFS ||
++            nb_positive_pics >= HEVC_MAX_REFS) {
++            av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
++        if (rps->num_delta_pocs) {
++            prev = 0;
++            for (i = 0; i < rps->num_negative_pics; i++) {
++                delta_poc = get_ue_golomb_long(gb) + 1;
++                if (delta_poc < 1 || delta_poc > 32768) {
++                    av_log(avctx, AV_LOG_ERROR,
++                        "Invalid value of delta_poc: %d\n",
++                        delta_poc);
++                    return AVERROR_INVALIDDATA;
++                }
++                prev -= delta_poc; /* running total, stepping downwards from 0 */
++                rps->delta_poc[i] = prev;
++                rps->used[i]      = get_bits1(gb);
++            }
++            prev = 0;
++            for (i = 0; i < nb_positive_pics; i++) {
++                delta_poc = get_ue_golomb_long(gb) + 1;
++                if (delta_poc < 1 || delta_poc > 32768) {
++                    av_log(avctx, AV_LOG_ERROR,
++                        "Invalid value of delta_poc: %d\n",
++                        delta_poc);
++                    return AVERROR_INVALIDDATA;
++                }
++                prev += delta_poc; /* running total, stepping upwards from 0 */
++                rps->delta_poc[rps->num_negative_pics + i] = prev;
++                rps->used[rps->num_negative_pics + i]      = get_bits1(gb);
++            }
++        }
++    }
++    return 0;
++}
++
++/* Read one 88-bit profile/tier block into ptl; returns -1 if fewer than 88 bits remain, else 0. */
++static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx,
++                                      PTLCommon * const ptl)
++{
++    int i;
++
++    if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12) /* sum of every field read below */
++        return -1;
++
++    ptl->profile_space = get_bits(gb, 2);
++    ptl->tier_flag     = get_bits1(gb);
++    ptl->profile_idc   = get_bits(gb, 5);
++    if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN)
++        av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n");
++    else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10)
++        av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
++    else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
++        av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
++    else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
++        av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
++    else
++        av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
++
++    for (i = 0; i < 32; i++) {
++        ptl->profile_compatibility_flag[i] = get_bits1(gb);
++
++        if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i]) /* idc 0: adopt first set flag */
++            ptl->profile_idc = i;
++    }
++    ptl->progressive_source_flag    = get_bits1(gb);
++    ptl->interlaced_source_flag     = get_bits1(gb);
++    ptl->non_packed_constraint_flag = get_bits1(gb);
++    ptl->frame_only_constraint_flag = get_bits1(gb);
++
++    skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
++    skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
++    skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
++
++    return 0;
++}
++/* Parse the general PTL plus optional per-sub-layer PTLs into ptl; returns 0, or -1 if data runs short. */
++static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx,
++                      PTL * const ptl, const int max_num_sub_layers)
++{
++    int i;
++    if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
++        get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) { /* level_idc + 8 flag pairs if any sub-layer */
++        av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
++        return -1;
++    }
++
++    ptl->general_ptl.level_idc = get_bits(gb, 8);
++
++    for (i = 0; i < max_num_sub_layers - 1; i++) {
++        ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
++        ptl->sub_layer_level_present_flag[i]   = get_bits1(gb);
++    }
++
++    if (max_num_sub_layers - 1> 0)
++        for (i = max_num_sub_layers - 1; i < 8; i++) /* pad the flag pairs read above out to 8 */
++            skip_bits(gb, 2); // reserved_zero_2bits[i]
++    for (i = 0; i < max_num_sub_layers - 1; i++) {
++        if (ptl->sub_layer_profile_present_flag[i] &&
++            decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "PTL information for sublayer %i too short\n", i);
++            return -1;
++        }
++        if (ptl->sub_layer_level_present_flag[i]) {
++            if (get_bits_left(gb) < 8) {
++                av_log(avctx, AV_LOG_ERROR,
++                       "Not enough data for sublayer %i level_idc\n", i);
++                return -1;
++            } else
++                ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
++        }
++    }
++
++    return 0;
++}
++/* Read and discard nb_cpb sub-layer HRD entries from gb; nothing is stored. */
++static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb,
++                                const int subpic_params_present)
++{
++    int i;
++
++    for (i = 0; i < nb_cpb; i++) {
++        get_ue_golomb_long(gb); // bit_rate_value_minus1
++        get_ue_golomb_long(gb); // cpb_size_value_minus1
++
++        if (subpic_params_present) {
++            get_ue_golomb_long(gb); // cpb_size_du_value_minus1
++            get_ue_golomb_long(gb); // bit_rate_du_value_minus1
++        }
++        skip_bits1(gb); // cbr_flag
++    }
++}
++/* Step through an HRD parameter structure in gb without storing it; returns 0 or AVERROR_INVALIDDATA. */
++static int decode_hrd(GetBitContext * const gb, const int common_inf_present,
++                      const int max_sublayers)
++{
++    int nal_params_present = 0, vcl_params_present = 0;
++    int subpic_params_present = 0;
++    int i;
++
++    if (common_inf_present) {
++        nal_params_present = get_bits1(gb);
++        vcl_params_present = get_bits1(gb);
++
++        if (nal_params_present || vcl_params_present) {
++            subpic_params_present = get_bits1(gb);
++
++            if (subpic_params_present) {
++                skip_bits(gb, 8); // tick_divisor_minus2
++                skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1
++                skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag
++                skip_bits(gb, 5); // dpb_output_delay_du_length_minus1
++            }
++
++            skip_bits(gb, 4); // bit_rate_scale
++            skip_bits(gb, 4); // cpb_size_scale
++
++            if (subpic_params_present)
++                skip_bits(gb, 4);  // cpb_size_du_scale
++
++            skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1
++            skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1
++            skip_bits(gb, 5); // dpb_output_delay_length_minus1
++        }
++    }
++
++    for (i = 0; i < max_sublayers; i++) {
++        int low_delay = 0;
++        unsigned int nb_cpb = 1;
++        int fixed_rate = get_bits1(gb);
++
++        if (!fixed_rate)
++            fixed_rate = get_bits1(gb); /* second flag is only read when the first is 0 */
++
++        if (fixed_rate)
++            get_ue_golomb_long(gb);  // elemental_duration_in_tc_minus1
++        else
++            low_delay = get_bits1(gb);
++
++        if (!low_delay) {
++            nb_cpb = get_ue_golomb_long(gb) + 1;
++            if (nb_cpb < 1 || nb_cpb > 32) { /* < 1 catches wrap-around of the + 1 */
++                av_log(NULL, AV_LOG_ERROR, "nb_cpb %u invalid\n", nb_cpb); /* %u: nb_cpb is unsigned */
++                return AVERROR_INVALIDDATA;
++            }
++        }
++
++        if (nal_params_present)
++            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
++        if (vcl_params_present)
++            decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
++    }
++    return 0;
++}
++/* Parse a VPS NAL from gb and install it in ps->vps_list[vps_id]; 0 on success, <0 on error. */
++int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx,
++                           HEVCRpiParamSets * const ps)
++{
++    int i,j;
++    int vps_id = 0;
++    ptrdiff_t nal_size;
++    HEVCRpiVPS *vps;
++    AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
++
++    if (!vps_buf)
++        return AVERROR(ENOMEM);
++    vps = (HEVCRpiVPS*)vps_buf->data;
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
++
++    nal_size = gb->buffer_end - gb->buffer;
++    if (nal_size > sizeof(vps->data)) {
++        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
++               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++               nal_size, sizeof(vps->data));
++        vps->data_size = sizeof(vps->data);
++    } else {
++        vps->data_size = nal_size;
++    }
++    memcpy(vps->data, gb->buffer, vps->data_size); /* keep a raw (possibly truncated) copy of the NAL */
++
++    vps_id = get_bits(gb, 4);
++    if (vps_id >= HEVC_MAX_VPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
++        goto err;
++    }
++
++    if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
++        av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
++        goto err;
++    }
++
++    vps->vps_max_layers               = get_bits(gb, 6) + 1;
++    vps->vps_max_sub_layers           = get_bits(gb, 3) + 1;
++    vps->vps_temporal_id_nesting_flag = get_bits1(gb);
++
++    if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
++        av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
++        goto err;
++    }
++
++    if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) {
++        av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
++               vps->vps_max_sub_layers);
++        goto err;
++    }
++
++    if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
++        goto err;
++
++    vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
++
++    i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1; /* flag clear: last sub-layer only */
++    for (; i < vps->vps_max_sub_layers; i++) {
++        vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1;
++        vps->vps_num_reorder_pics[i]      = get_ue_golomb_long(gb);
++        vps->vps_max_latency_increase[i]  = get_ue_golomb_long(gb) - 1;
++
++        if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
++            av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
++                   vps->vps_max_dec_pic_buffering[i] - 1);
++            goto err;
++        }
++        if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) {
++            av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n",
++                   vps->vps_num_reorder_pics[i]);
++            if (avctx->err_recognition & AV_EF_EXPLODE) /* otherwise this is only a warning */
++                goto err;
++        }
++    }
++
++    vps->vps_max_layer_id   = get_bits(gb, 6);
++    vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
++    if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
++        (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
++        av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
++        goto err;
++    }
++
++    for (i = 1; i < vps->vps_num_layer_sets; i++)
++        for (j = 0; j <= vps->vps_max_layer_id; j++)
++            skip_bits(gb, 1);  // layer_id_included_flag[i][j]
++
++    vps->vps_timing_info_present_flag = get_bits1(gb);
++    if (vps->vps_timing_info_present_flag) {
++        vps->vps_num_units_in_tick               = get_bits_long(gb, 32);
++        vps->vps_time_scale                      = get_bits_long(gb, 32);
++        vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
++        if (vps->vps_poc_proportional_to_timing_flag)
++            vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
++        vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
++        if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
++            goto err;
++        }
++        for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
++            int common_inf_present = 1;
++
++            get_ue_golomb_long(gb); // hrd_layer_set_idx
++            if (i)
++                common_inf_present = get_bits1(gb);
++            decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers); /* NOTE(review): return value is ignored */
++        }
++    }
++    get_bits1(gb); /* vps_extension_flag */
++
++    if (get_bits_left(gb) < 0) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Overread VPS by %d bits\n", -get_bits_left(gb));
++        if (ps->vps_list[vps_id]) /* only rejected when a VPS is already stored under this id */
++            goto err;
++    }
++
++    if (ps->vps_list[vps_id] &&
++        !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
++        av_buffer_unref(&vps_buf); /* identical to the stored VPS: keep the old buffer */
++    } else {
++        remove_vps(ps, vps_id); /* also drops the SPS (and their PPS) that referenced the old VPS */
++        ps->vps_list[vps_id] = vps_buf;
++    }
++
++    return 0;
++
++err:
++    av_buffer_unref(&vps_buf);
++    return AVERROR_INVALIDDATA;
++}
++
++static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx,
++                       const int apply_defdispwin, HEVCRpiSPS * const sps)
++{
++    VUI backup_vui, * const vui = &sps->vui;
++    GetBitContext backup;
++    int sar_present, alt = 0;
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
++
++    sar_present = get_bits1(gb);
++    if (sar_present) {
++        uint8_t sar_idx = get_bits(gb, 8);
++        if (sar_idx < FF_ARRAY_ELEMS(vui_sar))
++            vui->sar = vui_sar[sar_idx];
++        else if (sar_idx == 255) {
++            vui->sar.num = get_bits(gb, 16);
++            vui->sar.den = get_bits(gb, 16);
++        } else
++            av_log(avctx, AV_LOG_WARNING,
++                   "Unknown SAR index: %u.\n", sar_idx);
++    }
++
++    vui->overscan_info_present_flag = get_bits1(gb);
++    if (vui->overscan_info_present_flag)
++        vui->overscan_appropriate_flag = get_bits1(gb);
++
++    vui->video_signal_type_present_flag = get_bits1(gb);
++    if (vui->video_signal_type_present_flag) {
++        vui->video_format                    = get_bits(gb, 3);
++        vui->video_full_range_flag           = get_bits1(gb);
++        vui->colour_description_present_flag = get_bits1(gb);
++        if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
++            sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
++        if (vui->colour_description_present_flag) {
++            vui->colour_primaries        = get_bits(gb, 8);
++            vui->transfer_characteristic = get_bits(gb, 8);
++            vui->matrix_coeffs           = get_bits(gb, 8);
++
++            // Set invalid values to "unspecified"
++            if (!av_color_primaries_name(vui->colour_primaries))
++                vui->colour_primaries = AVCOL_PRI_UNSPECIFIED;
++            if (!av_color_transfer_name(vui->transfer_characteristic))
++                vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
++            if (!av_color_space_name(vui->matrix_coeffs))
++                vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
++            if (vui->matrix_coeffs == AVCOL_SPC_RGB) {
++                switch (sps->pix_fmt) {
++                case AV_PIX_FMT_YUV444P:
++                    sps->pix_fmt = AV_PIX_FMT_GBRP;
++                    break;
++                case AV_PIX_FMT_YUV444P10:
++                    sps->pix_fmt = AV_PIX_FMT_GBRP10;
++                    break;
++                case AV_PIX_FMT_YUV444P12:
++                    sps->pix_fmt = AV_PIX_FMT_GBRP12;
++                    break;
++                }
++            }
++        }
++    }
++
++    vui->chroma_loc_info_present_flag = get_bits1(gb);
++    if (vui->chroma_loc_info_present_flag) {
++        vui->chroma_sample_loc_type_top_field    = get_ue_golomb_long(gb);
++        vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb);
++    }
++
++    vui->neutra_chroma_indication_flag = get_bits1(gb);
++    vui->field_seq_flag                = get_bits1(gb);
++    vui->frame_field_info_present_flag = get_bits1(gb);
++
++    // Backup context in case an alternate header is detected
++    memcpy(&backup, gb, sizeof(backup));
++    memcpy(&backup_vui, vui, sizeof(backup_vui));
++    if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
++        vui->default_display_window_flag = 0;
++        av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
++    } else
++        vui->default_display_window_flag = get_bits1(gb);
++
++    if (vui->default_display_window_flag) {
++        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
++        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
++        vui->def_disp_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
++        vui->def_disp_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
++        vui->def_disp_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
++        vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
++
++        if (apply_defdispwin &&
++            avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
++            av_log(avctx, AV_LOG_DEBUG,
++                   "discarding vui default display window, "
++                   "original values are l:%u r:%u t:%u b:%u\n",
++                   vui->def_disp_win.left_offset,
++                   vui->def_disp_win.right_offset,
++                   vui->def_disp_win.top_offset,
++                   vui->def_disp_win.bottom_offset);
++
++            vui->def_disp_win.left_offset   =
++            vui->def_disp_win.right_offset  =
++            vui->def_disp_win.top_offset    =
++            vui->def_disp_win.bottom_offset = 0;
++        }
++    }
++
++timing_info:
++    vui->vui_timing_info_present_flag = get_bits1(gb);
++
++    if (vui->vui_timing_info_present_flag) {
++        if( get_bits_left(gb) < 66 && !alt) {
++            // The alternate syntax seem to have timing info located
++            // at where def_disp_win is normally located
++            av_log(avctx, AV_LOG_WARNING,
++                   "Strange VUI timing information, retrying...\n");
++            memcpy(vui, &backup_vui, sizeof(backup_vui));
++            memcpy(gb, &backup, sizeof(backup));
++            alt = 1;
++            goto timing_info;
++        }
++        vui->vui_num_units_in_tick               = get_bits_long(gb, 32);
++        vui->vui_time_scale                      = get_bits_long(gb, 32);
++        if (alt) {
++            av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
++                   vui->vui_time_scale, vui->vui_num_units_in_tick);
++        }
++        vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
++        if (vui->vui_poc_proportional_to_timing_flag)
++            vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
++        vui->vui_hrd_parameters_present_flag = get_bits1(gb);
++        if (vui->vui_hrd_parameters_present_flag)
++            decode_hrd(gb, 1, sps->max_sub_layers);
++    }
++
++    vui->bitstream_restriction_flag = get_bits1(gb);
++    if (vui->bitstream_restriction_flag) {
++        if (get_bits_left(gb) < 8 && !alt) {
++            av_log(avctx, AV_LOG_WARNING,
++                   "Strange VUI bitstream restriction information, retrying"
++                   " from timing information...\n");
++            memcpy(vui, &backup_vui, sizeof(backup_vui));
++            memcpy(gb, &backup, sizeof(backup));
++            alt = 1;
++            goto timing_info;
++        }
++        vui->tiles_fixed_structure_flag              = get_bits1(gb);
++        vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
++        vui->restricted_ref_pic_lists_flag           = get_bits1(gb);
++        vui->min_spatial_segmentation_idc            = get_ue_golomb_long(gb);
++        vui->max_bytes_per_pic_denom                 = get_ue_golomb_long(gb);
++        vui->max_bits_per_min_cu_denom               = get_ue_golomb_long(gb);
++        vui->log2_max_mv_length_horizontal           = get_ue_golomb_long(gb);
++        vui->log2_max_mv_length_vertical             = get_ue_golomb_long(gb);
++    }
++
++    if (get_bits_left(gb) < 1 && !alt) {
++        // XXX: Alternate syntax when sps_range_extension_flag != 0?
++        av_log(avctx, AV_LOG_WARNING,
++               "Overread in VUI, retrying from timing information...\n");
++        memcpy(vui, &backup_vui, sizeof(backup_vui));
++        memcpy(gb, &backup, sizeof(backup));
++        alt = 1;
++        goto timing_info;
++    }
++}
++
++static void set_default_scaling_list_data(ScalingList * const sl)
++{   // Load the default value into every scaling list and DC entry of *sl
++    int matrixId;
++
++    for (matrixId = 0; matrixId < 6; matrixId++) {
++        // 4x4 default is 16 (all 16 bytes of each size-0 list)
++        memset(sl->sl[0][matrixId], 16, 16);
++        sl->sl_dc[0][matrixId] = 16; // default for 16x16
++        sl->sl_dc[1][matrixId] = 16; // default for 32x32
++    }
++    // Sizes 1..3 hold 64 bytes per list: matrices 0-2 take the intra table, 3-5 the inter table
++    memcpy(sl->sl[1][0], default_scaling_list_intra, 64);
++    memcpy(sl->sl[1][1], default_scaling_list_intra, 64);
++    memcpy(sl->sl[1][2], default_scaling_list_intra, 64);
++
++    memcpy(sl->sl[1][3], default_scaling_list_inter, 64);
++    memcpy(sl->sl[1][4], default_scaling_list_inter, 64);
++    memcpy(sl->sl[1][5], default_scaling_list_inter, 64);
++
++    memcpy(sl->sl[2][0], default_scaling_list_intra, 64);
++    memcpy(sl->sl[2][1], default_scaling_list_intra, 64);
++    memcpy(sl->sl[2][2], default_scaling_list_intra, 64);
++
++    memcpy(sl->sl[2][3], default_scaling_list_inter, 64);
++    memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
++    memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
++
++    memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
++    memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
++    memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
++
++    memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
++    memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
++    memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
++}
++
++static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl,
++                             const HEVCRpiSPS * const sps)
++{   // Read scaling lists from gb into *sl; returns 0, or AVERROR_INVALIDDATA on a bad delta
++    uint8_t scaling_list_pred_mode_flag;
++    int32_t scaling_list_dc_coef[2][6];
++    int size_id, matrix_id, pos;
++    int i;
++
++    for (size_id = 0; size_id < 4; size_id++)
++        for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {  // size 3 visits matrices 0 and 3 only
++            scaling_list_pred_mode_flag = get_bits1(gb);
++            if (!scaling_list_pred_mode_flag) {
++                unsigned int delta = get_ue_golomb_long(gb);
++                /* Only need to handle non-zero delta. Zero means default,
++                 * which should already be in the arrays. */
++                if (delta) {
++                    // Copy from previous array.
++                    delta *= (size_id == 3) ? 3 : 1;
++                    if (matrix_id < delta) {
++                        av_log(avctx, AV_LOG_ERROR,
++                               "Invalid delta in scaling list data: %u.\n", delta);  // %u: delta is unsigned
++                        return AVERROR_INVALIDDATA;
++                    }
++
++                    memcpy(sl->sl[size_id][matrix_id],
++                           sl->sl[size_id][matrix_id - delta],
++                           size_id > 0 ? 64 : 16);
++                    if (size_id > 1)
++                        sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
++                }
++            } else {  // list is coded explicitly as a chain of signed deltas
++                int next_coef, coef_num;
++                int32_t scaling_list_delta_coef;
++
++                next_coef = 8;
++                coef_num  = FFMIN(64, 1 << (4 + (size_id << 1)));  // 16 entries for size 0, otherwise 64
++                if (size_id > 1) {
++                    scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8;
++                    next_coef = scaling_list_dc_coef[size_id - 2][matrix_id];
++                    sl->sl_dc[size_id - 2][matrix_id] = next_coef;
++                }
++                for (i = 0; i < coef_num; i++) {
++                    if (size_id == 0)
++                        pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] +
++                                  ff_hevc_rpi_diag_scan4x4_x[i];
++                    else
++                        pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] +
++                                  ff_hevc_rpi_diag_scan8x8_x[i];
++
++                    scaling_list_delta_coef = get_se_golomb(gb);
++                    next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;  // running value kept modulo 256
++                    sl->sl[size_id][matrix_id][pos] = next_coef;
++                }
++            }
++        }
++
++    if (sps->chroma_format_idc == 3) {  // size-3 matrices 1, 2, 4, 5 were skipped above: copy them from size 2
++        for (i = 0; i < 64; i++) {
++            sl->sl[3][1][i] = sl->sl[2][1][i];
++            sl->sl[3][2][i] = sl->sl[2][2][i];
++            sl->sl[3][4][i] = sl->sl[2][4][i];
++            sl->sl[3][5][i] = sl->sl[2][5][i];
++        }
++        sl->sl_dc[1][1] = sl->sl_dc[0][1];
++        sl->sl_dc[1][2] = sl->sl_dc[0][2];
++        sl->sl_dc[1][4] = sl->sl_dc[0][4];
++        sl->sl_dc[1][5] = sl->sl_dc[0][5];
++    }
++
++
++    return 0;
++}
++
++static int map_pixel_format(HEVCRpiSPS * const sps)
++{   // Derive pix_fmt, chroma shifts and pixel_shift from bit_depth / chroma_format_idc; always returns 0
++    const int cfmt = sps->chroma_format_idc;
++
++    sps->pix_fmt = AV_PIX_FMT_NONE;  // stays NONE for any combination not matched below
++    switch (sps->bit_depth) {
++    case 8:
++        if (cfmt == 1)
++            sps->pix_fmt = AV_PIX_FMT_SAND128;
++        break;
++    case 10:
++        if (cfmt == 1)
++            sps->pix_fmt = AV_PIX_FMT_SAND64_10;
++        break;
++    default:
++        break;
++    }
++
++    sps->hshift[0] = sps->vshift[0] = 0;  // plane 0 is never subsampled
++    sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4
++    sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2
++
++    sps->pixel_shift = sps->bit_depth > 8 ? 1 : 0;  // 1 when bit_depth exceeds 8
++
++    return 0;
++}
++
++static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id,
++                      const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx)
++{   // Parse SPS fields from gb into *sps and the id into *sps_id; returns 0 or a negative AVERROR code
++    HEVCRpiWindow *ow;
++    int ret = 0;
++    int log2_diff_max_min_transform_block_size;
++    int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
++    int i;
++
++    // Coded parameters
++
++    sps->vps_id = get_bits(gb, 4);
++    if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (vps_list && !vps_list[sps->vps_id]) {  // existence is only checked when a list is supplied
++        av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
++               sps->vps_id);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->max_sub_layers = get_bits(gb, 3) + 1;
++    if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
++        av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
++               sps->max_sub_layers);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->temporal_id_nesting_flag = get_bits(gb, 1);
++
++    if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
++        return ret;
++
++    *sps_id = get_ue_golomb_long(gb);
++    if (*sps_id >= HEVC_MAX_SPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %u\n", *sps_id);  // %u: *sps_id is unsigned
++        return AVERROR_INVALIDDATA;
++    }
++
++    sps->chroma_format_idc = get_ue_golomb_long(gb);
++    if (sps->chroma_format_idc > 3U) {
++        av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->chroma_format_idc == 3)
++        sps->separate_colour_plane_flag = get_bits1(gb);
++
++    if (sps->separate_colour_plane_flag)
++        sps->chroma_format_idc = 0;
++
++    sps->width  = get_ue_golomb_long(gb);
++    sps->height = get_ue_golomb_long(gb);
++    if ((ret = av_image_check_size(sps->width,
++                                   sps->height, 0, avctx)) < 0)
++        return ret;
++
++    if (get_bits1(gb)) { // pic_conformance_flag
++        int vert_mult  = 1 + (sps->chroma_format_idc < 2);
++        int horiz_mult = 1 + (sps->chroma_format_idc < 3);
++        sps->pic_conf_win.left_offset   = get_ue_golomb_long(gb) * horiz_mult;
++        sps->pic_conf_win.right_offset  = get_ue_golomb_long(gb) * horiz_mult;
++        sps->pic_conf_win.top_offset    = get_ue_golomb_long(gb) *  vert_mult;
++        sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) *  vert_mult;
++
++        if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
++            av_log(avctx, AV_LOG_DEBUG,
++                   "discarding sps conformance window, "
++                   "original values are l:%u r:%u t:%u b:%u\n",
++                   sps->pic_conf_win.left_offset,
++                   sps->pic_conf_win.right_offset,
++                   sps->pic_conf_win.top_offset,
++                   sps->pic_conf_win.bottom_offset);
++
++            sps->pic_conf_win.left_offset   =
++            sps->pic_conf_win.right_offset  =
++            sps->pic_conf_win.top_offset    =
++            sps->pic_conf_win.bottom_offset = 0;
++        }
++        sps->output_window = sps->pic_conf_win;
++    }
++
++    sps->bit_depth   = get_ue_golomb_long(gb) + 8;  // NOTE(review): no upper bound checked here - confirm it is rejected elsewhere
++    bit_depth_chroma = get_ue_golomb_long(gb) + 8;
++    if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Luma bit depth (%d) is different from chroma bit depth (%d), "
++               "this is unsupported.\n",
++               sps->bit_depth, bit_depth_chroma);
++        return AVERROR_INVALIDDATA;
++    }
++
++    ret = map_pixel_format(sps);
++    if (ret < 0)
++        return ret;
++
++    sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
++    if (sps->log2_max_poc_lsb > 16) {
++        av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
++               sps->log2_max_poc_lsb - 4);
++        return AVERROR_INVALIDDATA;
++    }
++
++    sublayer_ordering_info = get_bits1(gb);
++    start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;  // without the flag only the last layer is coded
++    for (i = start; i < sps->max_sub_layers; i++) {
++        sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
++        sps->temporal_layer[i].num_reorder_pics      = get_ue_golomb_long(gb);
++        sps->temporal_layer[i].max_latency_increase  = get_ue_golomb_long(gb) - 1;
++        if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
++            av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
++                   sps->temporal_layer[i].max_dec_pic_buffering - 1U);
++            return AVERROR_INVALIDDATA;
++        }
++        if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
++            av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
++                   sps->temporal_layer[i].num_reorder_pics);
++            if (avctx->err_recognition & AV_EF_EXPLODE ||
++                sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
++                return AVERROR_INVALIDDATA;
++            }
++            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
++        }
++    }
++
++    if (!sublayer_ordering_info) {  // replicate the single coded layer into all lower layers
++        for (i = 0; i < start; i++) {
++            sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
++            sps->temporal_layer[i].num_reorder_pics      = sps->temporal_layer[start].num_reorder_pics;
++            sps->temporal_layer[i].max_latency_increase  = sps->temporal_layer[start].max_latency_increase;
++        }
++    }
++
++    sps->log2_min_cb_size                    = get_ue_golomb_long(gb) + 3;
++    sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb);
++    sps->log2_min_tb_size                    = get_ue_golomb_long(gb) + 2;
++    log2_diff_max_min_transform_block_size   = get_ue_golomb_long(gb);
++    sps->log2_max_trafo_size                 = log2_diff_max_min_transform_block_size +
++                                               sps->log2_min_tb_size;
++
++    if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size\n", sps->log2_min_cb_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->log2_diff_max_min_coding_block_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size\n", sps->log2_diff_max_min_coding_block_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size\n", log2_diff_max_min_transform_block_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    {
++        const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size;
++        // Not a bitstream limitation, but all profiles
++        if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) {
++            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY\n", CtbLog2SizeY);
++            return AVERROR_INVALIDDATA;
++        }
++
++        if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) {
++            av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY\n", sps->log2_max_trafo_size);
++            return AVERROR_INVALIDDATA;
++        }
++
++        // Inferred parameters
++        sps->log2_ctb_size = CtbLog2SizeY;
++//        sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
++    }
++
++    sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
++    sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
++
++    sps->scaling_list_enable_flag = get_bits1(gb);
++    if (sps->scaling_list_enable_flag) {
++        set_default_scaling_list_data(&sps->scaling_list);
++
++        if (get_bits1(gb)) {  // explicit lists follow and overwrite the defaults
++            ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
++            if (ret < 0)
++                return ret;
++        }
++    }
++
++    sps->amp_enabled_flag = get_bits1(gb);
++    sps->sao_enabled      = get_bits1(gb);
++
++    // Set pcm defaults (0) so we don't have to test _enabled when we
++    // want to use them
++    memset(&sps->pcm, 0, sizeof(sps->pcm));
++
++    if (get_bits1(gb))  // pcm_enabled_flag
++    {
++        const unsigned int limit_max_pcm = FFMIN(5,
++            sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size);
++        sps->pcm.bit_depth   = get_bits(gb, 4) + 1;
++        sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1;
++        sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
++        sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
++                                        get_ue_golomb_long(gb);
++        if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
++                   sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
++            return AVERROR_INVALIDDATA;
++        }
++        if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size ||
++            sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) {
++            av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)\n",
++                   sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size);
++            return AVERROR_INVALIDDATA;
++        }
++
++        sps->pcm.loop_filter_disable_flag = get_bits1(gb);
++    }
++
++    // Could be based on min_pcm_cb_size but much easier logic if we just stick
++    // with 8 (and costs us little)
++    sps->pcm_width = (sps->width + 63) >> 6;  // 8 for min size, 8 bits per byte - round up
++    sps->pcm_height = (sps->height + 7) >> 3;
++
++    sps->nb_st_rps = get_ue_golomb_long(gb);
++    if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
++        av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
++               sps->nb_st_rps);
++        return AVERROR_INVALIDDATA;
++    }
++    for (i = 0; i < sps->nb_st_rps; i++) {
++        if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
++                                                 sps, 0)) < 0)
++            return ret;
++    }
++
++    sps->long_term_ref_pics_present_flag = get_bits1(gb);
++    if (sps->long_term_ref_pics_present_flag) {
++        sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
++        if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
++            av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
++                   sps->num_long_term_ref_pics_sps);
++            return AVERROR_INVALIDDATA;
++        }
++        for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
++            sps->lt_ref_pic_poc_lsb_sps[i]       = get_bits(gb, sps->log2_max_poc_lsb);
++            sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
++        }
++    }
++
++    sps->sps_temporal_mvp_enabled_flag          = get_bits1(gb);
++    sps->intra_filters_disable = get_bits1(gb) ? 0 : FILTER_STRONG; // sps->sps_strong_intra_smoothing_enable_flag
++    sps->vui.sar = (AVRational){0, 1};
++    vui_present = get_bits1(gb);
++    if (vui_present)
++        decode_vui(gb, avctx, apply_defdispwin, sps);
++
++    if (get_bits1(gb)) { // sps_extension_flag
++        int sps_extension_flag[1];
++        for (i = 0; i < 1; i++)
++            sps_extension_flag[i] = get_bits1(gb);
++        skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
++        if (sps_extension_flag[0]) {
++            int extended_precision_processing_flag;
++            int cabac_bypass_alignment_enabled_flag;
++
++            sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
++            sps->transform_skip_context_enabled_flag  = get_bits1(gb);
++            sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
++
++            sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
++
++            extended_precision_processing_flag = get_bits1(gb);
++            if (extended_precision_processing_flag)
++                av_log(avctx, AV_LOG_WARNING,
++                   "extended_precision_processing_flag not yet implemented\n");
++
++            if (get_bits1(gb))          // sps->intra_smoothing_disabled_flag
++                sps->intra_filters_disable |= FILTER_EITHER;
++            sps->high_precision_offsets_enabled_flag = get_bits1(gb);
++            sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
++
++            cabac_bypass_alignment_enabled_flag  = get_bits1(gb);
++            if (cabac_bypass_alignment_enabled_flag)
++                av_log(avctx, AV_LOG_WARNING,
++                   "cabac_bypass_alignment_enabled_flag not yet implemented\n");
++        }
++    }
++    if (apply_defdispwin) {  // fold the VUI default display window into the output window
++        sps->output_window.left_offset   += sps->vui.def_disp_win.left_offset;
++        sps->output_window.right_offset  += sps->vui.def_disp_win.right_offset;
++        sps->output_window.top_offset    += sps->vui.def_disp_win.top_offset;
++        sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
++    }
++
++    ow = &sps->output_window;
++    if (ow->left_offset >= INT_MAX - ow->right_offset     ||
++        ow->top_offset  >= INT_MAX - ow->bottom_offset    ||
++        ow->left_offset + ow->right_offset  >= sps->width ||
++        ow->top_offset  + ow->bottom_offset >= sps->height) {
++        av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
++               ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
++        if (avctx->err_recognition & AV_EF_EXPLODE) {
++            return AVERROR_INVALIDDATA;
++        }
++        av_log(avctx, AV_LOG_WARNING,
++               "Displaying the whole video surface.\n");
++        memset(ow, 0, sizeof(*ow));
++        memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
++    }
++
++    // Inferred parameters
++
++    sps->ctb_width  = (sps->width  + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
++    sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
++    sps->ctb_size   = sps->ctb_width * sps->ctb_height;
++
++    sps->min_cb_width  = sps->width  >> sps->log2_min_cb_size;
++    sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
++    sps->min_tb_width  = sps->width  >> sps->log2_min_tb_size;
++    sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
++    sps->min_pu_width  = sps->width  >> LOG2_MIN_PU_SIZE;
++    sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
++    sps->tb_mask       = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
++
++    sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
++    sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7));
++
++    if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
++        av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
++        av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
++        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
++               sps->max_transform_hierarchy_depth_inter);
++        return AVERROR_INVALIDDATA;
++    }
++    if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
++        av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
++               sps->max_transform_hierarchy_depth_intra);
++        return AVERROR_INVALIDDATA;
++    }
++    if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
++        av_log(avctx, AV_LOG_ERROR,
++               "max transform block size out of range: %d\n",
++               sps->log2_max_trafo_size);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (get_bits_left(gb) < 0) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Overread SPS by %d bits\n", -get_bits_left(gb));
++        return AVERROR_INVALIDDATA;
++    }
++
++    return 0;
++}
++
++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
++                           HEVCRpiParamSets *ps, int apply_defdispwin)
++{   // Parse an SPS from gb and install it in ps->sps_list; returns 0 or a negative AVERROR code
++    HEVCRpiSPS *sps;
++    AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
++    unsigned int sps_id;
++    int ret;
++    ptrdiff_t nal_size;
++
++    if (!sps_buf)
++        return AVERROR(ENOMEM);
++    sps = (HEVCRpiSPS*)sps_buf->data;
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
++
++    nal_size = gb->buffer_end - gb->buffer;  // byte span of the reader's whole buffer
++    if (nal_size > sizeof(sps->data)) {
++        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
++               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++               nal_size, sizeof(sps->data));
++        sps->data_size = sizeof(sps->data);
++    } else {
++        sps->data_size = nal_size;
++    }
++    memcpy(sps->data, gb->buffer, sps->data_size);  // keep a (possibly truncated) raw copy inside the SPS
++
++    ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id,
++                            apply_defdispwin,
++                            ps->vps_list, avctx);
++    if (ret < 0) {
++        av_buffer_unref(&sps_buf);
++        return ret;
++    }
++
++    if (avctx->debug & FF_DEBUG_BITSTREAM) {
++        av_log(avctx, AV_LOG_DEBUG,
++               "Parsed SPS: id %d; coded wxh: %dx%d; "
++               "cropped wxh: %dx%d; pix_fmt: %s.\n",
++               sps_id, sps->width, sps->height,
++               sps->width - (sps->output_window.left_offset + sps->output_window.right_offset),
++               sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset),
++               av_get_pix_fmt_name(sps->pix_fmt));
++    }
++
++    /* check if this is a repeat of an already parsed SPS, then keep the
++     * original one.
++     * otherwise drop all PPSes that depend on it */
++    if (ps->sps_list[sps_id] &&
++        !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) {
++        av_buffer_unref(&sps_buf);  // byte-identical to the stored SPS: discard the new buffer
++    } else {
++        remove_sps(ps, sps_id);
++        ps->sps_list[sps_id] = sps_buf;  // the list now holds our reference; no unref here
++    }
++
++    return 0;
++}
++
++static void hevc_pps_free(void *opaque, uint8_t *data)
++{   // Free the arrays hanging off the PPS stored at data, then the PPS itself; opaque is not used
++    HEVCRpiPPS *pps = (HEVCRpiPPS*)data;
++
++    av_freep(&pps->column_width);
++    av_freep(&pps->row_height);
++    av_freep(&pps->col_bd);
++    av_freep(&pps->row_bd);
++    av_freep(&pps->col_idxX);
++    av_freep(&pps->ctb_addr_rs_to_ts);
++    av_freep(&pps->ctb_addr_ts_to_rs);
++    av_freep(&pps->tile_pos_ts);
++    av_freep(&pps->tile_size);
++    av_freep(&pps->tile_id);
++    av_freep(&pps->ctb_ts_flags);
++
++    av_freep(&pps);  // last: release the structure (the data pointer) itself
++}
++
++static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets)
++{   // Read n_minus_1 + 1 signed values from gb into offsets[]; returns 0, or AVERROR_INVALIDDATA if one is outside [-12, 12]
++    do
++    {
++        const int offset = get_se_golomb_long(gb);
++        if (offset < -12 || offset > 12) {
++            av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset);
++            return AVERROR_INVALIDDATA;
++        }
++        *offsets++ = offset;
++    } while (n_minus_1-- != 0);  // post-decrement test: the body runs n_minus_1 + 1 times
++    return 0;
++}
++
++static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx,
++                                HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
++{   // Read the PPS range-extension fields from gb into *pps; returns 0 or AVERROR_INVALIDDATA
++    if (pps->transform_skip_enabled_flag) {
++        pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
++    }
++    pps->cross_component_prediction_enabled_flag = get_bits1(gb);
++    if (pps->cross_component_prediction_enabled_flag &&
++        (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag))
++    {
++        av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n");
++        return AVERROR_INVALIDDATA;
++    }
++    pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
++    if (pps->chroma_qp_offset_list_enabled_flag) {
++        int err;
++
++        pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
++        pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
++        if (pps->chroma_qp_offset_list_len_minus1 > 5) {
++            av_log(avctx, AV_LOG_ERROR,
++                   "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
++            return AVERROR_INVALIDDATA;
++        }
++        av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n");
++        // The whole cb list is read first, then the whole cr list; the first failure is returned
++        if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 ||
++            (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0)
++            return err;
++    }
++
++    {
++        const unsigned int max_offset = sps->bit_depth > 10 ? sps->bit_depth - 10 : 0;  // 0 for bit depths up to 10
++
++        pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
++        if (pps->log2_sao_offset_scale_luma > max_offset) {
++            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid\n");
++            return AVERROR_INVALIDDATA;
++        }
++        pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
++        if (pps->log2_sao_offset_scale_chroma > max_offset) {
++            av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    return(0);
++}
++
++static inline int setup_pps(AVCodecContext * const avctx,
++                            HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
++{
++    int pic_area_in_ctbs;
++    int i, j, x, y, ctb_addr_rs, tile_id;
++
++    // Inferred parameters: build the per-PPS lookup tables; returns 0 or AVERROR(ENOMEM)
++
++    // qp_y -> qp_u/qp_v tables
++    // The tables have at least -24,+24 overrun after adding offset here
++    // which should allow for clipless offsetting
++
++    pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0;  // No offset for luma, but may be useful for general code
++    pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0;
++
++    if (sps->chroma_format_idc == 1) {
++        pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
++        pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
++    }
++    else
++    {
++        pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
++        pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
++        pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
++    }
++
++    pps->col_bd   = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
++    pps->row_bd   = av_malloc_array(pps->num_tile_rows + 1,    sizeof(*pps->row_bd));
++    pps->col_idxX = av_malloc_array(sps->ctb_width,    sizeof(*pps->col_idxX));
++    if (!pps->col_bd || !pps->row_bd || !pps->col_idxX)
++        return AVERROR(ENOMEM);
++
++    if (pps->uniform_spacing_flag) {
++        if (!pps->column_width) {   // not allocated by the parser when tiles are disabled
++            pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
++            pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
++        }
++        if (!pps->column_width || !pps->row_height)
++            return AVERROR(ENOMEM);
++
++        for (i = 0; i < pps->num_tile_columns; i++) {
++            pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
++                                   (i * sps->ctb_width) / pps->num_tile_columns;
++        }
++
++        for (i = 0; i < pps->num_tile_rows; i++) {
++            pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
++                                 (i * sps->ctb_height) / pps->num_tile_rows;
++        }
++    }
++
++    {
++        const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift);
++        pps->col_bd[0] = 0;
++        pps->tile_wpp_inter_disable = 0;
++        for (i = 0; i < pps->num_tile_columns; i++)
++        {
++            pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
++
++            // Avoid trying tile parallel if the columns don't fall on cache boundaries
++            // (this causes too much pain syncing flushes with the QPU)
++            // Ignore the final (RHS of pic) tile boundary
++            if ((pps->col_bd[i] & td_mask) != 0) {
++                pps->tile_wpp_inter_disable = 1;
++            }
++        }
++
++        // If we can start the next row before finishing the first line of
++        // this one then we must wait at the end of the tile
++        // * if this happens a lot then there are better but more complicated
++        //   conditions that we could apply
++        if (pps->tile_wpp_inter_disable) {
++            for (i = 0; i < pps->num_tile_rows; i++)
++            {
++                if (pps->row_height[i] <= RPI_MAX_JOBS) {
++                    pps->tile_wpp_inter_disable = 2;
++                    break;
++                }
++            }
++        }
++    }
++
++    pps->row_bd[0] = 0;
++    for (i = 0; i < pps->num_tile_rows; i++)
++        pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
++
++    for (i = 0, j = 0; i < sps->ctb_width; i++) {   // CTB column -> tile column index
++        if (i >= pps->col_bd[j + 1])
++            j++;
++        pps->col_idxX[i] = j;
++    }
++
++    /**
++     * 6.5 - CTB raster-scan <-> tile-scan address conversion tables
++     */
++    pic_area_in_ctbs     = sps->ctb_size;
++
++    pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_rs_to_ts));
++    pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_addr_ts_to_rs));
++    pps->tile_id           = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->tile_id));
++    pps->tile_size         = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
++    pps->tile_pos_ts       = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
++    pps->ctb_ts_flags      = av_malloc_array(pic_area_in_ctbs,    sizeof(*pps->ctb_ts_flags));
++    if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs || !pps->ctb_ts_flags ||
++        !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
++        return AVERROR(ENOMEM);  // every table (incl. ctb_ts_flags) is dereferenced below
++    }
++
++    memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
++
++    for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
++        int tb_x   = ctb_addr_rs % sps->ctb_width;
++        int tb_y   = ctb_addr_rs / sps->ctb_width;
++        int tile_x = 0;
++        int tile_y = 0;
++        int val    = 0;
++
++        for (i = 0; i < pps->num_tile_columns; i++) {
++            if (tb_x < pps->col_bd[i + 1]) {
++                tile_x = i;
++                break;
++            }
++        }
++
++        for (i = 0; i < pps->num_tile_rows; i++) {
++            if (tb_y < pps->row_bd[i + 1]) {
++                tile_y = i;
++                break;
++            }
++        }
++
++        for (i = 0; i < tile_x; i++)    // CTBs in tiles to the left within this tile row
++            val += pps->row_height[tile_y] * pps->column_width[i];
++        for (i = 0; i < tile_y; i++)    // CTBs in all complete tile rows above
++            val += sps->ctb_width * pps->row_height[i];
++
++        val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
++               tb_x - pps->col_bd[tile_x];
++
++        pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
++        pps->ctb_addr_ts_to_rs[val]         = ctb_addr_rs;
++    }
++
++    {   // Per-CTB flags and tile ids, filled tile by tile in tile-scan order
++        uint8_t * pflags = pps->ctb_ts_flags;
++        uint16_t * ptid = pps->tile_id;
++
++        for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
++        {
++            for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
++            {
++                const unsigned int tile_w = pps->column_width[i];
++
++                pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++                for (x = 0; x != tile_w; ++x) {
++                    pflags[x] |= CTB_TS_FLAGS_TOT;
++                }
++
++                for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
++                {
++                    pflags[0] |= CTB_TS_FLAGS_SOTL;
++
++                    if (pps->entropy_coding_sync_enabled_flag)
++                    {
++                        if (pps->column_width[i] != 1)
++                            pflags[1] |= CTB_TS_FLAGS_CSAVE;
++                        else
++                            pflags[0] |= CTB_TS_FLAGS_CIREQ;
++
++                        if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
++                            pflags[0] |= CTB_TS_FLAGS_CLOAD;
++                    }
++
++                    for (x = 0; x != tile_w; ++x)
++                        *ptid++ = tile_id;
++
++                    pflags += tile_w;   // step to the next tile line; pflags[-1] is this line's last CTB
++                    pflags[-1] |= CTB_TS_FLAGS_EOTL;
++                    if (i + 1 == pps->num_tile_columns)
++                        pflags[-1] |= CTB_TS_FLAGS_EOL;
++                }
++
++                pflags[-1] |= CTB_TS_FLAGS_EOT;
++            }
++        }
++    }
++
++    {   // Size (in CTBs) and cumulative start offset of each tile
++        unsigned int ts = 0;
++        for (j = 0; j < pps->num_tile_rows; j++)
++            for (i = 0; i < pps->num_tile_columns; i++)
++            {
++                const unsigned int size = pps->column_width[i] * pps->row_height[j];
++                pps->tile_size[j * pps->num_tile_columns + i] = size;
++                pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
++                ts += size;
++            }
++    }
++
++    return 0;
++}
++
++int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx,
++                           HEVCRpiParamSets * const ps)
++{   // Parse one PPS from gb and store it in ps->pps_list[pps_id]; returns 0 or a negative AVERROR
++    const HEVCRpiSPS *sps = NULL;
++    int i, ret = 0;
++    unsigned int pps_id = 0;
++    ptrdiff_t nal_size;
++    unsigned log2_parallel_merge_level_minus2;
++
++    AVBufferRef *pps_buf;   // wraps pps (freed via hevc_pps_free); unref'd on the err path
++    HEVCRpiPPS *pps = av_mallocz(sizeof(*pps));
++
++    if (!pps)
++        return AVERROR(ENOMEM);
++
++    pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps),
++                               hevc_pps_free, NULL, 0);
++    if (!pps_buf) {
++        av_freep(&pps);
++        return AVERROR(ENOMEM);
++    }
++
++    av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
++
++    nal_size = gb->buffer_end - gb->buffer;
++    if (nal_size > sizeof(pps->data)) {
++        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
++               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
++               nal_size, sizeof(pps->data));
++        pps->data_size = sizeof(pps->data);
++    } else {
++        pps->data_size = nal_size;
++    }
++    memcpy(pps->data, gb->buffer, pps->data_size);
++
++    // Default values
++    pps->loop_filter_across_tiles_enabled_flag = 1;
++    pps->num_tile_columns                      = 1;
++    pps->num_tile_rows                         = 1;
++    pps->uniform_spacing_flag                  = 1;
++    pps->disable_dbf                           = 0;
++    pps->beta_offset                           = 0;
++    pps->tc_offset                             = 0;
++    pps->log2_max_transform_skip_block_size    = 2;
++
++    // Coded parameters
++    pps_id = get_ue_golomb_long(gb);
++    if (pps_id >= HEVC_MAX_PPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %u\n", pps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->sps_id = get_ue_golomb_long(gb);
++    if (pps->sps_id >= HEVC_MAX_SPS_COUNT) {
++        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %u\n", pps->sps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    if (!ps->sps_list[pps->sps_id]) {
++        av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data;
++
++    pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
++    pps->output_flag_present_flag              = get_bits1(gb);
++    pps->num_extra_slice_header_bits           = get_bits(gb, 3);
++
++    pps->sign_data_hiding_flag = get_bits1(gb);
++
++    pps->cabac_init_present_flag = get_bits1(gb);
++
++    pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1;
++    if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) {
++        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n");
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1;
++    if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) {
++        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n");
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    pps->pic_init_qp_minus26 = get_se_golomb(gb);
++    if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) {
++        av_log(avctx, AV_LOG_ERROR,
++               "init_qp_minus26 %d is outside the valid range "
++               "[%d, %d].\n",
++               pps->pic_init_qp_minus26,
++               -(26 + sps->qp_bd_offset), 25);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    pps->constrained_intra_pred_flag = get_bits1(gb);
++    pps->transform_skip_enabled_flag = get_bits1(gb);
++
++    pps->cu_qp_delta_enabled_flag = get_bits1(gb);
++    pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size;
++    if (pps->cu_qp_delta_enabled_flag)
++    {
++        const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
++
++        if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
++            av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %u is invalid\n",
++                   diff_cu_qp_delta_depth);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++
++        pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth;
++    }
++
++    pps->cb_qp_offset = get_se_golomb(gb);
++    if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
++        av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
++               pps->cb_qp_offset);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->cr_qp_offset = get_se_golomb(gb);
++    if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
++        av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
++               pps->cr_qp_offset);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
++
++    pps->weighted_pred_flag   = get_bits1(gb);
++    pps->weighted_bipred_flag = get_bits1(gb);
++
++    pps->transquant_bypass_enable_flag    = get_bits1(gb);
++    pps->tiles_enabled_flag               = get_bits1(gb);
++    pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
++
++    if (pps->tiles_enabled_flag) {
++        pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
++        pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
++        if (pps->num_tile_columns <= 0 ||
++            pps->num_tile_columns > sps->ctb_width) {   // each tile column needs >= 1 CTB
++            av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
++                   pps->num_tile_columns - 1);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++        if (pps->num_tile_rows <= 0 ||
++            pps->num_tile_rows > sps->ctb_height) {     // each tile row needs >= 1 CTB
++            av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
++                   pps->num_tile_rows - 1);
++            ret = AVERROR_INVALIDDATA;
++            goto err;
++        }
++
++        pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
++        pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
++        if (!pps->column_width || !pps->row_height) {
++            ret = AVERROR(ENOMEM);
++            goto err;
++        }
++
++        pps->uniform_spacing_flag = get_bits1(gb);
++        if (!pps->uniform_spacing_flag) {
++            uint64_t sum = 0;
++            for (i = 0; i < pps->num_tile_columns - 1; i++) {
++                pps->column_width[i] = get_ue_golomb_long(gb) + 1;
++                sum                 += pps->column_width[i];
++            }
++            if (sum >= sps->ctb_width) {    // must leave at least one CTB for the last column
++                av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
++
++            sum = 0;
++            for (i = 0; i < pps->num_tile_rows - 1; i++) {
++                pps->row_height[i] = get_ue_golomb_long(gb) + 1;
++                sum               += pps->row_height[i];
++            }
++            if (sum >= sps->ctb_height) {   // must leave at least one CTB for the last row
++                av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
++        }
++        pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
++    }
++
++    pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++
++    pps->deblocking_filter_control_present_flag = get_bits1(gb);
++    if (pps->deblocking_filter_control_present_flag) {
++        pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
++        pps->disable_dbf                             = get_bits1(gb);
++        if (!pps->disable_dbf) {
++            int beta_offset_div2 = get_se_golomb(gb);
++            int tc_offset_div2   = get_se_golomb(gb) ;
++            if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
++                av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
++                       beta_offset_div2);
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
++                av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
++                       tc_offset_div2);
++                ret = AVERROR_INVALIDDATA;
++                goto err;
++            }
++            pps->beta_offset = 2 * beta_offset_div2;
++            pps->tc_offset   = 2 *   tc_offset_div2;
++        }
++    }
++
++    pps->scaling_list_data_present_flag = get_bits1(gb);
++    if (pps->scaling_list_data_present_flag) {
++        set_default_scaling_list_data(&pps->scaling_list);
++        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
++        if (ret < 0)
++            goto err;
++    }
++    pps->lists_modification_present_flag = get_bits1(gb);
++    log2_parallel_merge_level_minus2     = get_ue_golomb_long(gb);
++    if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
++        av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %u\n",
++               log2_parallel_merge_level_minus2);
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++    pps->log2_parallel_merge_level       = log2_parallel_merge_level_minus2 + 2;
++
++    pps->slice_header_extension_present_flag = get_bits1(gb);
++
++    if (get_bits1(gb)) { // pps_extension_present_flag
++        int pps_range_extensions_flag = get_bits1(gb);
++        skip_bits(gb, 7); // pps_extension_7bits
++        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
++            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
++                goto err;
++        }
++    }
++
++    ret = setup_pps(avctx, pps, sps);   // derive tile geometry, scan maps and per-CTB flags
++    if (ret < 0)
++        goto err;
++
++    if (get_bits_left(gb) < 0) {
++        av_log(avctx, AV_LOG_ERROR,
++               "Overread PPS by %d bits\n", -get_bits_left(gb));
++        ret = AVERROR_INVALIDDATA;
++        goto err;
++    }
++
++    remove_pps(ps, pps_id);
++    ps->pps_list[pps_id] = pps_buf;
++
++    return 0;
++
++err:
++    av_buffer_unref(&pps_buf);
++    return ret;
++}
++
++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
++{
++    // Result is an MSB part derived from pocTid0 plus the supplied LSB part
++    const int lsb_range = 1 << sps->log2_max_poc_lsb;
++    const int half      = lsb_range / 2;
++    const int ref_lsb   = pocTid0 % lsb_range;
++    const int ref_msb   = pocTid0 - ref_lsb;
++    const int delta     = poc_lsb - ref_lsb;
++    int msb             = ref_msb;
++
++    // For BLA picture types the MSB part is 0, so the result is just poc_lsb
++    if (nal_unit_type == HEVC_NAL_BLA_W_LP || nal_unit_type == HEVC_NAL_BLA_W_RADL ||
++        nal_unit_type == HEVC_NAL_BLA_N_LP)
++        return poc_lsb;
++
++    if (delta < 0 && -delta >= half)
++        msb += lsb_range;       // LSB dropped by at least half the range: MSB steps up
++    else if (delta > 0 && delta > half)
++        msb -= lsb_range;       // LSB rose by more than half the range: MSB steps down
++
++    return msb + poc_lsb;
++}
+diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h
+new file mode 100644
+index 0000000000..c725ebb9ca
+--- /dev/null
++++ b/libavcodec/rpi_hevc_ps.h
+@@ -0,0 +1,449 @@
++/*
++ * HEVC parameter set parsing
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_PS_H
++#define AVCODEC_RPI_HEVC_PS_H
++
++#include <stdint.h>
++
++#include "libavutil/buffer.h"
++#include "libavutil/pixfmt.h"
++#include "libavutil/rational.h"
++
++#include "avcodec.h"
++#include "get_bits.h"
++#include "hevc.h"
++
++typedef struct ShortTermRPS {
++    unsigned int num_negative_pics;
++    int num_delta_pocs;
++    int rps_idx_num_delta_pocs;
++    int32_t delta_poc[32];
++    uint8_t used[32];
++} ShortTermRPS;
++
++typedef struct LongTermRPS {
++    int     poc[32];
++    uint8_t used[32];
++    uint8_t nb_refs;
++} LongTermRPS;
++
++typedef struct RpiSliceHeader {
++    unsigned int pps_id;
++
++    /// address (in raster order) of the first block in the current slice segment
++    unsigned int   slice_segment_addr;
++    /// address (in raster order) of the first block in the current slice
++    unsigned int   slice_addr;
++
++    enum HEVCSliceType slice_type;
++
++    int pic_order_cnt_lsb;
++
++    uint8_t first_slice_in_pic_flag;
++    uint8_t dependent_slice_segment_flag;
++    uint8_t pic_output_flag;
++    uint8_t colour_plane_id;
++
++    ///< RPS coded in the slice header itself is stored here
++    int short_term_ref_pic_set_sps_flag;
++    int short_term_ref_pic_set_size;
++    ShortTermRPS slice_rps;
++    const ShortTermRPS *short_term_rps;
++    int long_term_ref_pic_set_size;
++    LongTermRPS long_term_rps;
++    unsigned int list_entry_lx[2][32];
++
++    uint8_t rpl_modification_flag[2];
++    uint8_t no_output_of_prior_pics_flag;
++    uint8_t slice_temporal_mvp_enabled_flag;
++
++    unsigned int nb_refs[2];
++
++    uint8_t slice_sample_adaptive_offset_flag[3];
++    uint8_t mvd_l1_zero_flag;
++
++    uint8_t cabac_init_flag;
++    uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
++    uint8_t slice_loop_filter_across_slices_enabled_flag;
++    uint8_t collocated_list;
++
++    uint8_t no_dblk_boundary_flags;
++
++    unsigned int collocated_ref_idx;
++
++    int slice_qp_delta;
++    int slice_cb_qp_offset;  // -12, +12
++    int slice_cr_qp_offset;  // -12, +12
++
++    uint8_t cu_chroma_qp_offset_enabled_flag;
++
++    int beta_offset;    ///< beta_offset_div2 * 2
++    int tc_offset;      ///< tc_offset_div2 * 2
++
++    unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
++
++    unsigned *entry_point_offset;
++    int * offset;
++    int * size;
++    int num_entry_point_offsets;
++    int offsets_allocated;
++
++    uint8_t offload_wpp;
++    uint8_t offload_tiles;
++
++    int8_t slice_qp;
++
++    uint8_t luma_log2_weight_denom;
++    uint8_t chroma_log2_weight_denom;
++
++    int16_t luma_weight_l0[16];     // -128, +255
++    int16_t luma_offset_l0[16];
++    int16_t chroma_weight_l0[16][2];
++    int16_t chroma_offset_l0[16][2];
++
++    int16_t luma_weight_l1[16];
++    int16_t luma_offset_l1[16];
++    int16_t chroma_weight_l1[16][2];
++    int16_t chroma_offset_l1[16][2];
++
++} RpiSliceHeader;
++
++typedef struct HEVCRpiWindow {
++    uint16_t left_offset;
++    uint16_t right_offset;
++    uint16_t top_offset;
++    uint16_t bottom_offset;
++} HEVCRpiWindow;
++
++typedef struct VUI {
++    AVRational sar;
++
++    int overscan_info_present_flag;
++    int overscan_appropriate_flag;
++
++    int video_signal_type_present_flag;
++    int video_format;
++    int video_full_range_flag;
++    int colour_description_present_flag;
++    uint8_t colour_primaries;
++    uint8_t transfer_characteristic;
++    uint8_t matrix_coeffs;
++
++    int chroma_loc_info_present_flag;
++    int chroma_sample_loc_type_top_field;
++    int chroma_sample_loc_type_bottom_field;
++    int neutra_chroma_indication_flag;
++
++    int field_seq_flag;
++    int frame_field_info_present_flag;
++
++    int default_display_window_flag;
++    HEVCRpiWindow def_disp_win;
++
++    int vui_timing_info_present_flag;
++    uint32_t vui_num_units_in_tick;
++    uint32_t vui_time_scale;
++    int vui_poc_proportional_to_timing_flag;
++    int vui_num_ticks_poc_diff_one_minus1;
++    int vui_hrd_parameters_present_flag;
++
++    int bitstream_restriction_flag;
++    int tiles_fixed_structure_flag;
++    int motion_vectors_over_pic_boundaries_flag;
++    int restricted_ref_pic_lists_flag;
++    int min_spatial_segmentation_idc;
++    int max_bytes_per_pic_denom;
++    int max_bits_per_min_cu_denom;
++    int log2_max_mv_length_horizontal;
++    int log2_max_mv_length_vertical;
++} VUI;
++
++typedef struct PTLCommon {
++    uint8_t profile_space;
++    uint8_t tier_flag;
++    uint8_t profile_idc;
++    uint8_t profile_compatibility_flag[32];
++    uint8_t level_idc;
++    uint8_t progressive_source_flag;
++    uint8_t interlaced_source_flag;
++    uint8_t non_packed_constraint_flag;
++    uint8_t frame_only_constraint_flag;
++} PTLCommon;
++
++typedef struct PTL {
++    PTLCommon general_ptl;
++    PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];
++
++    uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
++    uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
++} PTL;
++
++typedef struct HEVCRpiVPS {
++    uint8_t vps_temporal_id_nesting_flag;
++    int vps_max_layers;
++    int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
++
++    PTL ptl;
++    int vps_sub_layer_ordering_info_present_flag;
++    unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS];
++    unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS];
++    unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS];
++    int vps_max_layer_id;
++    int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
++    uint8_t vps_timing_info_present_flag;
++    uint32_t vps_num_units_in_tick;
++    uint32_t vps_time_scale;
++    uint8_t vps_poc_proportional_to_timing_flag;
++    int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
++    int vps_num_hrd_parameters;
++
++    uint8_t data[4096];
++    int data_size;
++} HEVCRpiVPS;
++
++typedef struct ScalingList {
++    /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
++     * and size ID 3 only has 2 arrays, not 6. */
++    uint8_t sl[4][6][64];
++    uint8_t sl_dc[2][6];
++} ScalingList;
++
++typedef struct HEVCRpiSPS {
++    unsigned vps_id;
++    uint8_t chroma_format_idc;
++    uint8_t separate_colour_plane_flag;
++
++    HEVCRpiWindow output_window;
++
++    HEVCRpiWindow pic_conf_win;
++
++    uint16_t wp_offset_half_range;  // WpOffsetHalfRange
++
++    uint8_t bit_depth;
++
++//    int bit_depth_chroma;  // We only support lum_bit_depth = chroma_bit_depth
++    uint8_t pixel_shift;
++    enum AVPixelFormat pix_fmt;
++
++    unsigned int log2_max_poc_lsb;
++
++    int max_sub_layers;
++    struct {
++        int max_dec_pic_buffering;
++        int num_reorder_pics;
++        int max_latency_increase;
++    } temporal_layer[HEVC_MAX_SUB_LAYERS];
++    uint8_t temporal_id_nesting_flag;
++
++    uint8_t scaling_list_enable_flag;
++    ScalingList scaling_list;
++
++    unsigned int nb_st_rps;
++    ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS];
++
++    uint8_t amp_enabled_flag;
++    uint8_t sao_enabled;
++
++    uint8_t long_term_ref_pics_present_flag;
++    uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
++    uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
++    uint8_t num_long_term_ref_pics_sps;
++
++    struct {
++        uint8_t bit_depth;
++        uint8_t bit_depth_chroma;
++        uint8_t log2_min_pcm_cb_size;
++        uint8_t log2_max_pcm_cb_size;
++        uint8_t loop_filter_disable_flag;
++    } pcm;
++    char sps_temporal_mvp_enabled_flag;
++//    char sps_strong_intra_smoothing_enable_flag;  -> intra_filters_disable
++
++    uint8_t log2_min_cb_size;  // 3..6
++    uint8_t log2_diff_max_min_coding_block_size;
++    uint8_t log2_min_tb_size;  // 2..5
++    uint8_t log2_max_trafo_size;
++    uint8_t log2_ctb_size;     // 4..6
++//    unsigned int log2_min_pu_size;  // 2..5 (min_cb_size - 1)
++#define LOG2_MIN_PU_SIZE 2
++#define LOG2_MIN_CU_SIZE 3
++
++    uint8_t max_transform_hierarchy_depth_inter;
++    uint8_t max_transform_hierarchy_depth_intra;
++
++    char transform_skip_rotation_enabled_flag;
++    char transform_skip_context_enabled_flag;
++    char implicit_rdpcm_enabled_flag;
++    char explicit_rdpcm_enabled_flag;
++//    char intra_smoothing_disabled_flag;  -> intra_filters_disable
++    char high_precision_offsets_enabled_flag;
++    char persistent_rice_adaptation_enabled_flag;
++
++    uint8_t intra_filters_disable;
++
++    ///< coded frame dimension in various units
++    int width;
++    int height;
++    int ctb_width;
++    int ctb_height;
++    int ctb_size;   // Pic size in CTBs not size of a CTB
++    int min_cb_width;
++    int min_cb_height;
++    int min_tb_width;
++    int min_tb_height;
++    int min_pu_width;
++    int min_pu_height;
++    int pcm_width;
++    int pcm_height;
++    int tb_mask;
++
++    int hshift[3];
++    int vshift[3];
++
++    int qp_bd_offset;
++
++    uint8_t data[4096];
++    int data_size;
++
++    VUI vui;
++    PTL ptl;
++} HEVCRpiSPS;
++
++#define CTB_TS_FLAGS_SOTL       (1U << 0)       // X start of tile line
++#define CTB_TS_FLAGS_EOTL       (1U << 1)       // Last CTB of a tile line
++#define CTB_TS_FLAGS_EOL        (1U << 2)       // Last CTB of a complete line
++#define CTB_TS_FLAGS_EOT        (1U << 3)       // Last CTB of a tile
++#define CTB_TS_FLAGS_CSAVE      (1U << 4)       // entropy_coding_sync: 2nd CTB of a tile line (tile width != 1); presumably CABAC save - TODO confirm
++#define CTB_TS_FLAGS_CIREQ      (1U << 5)       // Cabac init request
++#define CTB_TS_FLAGS_TOT        (1U << 6)       // CTB on top row of a tile
++#define CTB_TS_FLAGS_CLOAD      (1U << 7)       // entropy_coding_sync: 1st CTB of a tile line without CIREQ; presumably CABAC load - TODO confirm
++
++typedef struct HEVCRpiPPS {
++    unsigned int sps_id; ///< seq_parameter_set_id
++
++    uint8_t sign_data_hiding_flag;
++
++    uint8_t cabac_init_present_flag;
++
++    int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
++    int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
++    int pic_init_qp_minus26;    ///< parser accepts -(26 + sps->qp_bd_offset)..25
++
++    uint8_t constrained_intra_pred_flag;
++    uint8_t transform_skip_enabled_flag;
++
++    uint8_t cu_qp_delta_enabled_flag;
++    uint8_t log2_min_cu_qp_delta_size;  ///< sps->log2_ctb_size - diff_cu_qp_delta_depth (depth 0 if cu_qp_delta disabled)
++    int cb_qp_offset;   // -12..12
++    int cr_qp_offset;   // -12..12
++    const uint8_t * qp_dblk_x[3];   ///< set in setup_pps: [0] no chroma offset, [1]/[2] biased by cb/cr_qp_offset
++    const int8_t * qp_bd_x[3];      ///< set in setup_pps from sps->bit_depth: [0] no chroma offset, [1]/[2] biased by cb/cr_qp_offset
++
++    uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
++    uint8_t weighted_pred_flag;
++    uint8_t weighted_bipred_flag;
++    uint8_t output_flag_present_flag;
++    uint8_t transquant_bypass_enable_flag;
++
++    uint8_t dependent_slice_segments_enabled_flag;
++    uint8_t tiles_enabled_flag;
++    uint8_t entropy_coding_sync_enabled_flag;
++
++    uint8_t tile_wpp_inter_disable; ///< 0, 1 or 2; derived in setup_pps from tile column alignment and row heights
++    int num_tile_columns;   ///< num_tile_columns_minus1 + 1
++    int num_tile_rows;      ///< num_tile_rows_minus1 + 1
++    uint8_t uniform_spacing_flag;
++    uint8_t loop_filter_across_tiles_enabled_flag;
++
++    uint8_t seq_loop_filter_across_slices_enabled_flag;
++
++    uint8_t deblocking_filter_control_present_flag;
++    uint8_t deblocking_filter_override_enabled_flag;
++    uint8_t disable_dbf;
++    int beta_offset;    ///< beta_offset_div2 * 2
++    int tc_offset;      ///< tc_offset_div2 * 2
++
++    uint8_t scaling_list_data_present_flag;
++    ScalingList scaling_list;
++
++    uint8_t lists_modification_present_flag;
++    int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
++    int num_extra_slice_header_bits;
++    uint8_t slice_header_extension_present_flag;
++    uint8_t log2_max_transform_skip_block_size;  ///< parser default is 2
++    uint8_t cross_component_prediction_enabled_flag;
++    uint8_t chroma_qp_offset_list_enabled_flag;
++    uint8_t diff_cu_chroma_qp_offset_depth;
++    uint8_t chroma_qp_offset_list_len_minus1;
++    int8_t  cb_qp_offset_list[6];
++    int8_t  cr_qp_offset_list[6];
++    uint8_t log2_sao_offset_scale_luma;
++    uint8_t log2_sao_offset_scale_chroma;
++
++    // Inferred parameters
++    uint16_t *column_width;  ///< ColumnWidth, in CTBs (num_tile_columns entries)
++    uint16_t *row_height;    ///< RowHeight, in CTBs (num_tile_rows entries)
++    uint16_t *col_bd;        ///< ColBd (num_tile_columns + 1 entries)
++    uint16_t *row_bd;        ///< RowBd (num_tile_rows + 1 entries)
++    uint16_t *col_idxX;      ///< tile column index for each CTB column (sps->ctb_width entries)
++
++    // We can limit these to uint16_t given our other size limits
++    uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
++    uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
++    uint16_t *tile_id;           ///< TileId, filled in tile-scan order
++    uint16_t *tile_pos_ts;       ///< per tile: sum of the sizes of all preceding tiles (start offset in tile-scan order)
++    uint16_t *tile_size;         ///< TileSize: column_width * row_height, in CTBs
++    uint8_t * ctb_ts_flags;      ///< CTB_TS_FLAGS_* bits per CTB, filled in tile-scan order
++
++    uint8_t data[4096];          ///< copy of the parser's input buffer, truncated to sizeof(data)
++    int data_size;               ///< number of valid bytes in data
++} HEVCRpiPPS;
++
++typedef struct HEVCRpiParamSets {
++    /* currently active parameter sets */
++    const HEVCRpiVPS *vps;
++    const HEVCRpiSPS *sps;
++    const HEVCRpiPPS *pps;
++    /* stored sets; sps_list is indexed by SPS id and ->data holds an HEVCRpiSPS (vps/pps presumably analogous) */
++    AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
++    AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT];
++    AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT];
++} HEVCRpiParamSets;
++
++int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
++                           HEVCRpiParamSets *ps);
++int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
++                           HEVCRpiParamSets *ps, int apply_defdispwin);
++int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
++                           HEVCRpiParamSets *ps);
++
++int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
++                                  ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
++
++int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
++                           uint8_t *buf, int buf_size);
++
++/**
++ * Compute POC of the current frame and return it.
++ */
++int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
++
++#endif /* AVCODEC_RPI_HEVC_PS_H */
+diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c
+new file mode 100644
+index 0000000000..8cc5796cf0
+--- /dev/null
++++ b/libavcodec/rpi_hevc_refs.c
+@@ -0,0 +1,485 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "internal.h"
++#include "thread.h"
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
++{
++    /* Drop `flags` from a DPB slot; once no flag is left, release its buffers.
++     * Slots without a frame (context init failed) or without data are ignored. */
++    if (frame->frame == NULL || frame->frame->buf[0] == NULL)
++        return;
++
++    frame->flags &= ~flags;
++    if (frame->flags != 0)
++        return;   /* still held for some other purpose */
++
++    ff_thread_release_buffer(s->avctx, &frame->tf);
++    av_buffer_unref(&frame->col_mvf_buf);  /* harmless when already NULL */
++    frame->col_mvf        = NULL;
++    frame->collocated_ref = NULL;
++}
++
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
++{
++    /* Strip the short- and long-term reference flags from every DPB slot. */
++    const int ref_flags = HEVC_FRAME_FLAG_SHORT_REF | HEVC_FRAME_FLAG_LONG_REF;
++    int idx;
++    for (idx = 0; idx < FF_ARRAY_ELEMS(s->DPB); idx++)
++        ff_hevc_rpi_unref_frame(s, &s->DPB[idx], ref_flags);
++}
++
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
++{
++    HEVCRpiFrame *slot;  /* walks every DPB slot */
++    for (slot = s->DPB; slot < s->DPB + FF_ARRAY_ELEMS(s->DPB); slot++)
++        ff_hevc_rpi_unref_frame(s, slot, ~0);  /* ~0: drop all flags */
++}
++
++static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
++{
++    int i, ret;  /* returns a freshly set-up DPB slot, or NULL on error / full DPB */
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        HEVCRpiFrame * const frame = &s->DPB[i];
++        if (frame->frame->buf[0])
++            continue;
++        /* first slot without a buffer: obtain one for it */
++        ret = ff_thread_get_buffer(s->avctx, &frame->tf,
++                                   AV_GET_BUFFER_FLAG_REF);
++        if (ret < 0)
++            return NULL;
++        /* collocated MV storage is attached only when used_for_ref is set and the picture is not IRAP */
++        frame->col_mvf = NULL;
++        frame->col_mvf_buf = NULL;
++        if (s->used_for_ref && !s->is_irap)
++        {
++            frame->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
++            if (!frame->col_mvf_buf)
++                goto fail;
++            frame->col_mvf = (ColMvField *)frame->col_mvf_buf->data;
++        }
++        /* field flags are derived from the picture-timing SEI's picture_struct */
++        frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
++        frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
++
++        return frame;
++        /* pool allocation failed: release everything taken for this slot */
++fail:
++        ff_hevc_rpi_unref_frame(s, frame, ~0);
++        return NULL;
++    }
++    av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
++    return NULL;
++}
++
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
++{
++    HEVCRpiFrame *ref;
++    int i;
++    /* Allocate a DPB slot for the picture with this POC and make it s->ref. */
++    /* check that this POC doesn't already exist */
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        HEVCRpiFrame *frame = &s->DPB[i];
++
++        if (frame->frame->buf[0] && frame->sequence == s->seq_decode &&
++            frame->poc == poc) {
++            av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
++                   poc);
++            return AVERROR_INVALIDDATA;
++        }
++    }
++
++    ref = alloc_frame(s);
++    if (!ref)
++        return AVERROR(ENOMEM);
++    /* hand the AVFrame back to the caller and record the slot as the current picture */
++    *frame = ref->frame;
++    s->ref = ref;
++    /* every new picture starts as a short-term reference; OUTPUT is added when pic_output_flag is set */
++    if (s->sh.pic_output_flag)
++        ref->flags = HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF;
++    else
++        ref->flags = HEVC_FRAME_FLAG_SHORT_REF;
++    /* tag the slot and copy the SPS output window into the frame's crop fields */
++    ref->poc      = poc;
++    ref->sequence = s->seq_decode;
++    ref->frame->crop_left   = s->ps.sps->output_window.left_offset;
++    ref->frame->crop_right  = s->ps.sps->output_window.right_offset;
++    ref->frame->crop_top    = s->ps.sps->output_window.top_offset;
++    ref->frame->crop_bottom = s->ps.sps->output_window.bottom_offset;
++
++    return 0;
++}
++
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
++{
++    do {    /* returns 1 when out was filled, 0 when nothing is output, <0 if av_frame_ref fails */
++        int nb_output = 0;
++        int min_poc   = INT_MAX;
++        int i, min_idx, ret;
++        /* no_output_of_prior_pics: strip OUTPUT from non-bumping frames of the output sequence other than the current POC */
++        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
++            for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++                HEVCRpiFrame *frame = &s->DPB[i];
++                if (!(frame->flags & HEVC_FRAME_FLAG_BUMPING) && frame->poc != s->poc &&
++                        frame->sequence == s->seq_output) {
++                    ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
++                }
++            }
++        }
++        /* count the frames awaiting output and locate the one with the lowest POC */
++        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++            HEVCRpiFrame *frame = &s->DPB[i];
++            if ((frame->flags & HEVC_FRAME_FLAG_OUTPUT) &&
++                frame->sequence == s->seq_output) {
++                nb_output++;
++                if (frame->poc < min_poc || nb_output == 1) {
++                    min_poc = frame->poc;
++                    min_idx = i;
++                }
++            }
++        }
++
++        /* wait for more frames before output */
++        if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
++            nb_output <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
++            return 0;
++
++        if (nb_output) {
++            HEVCRpiFrame *frame = &s->DPB[min_idx];
++            if (frame->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && frame->frame->buf[0]->size == 1)
++                return 0;
++            /* reference the frame into out first, clear its flags, and only then act on ret */
++            ret = av_frame_ref(out, frame->frame);
++            if (frame->flags & HEVC_FRAME_FLAG_BUMPING)
++                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_BUMPING);
++            else
++                ff_hevc_rpi_unref_frame(s, frame, HEVC_FRAME_FLAG_OUTPUT);
++            if (ret < 0)
++                return ret;
++            av_log(s->avctx, AV_LOG_DEBUG,
++                   "Output frame with POC %d.\n", frame->poc);
++            return 1;
++        }
++        /* this sequence is drained: move seq_output on (wrapping at 8 bits) or stop once it equals seq_decode */
++        if (s->seq_output != s->seq_decode)
++            s->seq_output = (s->seq_output + 1) & 0xff;
++        else
++            break;
++    } while (1);
++
++    return 0;
++}
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s)
++{
++    int dpb = 0;
++    int min_poc = INT_MAX;
++    int i;
++    /* count flagged frames of the output sequence, leaving out the current POC */
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        HEVCRpiFrame *frame = &s->DPB[i];
++        if ((frame->flags) &&
++            frame->sequence == s->seq_output &&
++            frame->poc != s->poc) {
++            dpb++;
++        }
++    }
++    /* at or above max_dec_pic_buffering: find the lowest POC among frames whose only flag is OUTPUT */
++    if (s->ps.sps && dpb >= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering) {
++        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++            HEVCRpiFrame *frame = &s->DPB[i];
++            if ((frame->flags) &&
++                frame->sequence == s->seq_output &&
++                frame->poc != s->poc) {
++                if (frame->flags == HEVC_FRAME_FLAG_OUTPUT && frame->poc < min_poc) {
++                    min_poc = frame->poc;
++                }
++            }
++        }
++        /* mark every output-pending frame at or below that POC for bumping */
++        for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++            HEVCRpiFrame *frame = &s->DPB[i];
++            if (frame->flags & HEVC_FRAME_FLAG_OUTPUT &&
++                frame->sequence == s->seq_output &&
++                frame->poc <= min_poc) {
++                frame->flags |= HEVC_FRAME_FLAG_BUMPING;
++            }
++        }
++        /* NOTE(review): dpb is not read again after this decrement */
++        dpb--;
++    }
++}
++
++static int init_slice_rpl(HEVCRpiContext *s)
++{
++    /* Select this slice's rpl_tab entry; INVALIDDATA if slice_idx is out of range. */
++    if (!(s->slice_idx < s->rpl_tab_size))
++        return AVERROR_INVALIDDATA;
++    s->refPicList = &s->rpl_tab[s->slice_idx].refPicList[0];
++    return 0;
++}
++
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
++{
++    RpiSliceHeader *sh = &s->sh;
++
++    uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
++    uint8_t list_idx;
++    int i, j, ret;
++    /* Build refPicList[0] (and [1] for B slices) for the current slice from s->rps. */
++    ret = init_slice_rpl(s);
++    if (ret < 0)
++        return ret;
++    /* at least one "current" reference (ST before/after or LT) is required */
++    if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
++          s->rps[LT_CURR].nb_refs)) {
++        av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    for (list_idx = 0; list_idx < nb_list; list_idx++) {
++        RefPicList  rpl_tmp = { { 0 } };
++        RefPicList *rpl     = &s->refPicList[list_idx];
++
++        /* The order of the elements is
++         * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
++         * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
++        int cand_lists[3] = { list_idx ? ST_CURR_AFT : ST_CURR_BEF,
++                              list_idx ? ST_CURR_BEF : ST_CURR_AFT,
++                              LT_CURR };
++
++        /* concatenate the candidate lists, cycling through them until nb_refs[list_idx] entries exist */
++        while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
++            for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
++                RefPicList *rps = &s->rps[cand_lists[i]];
++                for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
++                    rpl_tmp.list[rpl_tmp.nb_refs]       = rps->list[j];
++                    rpl_tmp.ref[rpl_tmp.nb_refs]        = rps->ref[j];
++                    rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2;
++                    rpl_tmp.nb_refs++;
++                }
++            }
++        }
++
++        /* reorder the references if necessary */
++        if (sh->rpl_modification_flag[list_idx]) {
++            for (i = 0; i < sh->nb_refs[list_idx]; i++) {
++                int idx = sh->list_entry_lx[list_idx][i];
++
++                if (idx >= rpl_tmp.nb_refs) {
++                    av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
++                    return AVERROR_INVALIDDATA;
++                }
++
++                rpl->list[i]       = rpl_tmp.list[idx];
++                rpl->ref[i]        = rpl_tmp.ref[idx];
++                rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
++                rpl->nb_refs++;
++            }
++        } else {
++            memcpy(rpl, &rpl_tmp, sizeof(*rpl));
++            rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
++        }
++        /* record the collocated picture on the current frame when its index is valid for this list */
++        if (sh->collocated_list == list_idx &&
++            sh->collocated_ref_idx < rpl->nb_refs)
++            s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
++    }
++
++    return 0;
++}
++
++static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
++{
++    int i;  /* returns the buffered DPB frame of the decode sequence matching poc, or NULL */
++    int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
++    /* first pass: compare only the POC bits selected by LtMask */
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        HEVCRpiFrame *ref = &s->DPB[i];
++        if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) {
++            if ((ref->poc & LtMask) == poc)
++                return ref;
++        }
++    }
++    /* second pass: accept a match on the full POC as well */
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        HEVCRpiFrame *ref = &s->DPB[i];
++        if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
++            if (ref->poc == poc || (ref->poc & LtMask) == poc)
++                return ref;
++        }
++    }
++    /* a missing reference is only logged when the current NAL is neither CRA nor BLA */
++    if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
++        av_log(s->avctx, AV_LOG_ERROR,
++               "Could not find ref with POC %d\n", poc);
++    return NULL;
++}
++
++static void mark_ref(HEVCRpiFrame *frame, int flag)
++{
++    /* replace the short/long reference bits with `flag`, keep all other bits */
++    frame->flags = (frame->flags & ~(HEVC_FRAME_FLAG_SHORT_REF | HEVC_FRAME_FLAG_LONG_REF)) | flag;
++}
++
++static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
++{
++    HEVCRpiFrame *frame;
++    int i, x, y;
++    /* Create a stand-in frame for a missing reference, filled with 1 << (bit_depth - 1). */
++    frame = alloc_frame(s);
++    if (!frame)
++        return NULL;
++    /* pixel_shift == 0: byte-fill whole buffers; otherwise write 16-bit samples plane by plane */
++    if (!s->ps.sps->pixel_shift) {
++        for (i = 0; frame->frame->buf[i]; i++)
++            memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
++                   frame->frame->buf[i]->size);
++    } else {
++        for (i = 0; frame->frame->data[i]; i++)
++            for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
++                for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
++                    AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x,
++                            1 << (s->ps.sps->bit_depth - 1));
++                }
++    }
++    /* NOTE(review): the stride above is frame_stride1(frame, 1) for every plane i — confirm this is intended */
++    frame->poc      = poc;
++    frame->sequence = s->seq_decode;
++    frame->flags    = 0;
++    /* presumably lets anything waiting on this frame see it as fully decoded — verify against the helper */
++    ff_hevc_rpi_progress_set_all_done(frame);
++
++    return frame;
++}
++
++/* Append the frame with POC `poc` to `list` and flag it with `ref_flag`.
++ * A frame missing from the DPB is replaced by a generated placeholder. */
++static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
++                             int poc, int ref_flag)
++{
++    HEVCRpiFrame *cand = find_ref_idx(s, poc);
++    int slot;
++
++    /* the current picture may not be its own reference, and the list is bounded */
++    if (cand == s->ref || list->nb_refs >= HEVC_MAX_REFS)
++        return AVERROR_INVALIDDATA;
++
++    if (cand == NULL && (cand = generate_missing_ref(s, poc)) == NULL)
++        return AVERROR(ENOMEM);
++
++    slot             = list->nb_refs++;
++    list->list[slot] = cand->poc;
++    list->ref[slot]  = cand;
++
++    mark_ref(cand, ref_flag);
++    return 0;
++}
++
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s)
++{
++    const ShortTermRPS *short_rps = s->sh.short_term_rps;
++    const LongTermRPS  *long_rps  = &s->sh.long_term_rps;
++    RefPicList               *rps = s->rps;
++    int i, ret = 0;
++    /* Rebuild s->rps[] for the current picture and re-flag the DPB frames to match. */
++    if (!short_rps) {
++        rps[0].nb_refs = rps[1].nb_refs = 0;  /* only the first two lists are emptied on this path */
++        return 0;
++    }
++
++    /* clear the reference flags on all frames except the current one */
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        HEVCRpiFrame *frame = &s->DPB[i];
++
++        if (frame == s->ref)
++            continue;
++
++        mark_ref(frame, 0);
++    }
++
++    for (i = 0; i < NB_RPS_TYPE; i++)
++        rps[i].nb_refs = 0;
++
++    /* add the short refs */
++    for (i = 0; i < short_rps->num_delta_pocs; i++) {
++        int poc = s->poc + short_rps->delta_poc[i];
++        int list;
++        /* unused entries go to ST_FOLL; used ones are split by index: the first num_negative_pics, then the rest */
++        if (!short_rps->used[i])
++            list = ST_FOLL;
++        else if (i < short_rps->num_negative_pics)
++            list = ST_CURR_BEF;
++        else
++            list = ST_CURR_AFT;
++
++        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF);
++        if (ret < 0)
++            goto fail;
++    }
++
++    /* add the long refs */
++    for (i = 0; i < long_rps->nb_refs; i++) {
++        int poc  = long_rps->poc[i];
++        int list = long_rps->used[i] ? LT_CURR : LT_FOLL;
++
++        ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF);
++        if (ret < 0)
++            goto fail;
++    }
++    /* reached on success as well as on error; ret carries the outcome */
++fail:
++    /* release any frames that are now unused */
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
++        ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0);
++
++    return ret;
++}
++
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
++{
++    /* Count the entries marked "used" in the slice's short- and long-term RPS. */
++    const ShortTermRPS * const st = s->sh.short_term_rps;
++    LongTermRPS * const lt        = &s->sh.long_term_rps;
++    int count = 0, k = 0;
++
++    if (st != NULL) {
++        while (k < st->num_negative_pics)   /* negative pictures first ... */
++            count += st->used[k++] ? 1 : 0;
++        while (k < st->num_delta_pocs)      /* ... then the remaining deltas */
++            count += st->used[k++] ? 1 : 0;
++    }
++
++    if (lt != NULL) {
++        for (k = 0; k < lt->nb_refs; k++)
++            count += lt->used[k] ? 1 : 0;
++    }
++    return count;
++}
+diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c
+new file mode 100644
+index 0000000000..cd8149d58e
+--- /dev/null
++++ b/libavcodec/rpi_hevc_sei.c
+@@ -0,0 +1,368 @@
++/*
++ * HEVC Supplementary Enhancement Information messages
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2013 Vittorio Giovara
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "golomb.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++
++static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
++{
++    int cIdx, i;
++    uint8_t hash_type;
++    //uint16_t picture_crc;
++    //uint32_t picture_checksum;
++    hash_type = get_bits(gb, 8);
++    /* three components are always read; only MD5 (hash_type 0) is stored, CRC and checksum bits are skipped */
++    for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
++        if (hash_type == 0) {
++            s->is_md5 = 1;
++            for (i = 0; i < 16; i++)
++                s->md5[cIdx][i] = get_bits(gb, 8);
++        } else if (hash_type == 1) {
++            // picture_crc = get_bits(gb, 16);
++            skip_bits(gb, 16);
++        } else if (hash_type == 2) {
++            // picture_checksum = get_bits_long(gb, 32);
++            skip_bits(gb, 32);
++        }
++    }
++    return 0;
++}
++
++static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
++{
++    int i;  // fields are stored in the order read: 3 primaries, white point, max/min luminance
++    // Mastering primaries: an (x, y) pair of 16-bit values for each of the three primaries
++    for (i = 0; i < 3; i++) {
++        s->display_primaries[i][0] = get_bits(gb, 16);
++        s->display_primaries[i][1] = get_bits(gb, 16);
++    }
++    // White point (x, y)
++    s->white_point[0] = get_bits(gb, 16);
++    s->white_point[1] = get_bits(gb, 16);
++
++    // Max and min luminance of mastering display
++    s->max_luminance = get_bits_long(gb, 32);
++    s->min_luminance = get_bits_long(gb, 32);
++
++    // As this SEI message comes before the first frame that references it,
++    // initialize the flag to 2 and decrement on IRAP access unit so it
++    // persists for the coded video sequence (e.g., between two IRAPs)
++    s->present = 2;
++    return 0;
++}
++
++static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
++{
++    // Max content light level, then max picture-average light level (16 bits each)
++    s->max_content_light_level     = get_bits_long(gb, 16);
++    s->max_pic_average_light_level = get_bits_long(gb, 16);
++    // As this SEI message comes before the first frame that references it,
++    // initialize the flag to 2 and decrement on IRAP access unit so it
++    // persists for the coded video sequence (e.g., between two IRAPs)
++    s->present = 2;
++    return  0;
++}
++
++static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
++{
++    get_ue_golomb_long(gb);             // frame_packing_arrangement_id
++    s->present = !get_bits1(gb);
++    // the bit just read is inverted into present: the details below only follow when it was 0
++    if (s->present) {
++        s->arrangement_type               = get_bits(gb, 7);
++        s->quincunx_subsampling           = get_bits1(gb);
++        s->content_interpretation_type    = get_bits(gb, 6);
++
++        // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
++        skip_bits(gb, 3);
++        s->current_frame_is_frame0_flag = get_bits1(gb);
++        // frame0_self_contained_flag, frame1_self_contained_flag
++        skip_bits(gb, 2);
++
++        if (!s->quincunx_subsampling && s->arrangement_type != 5)
++            skip_bits(gb, 16);  // frame[01]_grid_position_[xy]
++        skip_bits(gb, 8);       // frame_packing_arrangement_reserved_byte
++        skip_bits1(gb);         // frame_packing_arrangement_persistence_flag
++    }
++    skip_bits1(gb);             // upsampled_aspect_ratio_flag
++    return 0;
++}
++
++static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
++{
++    s->present = !get_bits1(gb);
++    // the first bit is inverted into present: flip/rotation fields only follow when it was 0
++    if (s->present) {
++        s->hflip = get_bits1(gb);     // hor_flip
++        s->vflip = get_bits1(gb);     // ver_flip
++
++        s->anticlockwise_rotation = get_bits(gb, 16);
++        skip_bits1(gb);     // display_orientation_persistence_flag
++    }
++
++    return 0;
++}
++
++static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps,
++                                     void *logctx, int size)
++{
++    HEVCSEIPictureTiming *h = &s->picture_timing;
++    HEVCRpiSPS *sps;
++    /* the SPS consulted is the one named by the last active-parameter-sets SEI id stored in s */
++    if (!ps->sps_list[s->active_seq_parameter_set_id])
++        return(AVERROR(ENOMEM));  /* NOTE(review): a missing SPS is reported as ENOMEM — confirm intent */
++    sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
++    /* the 4+2+1+1 bits read below make up one payload byte, hence the size-- */
++    if (sps->vui.frame_field_info_present_flag) {
++        int pic_struct = get_bits(gb, 4);
++        h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
++        if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
++            av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
++            h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
++        } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
++            av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
++            h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
++        }
++        get_bits(gb, 2);                   // source_scan_type
++        get_bits(gb, 1);                   // duplicate_flag
++        skip_bits1(gb);
++        size--;
++    }
++    skip_bits_long(gb, 8 * size);          // remaining payload bytes are not interpreted
++
++    return 0;
++}
++
++static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
++                                                      int size)
++{
++    int flag;
++    int user_data_type_code;
++    int cc_count;
++    /* Type code 0x3 carries cc_count 3-byte caption triplets, appended to s->a53_caption. */
++    if (size < 3)
++       return AVERROR(EINVAL);
++
++    user_data_type_code = get_bits(gb, 8);
++    if (user_data_type_code == 0x3) {
++        skip_bits(gb, 1); // reserved
++
++        flag = get_bits(gb, 1); // process_cc_data_flag
++        if (flag) {
++            skip_bits(gb, 1);
++            cc_count = get_bits(gb, 5);
++            skip_bits(gb, 8); // reserved
++            size -= 2;
++            /* nothing is stored unless the remaining size covers all cc_count triplets */
++            if (cc_count && size >= cc_count * 3) {
++                const uint64_t new_size = (s->a53_caption_size + cc_count
++                                           * UINT64_C(3));
++                int i, ret;
++                /* new_size is computed in 64 bits, so the INT_MAX comparison cannot wrap */
++                if (new_size > INT_MAX)
++                    return AVERROR(EINVAL);
++
++                /* Allow merging of the cc data from two fields. */
++                ret = av_reallocp(&s->a53_caption, new_size);
++                if (ret < 0)
++                    return ret;
++                /* copy the triplets; a53_caption_size advances with every stored byte */
++                for (i = 0; i < cc_count; i++) {
++                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++                    s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
++                }
++                skip_bits(gb, 8); // marker_bits
++            }
++        }
++    } else {
++        int i;
++        for (i = 0; i < size - 1; i++)
++            skip_bits(gb, 8);
++    }
++
++    return 0;
++}
++
++static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
++                                                         int size)
++{
++    /* size is the payload length in bytes. The fixed part read here is 7 bytes:
++     * country code (1), provider code (2) and user identifier (4). */
++    uint32_t country_code, user_identifier;
++
++    if (size < 7)
++        return AVERROR(EINVAL);
++    size -= 7;
++
++    country_code = get_bits(gb, 8);
++    if (country_code == 0xFF) {
++        if (size < 1)  /* the extension byte has to fit in the payload as well */
++            return AVERROR(EINVAL);
++        skip_bits(gb, 8);
++        size--;
++    }
++
++    skip_bits(gb, 16);  /* provider code */
++    user_identifier = get_bits_long(gb, 32);
++    switch (user_identifier) {
++        case MKBETAG('G', 'A', '9', '4'):
++            return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
++        default:
++            skip_bits_long(gb, size * 8);
++            break;
++    }
++    return 0;
++}
++
++static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
++{
++    int num_sps_ids_minus1;
++    int i;
++    unsigned active_seq_parameter_set_id;
++    /* Keeps the first listed SPS id as the active one; the other header fields are read and dropped. */
++    get_bits(gb, 4); // active_video_parameter_set_id
++    get_bits(gb, 1); // self_contained_cvs_flag
++    get_bits(gb, 1); // no_parameter_set_update_flag
++    num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
++
++    if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
++        av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
++        return AVERROR_INVALIDDATA;
++    }
++
++    active_seq_parameter_set_id = get_ue_golomb_long(gb);
++    if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
++        av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %u invalid\n", active_seq_parameter_set_id);
++        return AVERROR_INVALIDDATA;
++    }
++    s->active_seq_parameter_set_id = active_seq_parameter_set_id;
++
++    for (i = 1; i <= num_sps_ids_minus1; i++)
++        get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
++
++    return 0;
++}
++
++static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
++{
++    s->preferred_transfer_characteristics = get_bits(gb, 8);  /* single 8-bit field */
++    s->present = 1;                                           /* set whenever this SEI is parsed */
++    return 0;
++}
++
++static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
++                                 int type, int size)
++{
++    switch (type) {  // payload type -> parser; unhandled types are logged and skipped by size bytes
++    case 256:  // Mismatched value from HM 8.1
++        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
++    case HEVC_SEI_TYPE_FRAME_PACKING:
++        return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
++    case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
++        return decode_nal_sei_display_orientation(&s->display_orientation, gb);
++    case HEVC_SEI_TYPE_PICTURE_TIMING:
++        return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
++    case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
++        return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
++    case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
++        return decode_nal_sei_content_light_info(&s->content_light, gb);
++    case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
++        return decode_nal_sei_active_parameter_sets(s, gb, logctx);
++    case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
++        return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
++    case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
++        return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
++    default:
++        av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
++        skip_bits_long(gb, 8 * size);
++        return 0;
++    }
++}
++
++static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                                 int type, int size)
++{
++    /* Only the decoded picture hash is parsed from suffix SEI; any other
++     * payload is logged at debug level and skipped by its byte size. */
++    if (type == HEVC_SEI_TYPE_DECODED_PICTURE_HASH)
++        return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
++
++    av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
++    skip_bits_long(gb, 8 * size);
++    return 0;
++}
++
++static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
++                                  const HEVCRpiParamSets * const ps, const int nal_unit_type)
++{
++    int payload_type = 0;
++    int payload_size = 0;
++    int byte;
++    av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
++
++    /* payload type: sum of the bytes up to and including the first one that is not 0xFF */
++    do {
++        if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
++            return AVERROR_INVALIDDATA;
++        byte          = get_bits(gb, 8);
++        payload_type += byte;
++    } while (byte == 0xFF);
++    /* payload size is coded the same way; the data read so far must still fit in the buffer */
++    do {
++        if (get_bits_left(gb) < 8 + 8LL * payload_size)
++            return AVERROR_INVALIDDATA;
++        byte          = get_bits(gb, 8);
++        payload_size += byte;
++    } while (byte == 0xFF);
++
++    if (nal_unit_type != HEVC_NAL_SEI_PREFIX)  /* i.e. NAL_SEI_SUFFIX */
++        return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
++    return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
++}
++
++static int more_rbsp_data(GetBitContext *gb)
++{
++    return !(get_bits_left(gb) <= 0 || show_bits(gb, 8) == 0x80);  /* stop at end of data or a 0x80 byte */
++}
++
++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                           const HEVCRpiParamSets *ps, int type)
++{
++    /* Parse messages until the RBSP is exhausted: 1 on success, <0 on first error. */
++    for (;;) {
++        const int err = decode_nal_sei_message(gb, logctx, s, ps, type);
++        if (err < 0)
++            return err;
++        if (!more_rbsp_data(gb))
++            return 1;
++    }
++}
++
++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
++{
++    av_freep(&s->a53_caption.a53_caption);   /* release the buffered caption bytes */
++    s->a53_caption.a53_caption_size = 0;     /* and reset their recorded length */
++}
+diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h
+new file mode 100644
+index 0000000000..d4ac348df9
+--- /dev/null
++++ b/libavcodec/rpi_hevc_sei.h
+@@ -0,0 +1,135 @@
++/*
++ * HEVC Supplementary Enhancement Information messages
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVC_SEI_H
++#define AVCODEC_RPI_HEVC_SEI_H
++
++#include <stdint.h>
++
++#include "libavutil/md5.h"
++
++#include "get_bits.h"
++
++/**
++ * SEI message types; every enumerator has an explicit value and the numbering is sparse
++ */
++typedef enum {
++    HEVC_SEI_TYPE_BUFFERING_PERIOD                     = 0,
++    HEVC_SEI_TYPE_PICTURE_TIMING                       = 1,
++    HEVC_SEI_TYPE_PAN_SCAN_RECT                        = 2,
++    HEVC_SEI_TYPE_FILLER_PAYLOAD                       = 3,
++    HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35       = 4,
++    HEVC_SEI_TYPE_USER_DATA_UNREGISTERED               = 5,
++    HEVC_SEI_TYPE_RECOVERY_POINT                       = 6,
++    HEVC_SEI_TYPE_SCENE_INFO                           = 9,
++    HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT                  = 15,
++    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
++    HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END   = 17,
++    HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS           = 19,
++    HEVC_SEI_TYPE_POST_FILTER_HINT                     = 22,
++    HEVC_SEI_TYPE_TONE_MAPPING_INFO                    = 23,
++    HEVC_SEI_TYPE_FRAME_PACKING                        = 45,
++    HEVC_SEI_TYPE_DISPLAY_ORIENTATION                  = 47,
++    HEVC_SEI_TYPE_SOP_DESCRIPTION                      = 128,
++    HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS                = 129,
++    HEVC_SEI_TYPE_DECODING_UNIT_INFO                   = 130,
++    HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX                = 131,
++    HEVC_SEI_TYPE_DECODED_PICTURE_HASH                 = 132,
++    HEVC_SEI_TYPE_SCALABLE_NESTING                     = 133,
++    HEVC_SEI_TYPE_REGION_REFRESH_INFO                  = 134,
++    HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO               = 137,
++    HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO             = 144,
++    HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
++} HEVC_SEI_Type;
++
++typedef struct HEVCSEIPictureHash {
++    uint8_t       md5[3][16]; /* three 16-byte entries; presumably one MD5 digest per picture plane -- TODO confirm */
++    uint8_t is_md5;           /* NOTE(review): presumably nonzero when the stored hash is an MD5 -- confirm against the parser */
++} HEVCSEIPictureHash;
++
++typedef struct HEVCSEIFramePacking { /* NOTE(review): presumably filled from HEVC_SEI_TYPE_FRAME_PACKING messages -- confirm */
++    int present;                      /* presumably nonzero once such a message has been parsed -- TODO confirm */
++    int arrangement_type;
++    int content_interpretation_type;
++    int quincunx_subsampling;
++    int current_frame_is_frame0_flag;
++} HEVCSEIFramePacking;
++
++typedef struct HEVCSEIDisplayOrientation { /* NOTE(review): presumably filled from HEVC_SEI_TYPE_DISPLAY_ORIENTATION messages -- confirm */
++    int present;                            /* presumably nonzero once such a message has been parsed -- TODO confirm */
++    int anticlockwise_rotation;             /* units are not visible here -- TODO confirm against the parser */
++    int hflip, vflip;
++} HEVCSEIDisplayOrientation;
++
++typedef struct HEVCSEIPictureTiming { /* NOTE(review): presumably filled from HEVC_SEI_TYPE_PICTURE_TIMING messages -- confirm */
++    int picture_struct;               /* value encoding is not visible here -- TODO confirm against the parser */
++} HEVCSEIPictureTiming;
++
++typedef struct HEVCSEIA53Caption {
++    int a53_caption_size; /* zeroed by ff_hevc_rpi_reset_sei(); presumably the byte length of a53_caption -- TODO confirm */
++    uint8_t *a53_caption; /* released with av_freep() in ff_hevc_rpi_reset_sei() */
++} HEVCSEIA53Caption;
++
++typedef struct HEVCSEIMasteringDisplay { /* NOTE(review): presumably filled from HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO messages -- confirm */
++    int present;                          /* presumably nonzero once such a message has been parsed -- TODO confirm */
++    uint16_t display_primaries[3][2];     /* three pairs of 16-bit values; presumably (x, y) per primary -- TODO confirm */
++    uint16_t white_point[2];              /* one pair of 16-bit values; presumably (x, y) -- TODO confirm */
++    uint32_t max_luminance;               /* units are not visible here -- TODO confirm */
++    uint32_t min_luminance;               /* units are not visible here -- TODO confirm */
++} HEVCSEIMasteringDisplay;
++
++typedef struct HEVCSEIContentLight { /* NOTE(review): presumably filled from HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO messages -- confirm */
++    int present;                      /* presumably nonzero once such a message has been parsed -- TODO confirm */
++    uint16_t max_content_light_level;
++    uint16_t max_pic_average_light_level;
++} HEVCSEIContentLight;
++
++typedef struct HEVCSEIAlternativeTransfer { /* NOTE(review): presumably filled from HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS messages -- confirm */
++    int present;                             /* presumably nonzero once such a message has been parsed -- TODO confirm */
++    int preferred_transfer_characteristics;
++} HEVCSEIAlternativeTransfer;
++
++typedef struct HEVCSEIContext { /* SEI state passed to ff_hevc_rpi_decode_nal_sei() and ff_hevc_rpi_reset_sei() */
++    HEVCSEIPictureHash picture_hash;
++    HEVCSEIFramePacking frame_packing;
++    HEVCSEIDisplayOrientation display_orientation;
++    HEVCSEIPictureTiming picture_timing;
++    HEVCSEIA53Caption a53_caption; /* the only member touched by ff_hevc_rpi_reset_sei() */
++    HEVCSEIMasteringDisplay mastering_display;
++    HEVCSEIContentLight content_light;
++    int active_seq_parameter_set_id; /* NOTE(review): presumably taken from the active parameter sets SEI -- confirm */
++    HEVCSEIAlternativeTransfer alternative_transfer;
++} HEVCSEIContext;
++
++struct HEVCRpiParamSets;
++
++int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
++                           const struct HEVCRpiParamSets *ps, int type); /* returns 1 on success, a negative value on failure */
++
++/**
++ * Reset SEI values that are stored on the context:
++ * the A53 caption size is set to zero and the caption
++ * buffer is freed.
++ *
++ * @param s SEI context (HEVCSEIContext) to reset.
++ */
++void ff_hevc_rpi_reset_sei(HEVCSEIContext *s);
++
++#endif /* AVCODEC_RPI_HEVC_SEI_H */
+diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c
+new file mode 100644
+index 0000000000..23b49a99ae
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.c
+@@ -0,0 +1,1537 @@
++#include "rpi_hevc_shader.h"
++
++#ifdef _MSC_VER
++   #include <stdint.h>
++   /* cast through uintptr_t to avoid warnings */
++   #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
++#else
++   #define POINTER_TO_UINT(X) ((unsigned int)(X))
++#endif
++
++#ifdef __cplusplus
++extern "C" { /* the types are probably wrong... */
++#endif
++#ifdef __cplusplus
++}
++#endif
++
++#ifdef _MSC_VER
++__declspec(align(8))
++#elif defined(__GNUC__)
++__attribute__((aligned(8)))
++#endif
++unsigned int ff_hevc_rpi_shader[] = {
++// ::mc_setup_c_q0
++// ::mc_start
++/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_setup_c_qn
++/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
++/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30      ; mov ra_base, unif
++/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
++/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
++/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop                           ; mul24 r0, r0, 5
++/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
++/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0                 ; mov ra_y, ra0.16a
++/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1                ; mov ra0, unif
++/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000110] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a
++/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0                 ; mov rb_base2, unif
++/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1                ; mov r3, PREREAD
++/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1                ; mov r2, ra_y2
++/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0    ; mov r0, ra_y
++// :1
++/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
++/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
++/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0                    ; mov rb4, 0
++/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0                    ; mov rb5, 0
++/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0                    ; mov rb6, 0
++/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0                    ; mov rb7, 0
++// ::mc_filter_c_p
++/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
++/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
++/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
++/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
++/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
++/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
++/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
++/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
++/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
++/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
++/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
++/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
++/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
++/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
++/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
++// :1
++/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
++/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
++/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
++/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
++/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
++/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
++/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
++/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask
++/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
++/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
++/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
++/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
++/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
++/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
++/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
++/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
++/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
++/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_p_l1
++/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
++/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
++/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
++/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
++/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
++/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
++/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
++/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
++/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
++/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
++/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
++/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
++/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
++/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
++/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
++/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
++// :1
++/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
++/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
++/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
++/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next
++/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
++/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
++/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
++/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax
++/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
++/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
++/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
++/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
++/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
++/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
++/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
++/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
++/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
++/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c_b
++/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
++/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
++/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1
++/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
++/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch          ; mov ra_width_height, unif
++/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
++/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x          ; mov ra0, unif
++/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4                ; mov ra2, unif
++/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
++/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1                ; mov r1, ra_height
++/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
++/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift     ; mov ra3, unif
++/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2                ; mov r3, unif
++/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a
++/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
++/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift         ; mov ra1, unif
++/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x         ; mov ra3, unif
++/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif
++/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5                ; mov ra9, rb_max_y
++/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
++/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1                ; mov r5rep, -4
++/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
++/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
++/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1                ; mov r1, ra_wt_off_l1
++/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
++/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0         ; mov ra_link, unif
++/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
++// :1
++/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
++/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
++/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next
++/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y             ; mov r3, ra_y
++/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0             ; mov      r0, r1 << 15
++/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
++/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
++/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask
++/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
++/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
++/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
++/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10              ; mov rb5, rb6
++/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift         ; mov r3, ra_y2
++/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7
++/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
++/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
++/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
++/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax
++/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
++/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
++/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
++/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
++/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b
++/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount     ; mov r0, ra4
++/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra7,  rb7
++/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0                ; mul24 r0, ra11, rb11
++/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
++/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
++/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
++/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
++/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2                ; mov r3, ra_blk_height
++/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
++/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_sync_q0
++/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q1
++/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q2
++/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
++/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q3
++/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
++/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q4
++/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q5
++/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q6
++/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q7
++/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync_q8
++/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q9
++/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q10
++/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync_q11
++/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_qn
++// ::mc_exit_y_qn
++/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
++/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
++/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
++/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c_q0
++// ::mc_exit_y_q0
++/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
++/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
++/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
++/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y_q0
++/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_setup_y_qn
++/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
++/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30      ; mov ra11, unif
++/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
++/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
++/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
++/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
++/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
++/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num              ; mov rb_pitch, unif
++/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or  rb_dma1_base, r1, rb_pitch
++/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4                ; v8subs r2, r2, r2
++/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
++/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop                           ; mov r0, ra0.16a
++/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD               ; mov r2, ra1.16a
++// :1
++/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
++/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
++/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
++/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
++/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
++/* [0x00000df0] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
++/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
++/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
++/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8,  0                   ; mov rb8,  0
++/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9,  0                   ; mov rb9,  0
++/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0                   ; mov rb10, 0
++/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0                   ; mov rb11, 0
++// :per_block_setup_8
++/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
++/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch          ; mov ra_base_next, unif
++/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2                ; mov ra_y_next, ra0.16a
++/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1                ; mov ra1, unif
++/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5                ; mov ra_y2_next, ra1.16a
++/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x          ; mov rb_base2_next, unif
++/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4                ; mov ra_width_height, unif
++/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2                ; mov vw_setup, rb_vpm_init
++/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul
++/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
++/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add
++/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val
++/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif
++/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3                ; mov rb5, ra_k255
++/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
++/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
++/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif
++/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
++/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
++/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d            ; mov ra_dest, unif
++/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
++/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
++/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
++/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
++/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
++/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8
++/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d            ; mov ra_link, unif
++/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
++// ::mc_filter_y_pxx
++/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
++/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
++/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
++/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
++/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
++/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
++/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
++/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
++/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
++/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
++/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
++/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
++/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
++/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
++/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
++/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
++/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
++/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
++/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
++/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
++/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
++/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
++/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
++/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
++/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
++/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height
++/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
++/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
++/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
++/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
++/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_bxx
++/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
++/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
++/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
++/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
++/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
++/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
++// :1
++/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
++/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
++/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
++/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
++/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
++/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
++/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
++/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
++/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
++/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
++/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
++/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
++/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
++/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
++/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
++/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
++/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
++/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
++/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
++/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
++/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
++/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
++/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0                ; mov r2, rb_wt_off
++/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
++/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
++/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2                ; mov r0, r1 << 8
++/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0                ; mov r3, ra_blk_height
++/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch
++/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0                ; v8subs r0, ra_height, r3
++/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
++/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_p00
++/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif                 ; mov r0, elem_num
++/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
++/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0           ; mov ra_base_next, unif
++/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5                ; mov ra_y_next, ra0.16a
++/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x          ; mov ra_width_height, unif
++/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif
++/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
++/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
++/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
++/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif
++/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
++// :1
++/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
++/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
++/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
++/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
++/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
++/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
++/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
++/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
++/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3        ; mov vw_setup, rb_dma1
++/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3        ; mov vw_addr, ra_dest
++/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
++/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y_b00
++/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
++/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
++/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
++/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
++/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0
++/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
++/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
++/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
++/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
++/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
++/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
++/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
++/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax
++/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
++/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
++/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
++/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
++/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_setup_c10_q0
++/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_setup_c10_qn
++/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
++/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30      ; mov ra_base, unif
++/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
++/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
++/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
++/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
++/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
++/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
++/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop                           ; mul24 r0, r0, 5
++/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
++/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
++/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
++/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0                 ; mov ra_y, ra0.16a
++/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0         ; mov rb_xshift2_next, 0
++/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
++/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1                ; mov ra0, unif
++/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
++/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00001770] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
++/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a
++/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0                 ; mov rb_base2, unif
++/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
++/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1                ; mov r3, PREREAD
++/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1                ; mov r2, ra_y2
++/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0    ; mov r0, ra_y
++// :1
++/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
++/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
++/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0                    ; mov rb4, 0
++/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0                    ; mov rb5, 0
++/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0                    ; mov rb6, 0
++/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0                    ; mov rb7, 0
++// ::mc_filter_c10_p
++/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
++/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
++/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
++/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
++/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
++/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
++/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
++/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
++/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
++/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
++/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
++/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
++/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
++/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
++// :1
++/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
++/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
++/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
++/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
++/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
++/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
++/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
++/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask
++/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
++/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
++/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
++/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
++/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
++/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
++/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
++/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
++/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
++/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_p_l1
++/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
++/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
++/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0
++/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x         ; mov ra_width_height, unif
++/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch          ; mov ra0, unif
++/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
++/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
++/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
++/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1                ; mov ra3, unif
++/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0     ; mov r1, ra_height
++/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif
++/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif
++/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2                ; mov r2, ra_fir_off_val
++/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c
++/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
++/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
++/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
++/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d              ; mov ra_link, unif
++// :1
++/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
++/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
++/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
++/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next
++/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
++/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y          ; mov.ifnc r0, r2
++/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
++/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax
++/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
++/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
++/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3                ; mul24 r0, ra7, rb10
++/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b
++/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
++/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0                ; mul24 r0, rb4, ra3.8a
++/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0                ; mul24 r0, ra7, rb11
++/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
++/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6                 ; mov r3, ra_blk_height
++/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
++/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
++/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_c10_b
++/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init     ; mov ra2, unif
++/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef      ; mov r3, unif
++/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1
++/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
++/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch          ; mov ra_width_height, unif
++/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
++/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x          ; mov ra0, unif
++/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4                ; mov ra2, unif
++/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul
++/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1                ; mov r1, ra_height
++/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next
++/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
++/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
++/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift     ; mov ra3, unif
++/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2                ; mov r3, unif
++/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a
++/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
++/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift         ; mov ra1, unif
++/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x         ; mov ra3, unif
++/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif
++/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5                ; mov ra9, rb_max_y
++/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
++/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif
++/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1                ; mov r5rep, -4
++/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
++/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
++/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1                ; mov r1, ra_wt_off_l1
++/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
++/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0         ; mov ra_link, unif
++/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
++// :1
++/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
++/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
++/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next
++/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y             ; mov r3, ra_y
++/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0             ; mov      r0, r1 << 15
++/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
++/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
++/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask
++/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
++/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
++/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
++/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10              ; mov rb5, rb6
++/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift         ; mov r3, ra_y2
++/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7
++/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
++/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
++/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
++/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax
++/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
++/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
++/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
++/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
++/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b
++/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount     ; mov r0, ra4
++/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra7,  rb7
++/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0                ; mul24 r0, ra11, rb11
++/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
++/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
++/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
++/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
++/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2                ; mov r3, ra_blk_height
++/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
++/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
++/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_sync10_q0
++/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q1
++/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q2
++/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q3
++/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q4
++/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q5
++/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q6
++/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
++/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q7
++/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov  dst, srel(i)
++/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_sync10_q8
++/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q9
++/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q10
++/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
++/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_sync10_q11
++/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
++/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov  dst, srel(i)
++/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_q0
++// ::mc_exit_y10_q0
++/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
++/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
++/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov  dst, sacq(i)
++/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
++/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
++/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_exit_c10_qn
++// ::mc_exit_y10_qn
++/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
++// :1
++/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop                   ; nop           ; ldtmu0
++/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop                   ; nop           ; ldtmu1
++/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov  -, vw_wait
++/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop                   ; nop           ; thrend
++/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop
++/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop
++// ::mc_setup_y10_q0
++/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov  dst, srel(i)
++// ::mc_setup_y10_qn
++/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1                  ; mov ra0, unif
++/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif
++/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif
++/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30      ; mov ra11, unif
++/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
++/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
++/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
++/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
++/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
++/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
++/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef              ; mov rb_xpitch, unif
++/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
++/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
++/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
++/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num              ; mov rb_pitch, unif
++/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
++/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or  rb_dma1_base, r1, rb_pitch
++/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
++/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4                ; v8subs r2, r2, r2
++/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
++/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
++/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
++/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
++/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop                           ; mov r0, ra0.16a
++/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD               ; mov r2, ra1.16a
++// :1
++/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
++/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
++/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1          ; mov ra_y, r0
++/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
++/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
++/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
++/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
++/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1         ; mov ra_y2, r2
++/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
++/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
++/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
++/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
++/* [0x00002428] */ 0x159e7040, 0x10020827, // or  r0, r0, r1
++/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
++/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
++/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
++/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
++/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
++/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
++/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8,  0                   ; mov rb8,  0
++/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9,  0                   ; mov rb9,  0
++/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0                   ; mov rb10, 0
++/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0                   ; mov rb11, 0
++// :per_block_setup_10
++/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
++/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
++/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch          ; mov ra_base_next, unif
++/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2                ; mov ra_y_next, ra0.16a
++/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1                ; mov ra1, unif
++/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
++/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
++/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5                ; mov ra_y2_next, ra1.16a
++/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x          ; mov rb_base2_next, unif
++/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
++/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4                ; mov ra_width_height, unif
++/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2                ; mov vw_setup, rb_vpm_init
++/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul
++/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
++/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
++/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add
++/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val
++/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
++/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif
++/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3                ; mov rb5, ra_k255
++/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
++/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
++/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
++/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
++/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
++/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
++/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif
++/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
++/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
++/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d            ; mov ra_dest, unif
++/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
++/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
++/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
++/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
++/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
++/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
++/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
++/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8
++/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
++/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
++/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d            ; mov ra_link, unif
++/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
++// ::mc_filter_y10_pxx
++/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
++/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
++/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
++/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
++// :1
++/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
++/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
++/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
++/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
++/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
++/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
++/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
++/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
++/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
++/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
++/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
++/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
++/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
++/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
++/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
++/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
++/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
++/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
++/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
++/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
++/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
++/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height
++/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
++/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
++/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1                ; v8subs r0, ra_height, r3
++/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
++/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
++/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_p00
++/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif                 ; mov r0, elem_num
++/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
++/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0           ; mov ra_base_next, unif
++/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
++/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5                ; mov ra_y_next, ra0.16a
++/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x          ; mov ra_width_height, unif
++/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
++/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
++/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif
++/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2
++/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1                ; mov ra_dest, unif
++/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
++/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
++/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
++/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
++/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
++/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif
++/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
++// :1
++/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
++/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
++/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
++/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
++/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
++/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
++/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
++/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
++/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
++/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
++/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3        ; mov vw_setup, rb_dma1
++/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3        ; mov vw_addr, ra_dest
++/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
++/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_bxx
++/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
++/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
++/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
++/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
++/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
++// :1
++/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
++/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0               ; mov r1, 0
++/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y          ; mov r3, ra_k1
++/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
++/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2          ; mov rb5,  rb6
++/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift         ; mov rb6,  rb7
++/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1
++/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2        ; mov rb7, ra8
++/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
++/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
++/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2         ; mov ra8,  ra9
++/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
++/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
++/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
++/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
++/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
++/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
++/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
++/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
++/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra8,  rb8
++/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0                ; mul24 r0, ra11, rb11
++/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
++/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0                ; mov r2, rb_wt_off
++/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
++/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
++/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
++/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
++/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2                ; mov r0, r1 << 8
++/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0                ; mov r3, ra_blk_height
++/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch
++/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0                ; v8subs r0, ra_height, r3
++/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
++/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_filter_y10_b00
++/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
++/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif                 ; mov r3, elem_num
++/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2
++/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next
++/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1
++/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0
++/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
++/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++// :1
++/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
++/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift         ; mov r3, rb_pitch
++/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
++/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
++/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1             ; mul24 r2, r2, r3
++/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
++/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
++/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
++/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
++/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax
++/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
++/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
++/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
++/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8                 ; mov r3, ra_blk_height
++/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
++/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
++/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax           ; mov -, vw_wait
++/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0        ; mov vw_setup, ra_dma0
++/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
++/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3                ; mov vw_setup, rb_dma1
++/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3                ; mov vw_addr, ra_dest
++/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
++/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
++/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
++/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
++/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init
++// ::mc_end
++};
++#ifdef __HIGHC__
++#pragma Align_to(8, ff_hevc_rpi_shader)
++#endif
+diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h
+new file mode 100644
+index 0000000000..79651c9b6c
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.h
+@@ -0,0 +1,63 @@
++#ifndef rpi_hevc_shader_H
++#define rpi_hevc_shader_H
++
++extern unsigned int ff_hevc_rpi_shader[]; /* QPU code; label offsets below are in array elements (2 per instruction) */
++
++#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0)
++#define mc_start (ff_hevc_rpi_shader + 0)
++#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
++#define mc_filter_c_p (ff_hevc_rpi_shader + 134)
++#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260)
++#define mc_filter_c_b (ff_hevc_rpi_shader + 386)
++#define mc_sync_q0 (ff_hevc_rpi_shader + 580)
++#define mc_sync_q1 (ff_hevc_rpi_shader + 598)
++#define mc_sync_q2 (ff_hevc_rpi_shader + 610)
++#define mc_sync_q3 (ff_hevc_rpi_shader + 622)
++#define mc_sync_q4 (ff_hevc_rpi_shader + 634)
++#define mc_sync_q5 (ff_hevc_rpi_shader + 652)
++#define mc_sync_q6 (ff_hevc_rpi_shader + 664)
++#define mc_sync_q7 (ff_hevc_rpi_shader + 676)
++#define mc_sync_q8 (ff_hevc_rpi_shader + 688)
++#define mc_sync_q9 (ff_hevc_rpi_shader + 706)
++#define mc_sync_q10 (ff_hevc_rpi_shader + 718)
++#define mc_sync_q11 (ff_hevc_rpi_shader + 730)
++#define mc_exit_c_qn (ff_hevc_rpi_shader + 742)
++#define mc_exit_y_qn (ff_hevc_rpi_shader + 742)
++#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760)
++#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760)
++#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780)
++#define mc_setup_y_qn (ff_hevc_rpi_shader + 782)
++#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014)
++#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140)
++#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272)
++#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358)
++#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432)
++#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434)
++#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562)
++#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684)
++#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806)
++#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996)
++#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014)
++#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026)
++#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038)
++#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050)
++#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068)
++#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080)
++#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092)
++#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104)
++#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122)
++#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134)
++#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146)
++#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158)
++#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158)
++#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178)
++#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178)
++#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196)
++#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198)
++#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440)
++#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566)
++#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
++#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
++#define mc_end (ff_hevc_rpi_shader + 2860)
++
++#endif /* rpi_hevc_shader_H */
+diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm
+new file mode 100644
+index 0000000000..af5b59e181
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader.qasm
+@@ -0,0 +1,1850 @@
++# Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++# All rights reserved.
++#
++# Redistribution and use in source and binary forms, with or without
++# modification, are permitted provided that the following conditions are met:
++#     * Redistributions of source code must retain the above copyright
++#       notice, this list of conditions and the following disclaimer.
++#     * Redistributions in binary form must reproduce the above copyright
++#       notice, this list of conditions and the following disclaimer in the
++#       documentation and/or other materials provided with the distribution.
++#     * Neither the name of the copyright holder nor the
++#       names of its contributors may be used to endorse or promote products
++#       derived from this software without specific prior written permission.
++#
++# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++#
++# Written by Peter de Rivaz, John Cox
++
++
++
++# Inter pred asm
++#
++# Logic here should be good to 14 bits without modification
++# but only 8 & 10 are currently instantiated & tested
++# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
++# in _p00 & _b00
++
++# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
++# the warning that we are using rotation & ra/rb registers. r0..3 can be
++# rotated through all 16 elems; ra regs can only be rotated through their
++# local 4.  As it happens this is what is wanted here as we do not want the
++# constants from the other half of the calc.
++
++# Number limits in P/B calculation
++#
++# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
++# we offset our intermediates s.t. they always end up +ve before the next
++# multiply (may be -ve whilst summing but that doesn't matter).
++#
++# Range calc for up to 14 bits (Y-B pred):
++#
++# denom: [0, 7]
++# bmax = (1 << bits) - 1
++# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
++#
++# wt_mul: [-128, 255]
++# wt_off = off * 2 + 1: [-bmax, bmax]
++#
++# pel: [0, bmax]
++# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
++# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
++# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
++# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4]
++# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
++#  [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
++#
++# This all looks good and is mostly bit depth independent - and as we manage
++# to do unsigned multiplies everywhere (now) this should be good for any bit
++# depth up to 14 (we could probably do 16 - but that requires a few tweaks
++# to the shifts we don't currently have logic for)
++
++# PREREAD is the number of requests that we have sitting in the TMU request
++# queue.
++#
++# There are 8 slots available in the TMU request Q for tm0s requests, but
++# only 4 output FIFO entries and overflow is bad (corruption or crash)
++# (If threaded then only 2 out FIFO entries, but we aren't.)
++# In s/w we are effectively limited to the min vertical read which is >= 4
++# so output FIFO is the limit.
++#
++# As the test for read-next is in the main part of the Luma loop (rather than
++# the preload FIFO part) we are limited to min_luma_height - 1
++# Min_luma_height is 4 so we can only have a preload of 3
++# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
++# in chroma without abandoning preload pretty much entirely (which would be bad)
++#
++# Timing tests vs preload of 4 suggests this doesn't hurt us much
++# Could have preread 4 for Chroma but when tested it didn't help
++
++.set PREREAD,                      3
++
++# Offset added (effectively) at the exit of the H FIR filter
++# This is enough to force the result +ve
++# Is good if it is a power of 2 as that allows for >> without loss
++#
++# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
++# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
++# Round up to next power of 2
++
++.set FIR_OFFSET,                   0x4000
++
++# Block heights - 8 & 16 are the only numbers we currently support
++
++.set C_BLK_HEIGHT_8,               16
++.set C_BLK_HEIGHT_16,              8
++.set Y_BLK_HEIGHT_8,               16
++.set Y_BLK_HEIGHT_16,              8
++
++# QPU counts - depend on block size
++# If we have a 2-byte format & block_size > 8 then can only afford
++# 8 QPUs
++# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
++
++.set N_QPU_8,                      12
++.set N_QPU_16,                     12
++
++# Value to add to the weight multiplier to convert it into an unsigned value
++# Should be power of two for convenience
++
++.set LOG2_MUL_ADD,                 14
++.set MUL_ADD,                      (1 << LOG2_MUL_ADD)
++
++# Fixed denom (max that it can be set to)
++.set DENOM,                        7
++
++# register allocation
++#
++
++# ra0-3
++# Used as temp and may be loop filter coeffs (split into .8s)
++# or temp in loop. Check usage on an individual basis.
++
++# ra4-11
++# V FIFO / temp / free
++
++# -- free --                       ra12
++
++# -- free --                       ra13
++
++# -- free --                       ra14
++
++# -- free --                       ra15
++
++# uniform: width:height
++.set ra_width_height,              ra16
++.set ra_width,                     ra16.16b
++.set ra_height,                    ra16.16a
++
++# y:y2 same layout as y_y2_next so we can update both together
++.set ra_y_y2,                      ra17
++.set ra_y2,                        ra17.16a
++.set ra_y,                         ra17.16b
++
++# uniform: L1 weight (U on left, V on right)
++# Only used in Y B
++.set ra_wt_off_mul_l1,             ra18
++.set ra_wt_off_l1,                 ra18.16b
++.set ra_wt_mul_l1,                 ra18.16a
++
++# y_next:y2_next same layout as y_y2 so we can update both together
++.set ra_y_y2_next,                 ra19
++.set ra_y_next,                    ra19.16b
++.set ra_y2_next,                   ra19.16a
++
++# Setup: consts - subdivide a single register
++.set ra_kff800100,                 ra20
++.set ra_k256,                      ra20.16a
++.set ra_k0,                        ra20.8a
++.set ra_k1,                        ra20.8b
++.set ra_k128,                      ra20.8c
++.set ra_k255,                      ra20.8d
++
++# Loop: xshifts
++.set ra_xshift,                    ra21.16a
++.set ra_xshift_next,               ra21.16b
++
++# Loop var: L0 weight (U on left, V on right)
++# _off_ is not used in loop as we want to modify it before use
++.set ra_wt_off_mul_l0,             ra22
++.set ra_wt_mul_l0,                 ra22.16a
++.set ra_wt_off_l0,                 ra22.16b
++
++# Max pel value (for 8 bit we can get away with sat ops but not 9+)
++# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
++#   2nd byte   but as the source should never be > 3 there 0x3ff should do
++.set ra_blk_height_pmax,           ra23
++.set ra_pmax,                      ra23.16a
++.set ra_blk_height,                ra23.8c
++# --free --                        ra23.8d
++
++# Loop:  src frame base (L0)
++.set ra_base,                      ra24
++
++# Misc  offsets
++.set ra_fir_off_val_wt_den_p7,     ra25
++.set ra_wt_den_p7,                 ra25.8a
++# -- free --                       ra25.8b
++.set ra_fir_off_val,               ra25.16b
++
++# As it happens these constants are the same
++.if FIR_OFFSET == MUL_ADD
++# Weight multiplier unsigned add
++.set ra_kmul_add,                  ra_fir_off_val
++.else
++.error "FIR_OFFSET != MUL_ADD: Need new register & init"
++.endif
++
++# Loop: next src frame base (L0)
++.set ra_base_next,                 ra26
++
++# Loop: height<<23 + width<<16 + vdw_setup_0
++.set ra_dma0,                      ra27
++
++# Loop: destination address
++.set ra_dest,                      ra28
++
++# Setup: Dup of rb_ef
++# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
++# (top bits are ignored by mul24)
++.set ra_ef,                        ra29
++
++# Use an even numbered register as a link register to avoid corrupting flags
++.set ra_link,                      ra30
++
++# -- free --                       ra31
++
++.set rb_xshift2,                   rb0
++.set rb_xshift2_next,              rb1
++
++# C:  (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
++.set rb_elem_x,                    rb2
++
++# El Flags
++# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
++# Duped into ra_ef as sometimes that is easier to use
++.set rb_ef,                        rb3
++
++# rb4-11
++# Loop: V filter FIFO or V filter coeff
++
++# Loop var: offset to add before shift (round + weighting offsets)
++# Exact value varies by loop
++.set rb_wt_off,                    rb12
++
++# -- free --                       rb13
++
++# -- free --                       rb14
++
++# Loop: src frame base (L1)
++.set rb_base2,                     rb15
++
++# Line pitch (128 for sand128)
++.set rb_pitch,                     rb16
++
++# Loop count - 2 (set up TMU for next xfer)
++.set rb_i_tmu,                     rb17
++
++# Loop count for min(height, 16)
++# Y will reset & loop again if height > 16
++.set rb_lcount,                    rb18
++
++# frame_base2_next
++.set rb_base2_next,                rb19
++
++# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
++# offset to the slice
++.set rb_xpitch,                    rb20
++
++# These 3 consts each save 1 instruction in Y loop setup
++# so whilst they are worthwhile they should be the 1st to die if we need
++# another b reg
++.set rb_y_coeffs_2,                rb21                         # 0x050b0a00
++.set rb_y_coeffs_3,                rb22                         # 0x11283a40
++.set rb_y_coeffs_5,                rb23                         # 0x0a0b0500
++
++# Setup: 0xff (8-bit) / 0xffff (9+ bit)
++.set rb_pmask,                     rb24
++
++# vdw_setup_1(dst_pitch)
++.set rb_dma1_base,                 rb25
++
++# Setup: pic width - 1
++# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
++.set rb_max_x,                     rb26
++
++# vdw_setup_0 (depends on QPU number)
++.set rb_dma0_base,                 rb27
++
++# Setup: vw_setup value to reset VPM write pointer
++.set rb_vpm_init,                  rb28
++
++# Loop: vdw_setup_1(dst_pitch-width) = stride
++.set rb_dma1,                      rb29
++
++# Setup: pic_height - 1
++.set rb_max_y,                     rb30
++
++# Setup: FIR H offset
++.set rb_fir_off_h,                 rb31
++
++
++# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
++.set i_shift16,                    -16
++.set i_shift21,                    -11
++.set i_shift23,                     -9
++.set i_shift30,                     -2
++
++# Much of the setup code is common between Y & C
++# Macros that express this - obviously these can't be overlapped
++# so are probably unsuitable for loop code
++
++.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
++  mov r2, qpu_num                      # r2 = number of the QPU running this code
++.if v_bit_depth <= 8
++  # 8 bit version: build a per-QPU VPM offset in r0 from the QPU number
++  asr r1, r2, 2
++  shl r1, r1, 6
++  and r0, r2, 3
++  or  r0, r0, r1                       # r0 = (qpu & 3) | ((qpu >> 2) << 6)
++
++  mov r1, vpm_setup(0, 4, h8p(0, 0))   # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
++  add r_vpm, r0, r1  # VPM 8bit storage: setup word + per-QPU offset -> r_vpm
++
++  mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
++  shl r0, r0, 5                        # reposition the per-QPU offset for the vdw_setup_0 word (added into r_dma below)
++
++.else
++  # 16 bit version
++  # Limited to 8 QPUs if blk height > 8
++  asr r1, r2, 1
++.if v_blk_height <= 8
++  shl r1, r1, 4
++.else
++  shl r1, r1, 5
++.endif
++  and r0, r2, 1
++  or  r0, r0, r1                       # r0 = (qpu & 1) | ((qpu >> 1) << (4 or 5, by block height))
++
++  mov r1, vpm_setup(0, 2, h16p(0, 0))   # 2 is stride - stride acts on ADDR
++  add r_vpm, r0, r1                    # VPM 16bit storage: setup word + per-QPU offset -> r_vpm
++
++  # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
++  # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
++  mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))    # height,width added later
++  shl r0, r0, 6                        # the 3 + 3 shift described above
++.endif
++  add r_dma, r0, r1  # DMA out: vdw_setup_0 base word + shifted per-QPU offset -> r_dma
++.endm
++
++
++.macro m_setup_q0
++  srel -, 12                           # NOTE(review): presumably a semaphore release (sem 12) done only on the q0 entry path - confirm against the mc_sync_* code
++.endm
++
++# Code start label
++::mc_start
++
++################################################################################
++# mc_setup_c
++#
++# typedef struct qpu_mc_pred_c_s_s {
++#     int16_t y;
++#     int16_t x;
++#     uint32_t base;
++#     uint32_t pic_cw;            // C Width (== Y width / 2)
++#     uint32_t pic_ch;            // C Height (== Y Height / 2)
++#     uint32_t stride2;
++#     uint32_t stride1;
++#     uint32_t wdenom;
++#     int16_t y2;
++#     int16_t x2;
++#     uint32_t base2;
++#     uint32_t next_fn;
++# } qpu_mc_pred_c_s_t;
++
++.macro m_setup_c, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift,         1
++.set v_pmask,           0xff
++.set v_blk_height,      C_BLK_HEIGHT_8
++.else
++.set v_x_shift,         2
++.set v_pmask,           0xffff
++.set v_blk_height,      C_BLK_HEIGHT_16
++.endif
++
++  mov tmurs, 1                  ; mov ra0, unif                 # No TMU swap ; x_y
++
++  mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]                     # per-element 2-bit codes; shifted up to bits 31:30 to form rb_ef
++  shl rb_ef, r0, i_shift30      ; mov ra_base, unif             # ; ref_c_base
++
++# Read image dimensions
++  sub r0, unif, 1                                               # pic c width
++  shl rb_max_x, r0, v_x_shift                                   # rb_max_x in bytes
++  sub rb_max_y, unif, 1                                         # pic c height
++
++# load constants
++  mov ra_kff800100, 0xff800100                                  # bytes a..d = 0, 1, 128, 255 (ra_k0/k1/k128/k255); .16a = 256
++  mov rb_pmask, v_pmask
++  mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)  # ra_pmax in .16a, ra_blk_height in .8c
++  mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++  mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)  # ra_fir_off_val in .16b, ra_wt_den_p7 in .8a
++
++# get source pitch
++  mov ra_ef, rb_ef              ; mov rb_xpitch, unif           # ; stride2
++  mov rb_pitch, unif                                            # stride1
++  mov r1, vdw_setup_1(0)                                        # [rb_pitch delay] Merged with dst_stride shortly
++  add rb_dma1_base, r1, rb_pitch                                # vdw_setup_1
++
++  and r0, 1, elem_num
++  nop                           ; mul24 r0, r0, 5               # r0 = (elem_num & 1) * 5
++.if v_bit_depth <= 8
++  add rb_elem_x, r0, elem_num
++.else
++  add r0, r0, elem_num
++  add rb_elem_x, r0, r0
++.endif
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# rb_base2 ends up with t1s base
++
++  shl r0, ra0.16b, v_x_shift                                    # [rb_elem_x delay]
++  add r0, r0, rb_elem_x                                         # Add elem no to x to get X for this slice
++  max r0, r0, 0                 ; mov ra_y, ra0.16a             # ; stash Y
++  min r0, r0, rb_max_x
++
++# Get shift
++# Shift will always calculate as 0 for 9+ bit
++# Ideally we can optimize the shift out of the code in these cases but for now
++# it is tidier to leave it in
++.if v_bit_depth <= 8
++  shl ra_xshift_next, r0, 3
++.else
++  mov ra_xshift_next, 0         ; mov rb_xshift2_next, 0
++.endif
++
++# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
++
++.if v_bit_depth <= 8
++  and r0, r0, -4
++.endif
++  sub r1, ra_k0, rb_pitch                                       # r1 = 0 - rb_pitch (ra_k0 is the zero byte), used as a mask on x
++  and r1, r0, r1
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mov ra0, unif                 # ; next_x2_y2
++  add ra_base, ra_base, r0
++
++# Compute part of VPM to use for DMA output
++# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
++  m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++# And again for L1, but only worrying about frame2 stuff
++
++# Compute base address for first and second access
++# ra_base ends up with t0s base
++# rb_base2 ends up with t1s base
++
++  shl r0, ra0.16b, v_x_shift
++  add r0, r0, rb_elem_x         ; mov ra_y2, ra0.16a            # Add QPU slice offset
++  max r0, r0, 0                 ; mov rb_base2, unif            # ref_c_base2
++  min r0, r0, rb_max_x
++
++# Get shift (already zero if 9+ bit so ignore)
++.if v_bit_depth <= 8
++  shl rb_xshift2_next, r0, 3
++.endif
++
++# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
++
++.if v_bit_depth <= 8
++  and r0, r0, -4
++.endif
++  sub r1, ra_k0, rb_pitch
++  and r1, r0, r1                ; mov r3, PREREAD
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mov r2, ra_y2
++  add rb_base2, rb_base2, r0    ; mov r0, ra_y
++
++# Do preloads
++# r0 = ra_y, r2 = ra_y2, r3 = PREREAD
++
++:1
++  sub.setf r3, r3, 1                                            # count down PREREAD; these flags drive the brr below
++  max r1, r0, 0
++  min r1, r1, rb_max_y
++  add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
++  add t0s, ra_base, r1          ; mov ra_y, r0
++
++  max r1, r2, 0
++  brr.anynz -, r:1b                                             # loop while r3 != 0; the 3 instrs up to the >>> marker still execute
++  min r1, r1, rb_max_y
++  add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
++  add t1s, rb_base2, r1         ; mov ra_y2, r2
++# >>> .anynz 1b
++
++  mov ra_link, unif                                             # link
++# touch registers to keep simulator happy (and fills in delay slots)
++  mov ra4, 0                    ; mov rb4, 0
++  bra -, ra_link
++  mov ra5, 0                    ; mov rb5, 0
++  mov ra6, 0                    ; mov rb6, 0
++  mov ra7, 0                    ; mov rb7, 0
++# >>> ra_link
++.endm
++
++::mc_setup_c_q0
++  m_setup_q0
++::mc_setup_c_qn
++  m_setup_c 8
++
++################################################################################
++#
++# mc_filter_c_p
++#
++# typedef struct qpu_mc_pred_c_p_s {
++#     int16_t y;
++#     int16_t x;
++#     uint32_t base;
++#     uint16_t h;
++#     uint16_t w;
++#     uint32_t coeffs_x;
++#     uint32_t coeffs_y;
++#     uint32_t wo_u;
++#     uint32_t wo_v;
++#     uint32_t dst_addr_c;
++#     uint32_t next_fn;
++# } qpu_mc_pred_c_p_t;
++
++.macro m_filter_c_p, v_tmu, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift,         1
++.set v_x_mul,           2
++.set v_v_shift,         8
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift,     7
++.set v_dma_wh_shift,    i_shift16
++.else
++.set v_x_shift,         2
++.set v_x_mul,           4
++.set v_v_shift,         i_shift16
++# Shifts to get width & height in the right place in rb_dma0
++.set v_dma_h_shift,     8
++.set v_dma_wh_shift,    15
++.endif
++
++.if v_tmu == 0
++.set vrx_xshift,        rb_xshift2              # b side more convienient
++.set vrx_xshift_next,   ra_xshift_next
++.set vra_y_next,        ra_y_next
++.set vrx_base_next,     ra_base_next
++.set vra_y,             ra_y
++.set vra_base,          ra_base
++.set vr_txs,            t0s
++.else
++.set vrx_xshift,        ra_xshift               # a side more convienient
++.set vrx_xshift_next,   rb_xshift2_next
++.set vra_y_next,        ra_y2_next
++.set vrx_base_next,     rb_base2_next
++.set vra_y,             ra_y2
++.set vra_base,          rb_base2
++.set vr_txs,            t1s
++.endif
++
++# denom shift values
++.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
++
++# per-channel shifts were calculated on the *previous* invocation
++# get base addresses and per-channel shifts for *next* invocation
++  mov vw_setup, rb_vpm_init     ; mov ra2, unif                 # ; x_y
++
++  add.setf -, rb_ef, rb_ef      ; mov r3, unif                  # [ra2 delay] ; base
++
++  shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r0, r0          # r5 = 0
++  add r0, r0, rb_elem_x         ; mov ra_width_height, unif     # r1=pitch2 mask ; width_height
++  sub r1, r5, rb_pitch          ; mov ra0, unif                 # ; H filter coeffs
++  max r0, r0, r5                ; mov vrx_xshift, vrx_xshift_next
++  min r0, r0, rb_max_x          ; mov vra_y_next, ra2.16a
++
++.if v_bit_depth <= 8
++  shl vrx_xshift_next, r0, 3
++  and r0, r0, -4
++.endif
++  and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul   # r2=w*2 (we are working in pel pairs)  ** x*2 already calced!
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mov ra3, unif                 # ; V filter coeffs
++  add vrx_base_next, r3, r0     ; mov r1, ra_height
++
++# set up VPM write
++  sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif    # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
++  add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++  add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
++
++# Misc final setup...
++
++  shl r0, r1, v_dma_h_shift     ; mov ra_dest, unif             # ; dst_addr
++  add r0, r0, r2                ; mov r2, ra_fir_off_val        # Combine width and height of destination area (r0=h<<8, r2=w*2)
++  shl r0, r0, v_dma_wh_shift    ; mov rb10, ra3.8c              # Shift into bits 16 upwards of the vdw_setup0 register
++  add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0          # ; r1=weight
++  shl r1, r1, i_wt_den_p5       ; mul24 r0, r2, ra_wt_mul_l0
++  sub rb_wt_off, r1, r0         ; mov r0, ra_kmul_add
++  add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4            # ; loop counter (V FIFO fill = 4)
++  mov rb11, ra3.8d              ; mov ra_link, unif             # ; Link
++
++# r5           = -4                     (loop counter)
++# ra_wt_mul_l0 = weight L0 + 128        (now unsigned)
++# rb_wt_off    = (offset * 2 + 1) << (wt_den + 5)
++# rb31         = FIR value offset
++
++# FIFO: rb4, ra5, rb6, ra7
++# Coeffs in ra3.8a, ra3.8b, rb10, rb11
++
++# We want (r0r1)
++# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
++# We fetch (after shift)
++#  C0  :  C3  :  C1  :  C4  :  C2  :  C5  : ...
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++.if v_tmu == 0
++  sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu0
++  shr r2, r4, vrx_xshift        ; mov.ifz  r3, vra_y_next
++  shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
++  add.setf -, rb_ef, rb_ef      ; mov.ifz  vra_base, vrx_base_next
++.else
++  sub.setf -, r5, rb_i_tmu      ; mov rb4, ra5                  ; ldtmu1
++  shr r2, r4, vrx_xshift        ; mov.ifz  vra_base, vrx_base_next
++  shr r1, r2, v_v_shift         ; mov.ifnz r3, vra_y
++  add.setf -, rb_ef, rb_ef      ; mov.ifz  r3, vra_y_next       # [r1 << delay]
++.endif
++
++  add vra_y, r3, ra_k1          ; mov      r0, r1 << 15
++  max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++  min r3, r3, rb_max_y          ; mov.ifnc r0, r2
++
++  and r1, r1, ra_pmax           ; mul24 r3, r3, rb_pitch
++.if v_tmu == 0
++  add vr_txs, vra_base, r3      ; v8min r0, r0, rb_pmask        # ; mask bytes
++.else
++  add vr_txs, vra_base, r3      ; v8min r0, r0, ra_pmax         # ; mask bytes
++.endif
++
++# apply horizontal filter
++# The filter coeffs for the two halves of this are the same (unlike in the
++# Y case) so it doesn't matter which ra0 we get them from
++# Also as the two halves are locked together we don't need to separate the 1st
++# r0 mul or the last r1 mul as they are valid for all QPUs
++
++  add r5rep, r5, 1              ; mul24      r3, ra0.8a,       r0
++  sub r2, rb_fir_off_h, r3      ; mul24      r3, ra0.8d,       r1
++  sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++  nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++  add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++  add.setf -, r5, r5            ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++
++# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift)
++# We would like to save the r5->r4 shift but we need a delay slot
++# for both r7 & r6 which we can't find anything to put in if we have
++# already multiplied r4 & r5!
++  brr.anyn -, r:1b
++  add r2, r2, r3                ; mul24 r0, ra7, rb10           # r6 post
++  mov ra5, rb6                  ; mul24 r1, rb6, ra3.8b         # r5 post
++  asr ra7, r2, v_bit_depth - 8  ; mov rb6, ra7
++# >>> .anyn 1b
++
++  add r1, r1, r0                ; mul24 r0, rb4, ra3.8a         # [ra7 delay]
++  sub r1, r1, r0                ; mul24 r0, ra7, rb11
++  sub r1, r1, r0
++
++  asr r1, r1, 6                 ; mov r3, ra_blk_height         # ; NxtLoop
++  sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++  add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++  sub r1, r0, r1                ; v8subs r0, ra_height, r3      # ; NxtLoop
++  brr.anyn -, r:1b
++  asr r1, r1, i_wt_den_p6
++  min r1, r1, ra_pmax           ; mov -, vw_wait
++  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch        # ; NxtLoop
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3                ; mov vw_setup, rb_dma1         # Stride
++  sub r1, r0, r3                ; mov vw_addr, ra_dest          # start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  brr -, r:1b
++  add rb_lcount, rb_lcount, r0
++  add ra_dma0, ra_dma0, r1
++  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_c_p
++  m_filter_c_p 0, 8
++
++::mc_filter_c_p_l1
++  m_filter_c_p 1, 8
++
++################################################################################
++#
++# mc_filter_c_b - chroma filter for bi-predicted (B) blocks: L0 via TMU0, L1 via TMU1
++# Uniforms are consumed in the order of the struct below (one 32-bit word per read)
++# typedef struct qpu_mc_pred_c_b_s {
++#     int16_t y;
++#     int16_t x;
++#     uint32_t base;
++#     uint16_t h;
++#     uint16_t w;
++#     uint32_t coeffs_x1;
++#     uint32_t coeffs_y1;
++#     int16_t weight_u1;
++#     int16_t weight_v1;
++#     int16_t y2;
++#     int16_t x2;
++#     uint32_t base2;
++#     uint32_t coeffs_x2;
++#     uint32_t coeffs_y2;
++#     uint32_t wo_u2;
++#     uint32_t wo_v2;
++#     uint32_t dst_addr_c;
++#     uint32_t next_fn;
++# } qpu_mc_pred_c_b_t;
++
++.macro m_filter_c_b, v_bit_depth
++
++.if v_bit_depth <= 8
++.set v_x_shift,         1
++.set v_v_shift,         8
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift,     7
++.set v_dma_wh_shift,    i_shift16
++.else
++.set v_x_shift,         2
++.set v_v_shift,         i_shift16
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift,     8
++.set v_dma_wh_shift,    15
++.endif
++.set v_x_mul,           (1 << v_x_shift)
++
++# denom shift values
++.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
++
++# per-channel shifts were calculated on the *previous* invocation
++
++# get base addresses and per-channel shifts for *next* invocation
++  mov vw_setup, rb_vpm_init     ; mov ra2, unif                 # ; x_y
++
++  add.setf -, rb_ef, rb_ef      ; mov r3, unif                  # [ra2 delay] ; r3=base
++
++  shl r0, ra2.16b, v_x_shift    ; v8subs r5rep, r1, r1          # x ; r5=0
++  add r0, r0, rb_elem_x         ; mov ra_y_next, ra2.16a
++  sub r1, r5, rb_pitch          ; mov ra_width_height, unif     # r1=pitch2 mask ; width_height
++  max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
++  min r0, r0, rb_max_x          ; mov ra0, unif                 # ; L0 H filter coeffs
++
++.if v_bit_depth <= 8
++  shl ra_xshift_next, r0, 3
++.endif
++
++  and r0, r0, -4                ; mov ra2, unif                 # ; L0 V filter coeffs
++  and r1, r0, r1                ; mul24 r2, ra_width, v_x_mul   # r2=w*2 (we are working in pel pairs)
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mov r1, ra_height             # Add stripe offsets ; r1=height
++  add ra_base_next, r3, r0      ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
++
++# set up VPM write
++
++  sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif    # Compute vdw_setup1(dst_pitch-width) ; U weight
++  add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
++  add rb_lcount, r1, (3-4)      ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight
++
++  shl r0, r1, v_dma_h_shift     ; mov ra3, unif                 # ; x2_y2
++  add r0, r0, r2                ; mov r3, unif                  # [ra3 delay] ; base2
++  shl r0, r0, v_dma_wh_shift    ; mov ra_y2_next, ra3.16a       # Shift into bits 16 upwards of the vdw_setup0 register
++  add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b               # ; r0=x2
++
++# L1 - uniform layout could possibly be optimized
++
++  shl r0, r0, v_x_shift         ; mov ra1, unif                 # r0=x2<<shift ; L1 H filter coeffs
++  add r0, r0, rb_elem_x         ; mov ra3, unif                 # ; L1 V filter coeffs
++  sub r1, r5, rb_pitch          ; mov ra_wt_off_mul_l1, unif    # [ra3 delay] r1=pitch2 mask ; U offset/weight
++  max r0, r0, r5                ; mov ra9, rb_max_y
++  min r0, r0, rb_max_x          ; mov r2, ra_kmul_add
++
++.if v_bit_depth <= 8
++  shl rb_xshift2_next, r0, 3
++.endif
++
++  and r0, r0, -4                ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
++  and r1, r0, r1                ; mov r5rep, -4                 # ; r5 = loop counter (-4)
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mov ra_dest, unif             #  Add stripe offsets ; dst_addr
++  add rb_base2_next, r3, r0     ; mov r0, ra_fir_off_val
++
++  add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
++  add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
++  add r0, r0, r1                ; mov r1, ra_wt_off_l1          # ; L0 off unset
++  shl r1, r1, i_wt_den_p6       ; mov rb11, ra3.8d
++  sub rb_wt_off, r1, r0         ; mov ra_link, unif             # ; link
++
++  mov ra10, rb_xshift2          ; mov rb7,  ra2.8d
++
++# r5        loop counter (-4)
++# ra0       H coeffs L0
++# ra1       H coeffs L1
++# ra2       V coeffs L0
++# ra3       V coeffs L1
++# ra9       rb_max_y alias
++# ra10      rb_xshift2 alias
++
++:1
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++  sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu0
++  shr r2, r4, ra_xshift         ; mov.ifz rb_base2, rb_base2_next
++  shr r1, r2, v_v_shift         ; mov.ifz ra_y_y2, ra_y_y2_next
++  add.setf -, rb_ef, rb_ef      ; mov.ifz ra_base, ra_base_next # [ra_y delay]
++  add ra_y, 1, ra_y             ; mov r3, ra_y
++
++  max r3, r3, ra_k0             ; mov      r0, r1 << 15
++  min r3, r3, ra9               ; mov.ifnc r1, r2 << 1
++
++  mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
++  add t0s, ra_base, r3          ; v8min r0, r0, rb_pmask        # ; masks bytes
++
++# L0 H-filter (-ra4*, +rb5, +rb6, -ra7)
++
++  and r1, r1, rb_pmask          ; mul24      r2, ra0.8a,       r0
++  sub r2, rb_fir_off_h, r2      ; mul24      r3, ra0.8d,       r1
++  sub r2, r2, r3                ; mul24      r3, ra0.8b << 2,  r0 << 2  @ "mul_used", 0
++  nop                           ; mul24.ifn  r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
++  add r2, r2, r3                ; mul24      r3, ra0.8c << 4,  r0 << 4  @ "mul_used", 0
++  nop                           ; mul24.ifn  r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
++
++  add r0, r2, r3                ; mul24 ra4, rb5, ra2.8a        ; ldtmu1
++
++  shr r2, r4, ra10              ; mov rb5, rb6
++  shr r1, r2, v_v_shift         ; mov r3, ra_y2
++  shr ra7, r0, v_bit_depth - 8  ; mov rb6, ra7                  # [r1 << delay]
++
++  add ra_y2, r3, ra_k1          ; mov      r0, r1 << 15
++  max r3, r3, ra_k0             ; mov.ifnc r1, r2 << 1
++  min r3, r3, rb_max_y          ; v8min r1, r1, ra_pmax
++
++  mov.ifnc r0, r2               ; mul24 r3, r3, rb_pitch
++  add t1s, rb_base2, r3         ; v8min r0, r0, ra_pmax         # ; masks bytes
++
++# L1 H-filter (-r0*, +rb9, +rb10, -ra11)
++
++  add r5rep, r5, 1              ; mul24      r2, ra1.8a,       r0
++  sub r2, rb_fir_off_h, r2      ; mul24      r3, ra1.8d,       r1
++  sub r2, r2, r3                ; mul24      r3, ra1.8b << 2,  r0 << 2  @ "mul_used", 0
++  nop                           ; mul24.ifn  r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
++  add r2, r3, r2                ; mul24      r3, ra1.8c << 4,  r0 << 4  @ "mul_used", 0
++  add.setf -, r5, r5            ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++
++  brr.anyn -, r:1b
++  add r2, r2, r3                ; mul24 r0, rb9,  ra3.8a
++  mov rb9, rb10                 ; mul24 r1, rb10, ra3.8b
++  shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++# >>> .anyn 1b
++
++  sub r2, r1, r0                ; mul24 r1, rb5,  ra2.8b        # L1 ; L0
++  sub.setf -, r5, rb_lcount     ; mov r0, ra4
++  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++  add r1, r1, r0                ; mul24 r0, ra7,  rb7
++
++  sub r1, r1, r0                ; mul24 r0, rb10, ra3.8c        # L1
++  add r2, r2, r0                ; mul24 r0, ra11, rb11          # L1
++  sub r2, r2, r0
++
++  shr r1, r1, 6
++  shr r2, r2, 6                 ; mul24 r0, r1, ra_wt_mul_l0
++  add r2, r2, r1                ; mul24 r1, r2, ra_wt_mul_l1
++  add r1, r1, r0                ; mul24 r2, r2, ra_kmul_add
++  sub r1, r1, r2                ; mov r3, ra_blk_height         # ; NxtLoop
++  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3      # ; NxtLoop
++
++  brr.anyn -, r:1b
++  asr r1, r1, ra_wt_den_p7
++  min r1, r1, ra_pmax           ; mov -, vw_wait
++  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch        # ; NxtLoop
++# >>> .anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed 16 height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # ; VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3                ; mov vw_setup, rb_dma1         # ; Stride
++  sub r1, r0, r3                ; mov vw_addr, ra_dest          # ; start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = (cur_blk_height - block_height) << i_shift23 so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  brr -, r:1b
++  add rb_lcount, rb_lcount, r0
++  add ra_dma0, ra_dma0, r1
++  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_c_b
++  m_filter_c_b 8
++
++################################################################################
++# Exit code is used by both Luma & Chroma, so it is placed between them to avoid
++# I-cache conflicts.  m_exit_drain issues ldtmu0/ldtmu1 pairs, then reads vw_wait.
++
++.macro m_exit_drain
++.if PREREAD == 2
++# Special case 2 as loop is wasteful
++  nop                   ; nop           ; ldtmu0        # discard a TMU0 result
++  nop                   ; nop           ; ldtmu1        # discard a TMU1 result
++  nop                   ; nop           ; ldtmu0        # discard a TMU0 result
++  mov -, vw_wait        ; nop           ; ldtmu1        # read vw_wait ; discard a TMU1 result
++.else
++  mov.setf r3, PREREAD - 1                              # r3 = loop counter, flags set for the branch
++:1
++  brr.anynz -, r:1b
++  nop                   ; nop           ; ldtmu0        # delay slot: discard a TMU0 result
++  nop                   ; nop           ; ldtmu1        # delay slot: discard a TMU1 result
++  sub.setf r3, r3, 1                                    # delay slot: count down, update flags
++ # >>> .anynz 1b
++  mov  -, vw_wait                                       # NOTE(review): presumably stalls until the VPM DMA write is done - confirm
++.endif
++.endm
++
++# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
++# All qpus start at the beginning and after that (group - 1) must have finished
++# before (group) can start
++#
++# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
++# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
++# lockup otherwise)
++#
++# There is some, currently ill-defined, potential lockup if we have the VDM active
++# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
++#
++# The code stalled when I had many waiters on a single sem so we have a
++# "ripple" of srels to restart.  Unsure why, may have been bug, but this works
++# and we currently have both the memory & sems to support it.
++.macro m_sync_q, n_qpu, n_quads
++# Do not generate code for qpu >= quads * 4 - fns should never be called
++.if n_qpu < n_quads * 4
++  mov ra_link, unif     # Can only branch to an 'a' reg (not r0)
++  mov -, vw_wait        # [ra_link delay]
++# Sem numbers: n_sem_sync = first QPU of this quad, n_sem_in = this QPU, n_sem_out = next QPU
++.set n_sem_sync, n_qpu - (n_qpu % 4)
++.set n_sem_in, n_qpu
++.set n_sem_out, n_qpu + 1
++
++.if n_qpu % 4 == 0
++# First QPU of a quad: sems 12.. link the quads (in = this quad, out = next quad, wrapping)
++.set n_sem_quad_in,  12 + n_qpu / 4
++.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
++
++  sacq -, n_sem_sync    # one acquire per other QPU in the quad (each srels n_sem_sync)
++  sacq -, n_sem_sync
++  sacq -, n_sem_sync
++  bra -, ra_link        # return to caller; next 3 instructions are in the branch delay
++  sacq -, n_sem_quad_in # acquire this quad's chain sem
++  srel -, n_sem_out     # start the ripple: release the next QPU in this quad
++  srel -, n_sem_quad_out # release the next quad's chain sem
++
++.else
++  bra -, ra_link        # return to caller; next 3 instructions are in the branch delay
++  srel -, n_sem_sync    # signal arrival to the first QPU of this quad
++  sacq -, n_sem_in      # wait for the ripple to reach this QPU
++.if n_sem_out % 4 != 0
++  srel -, n_sem_out     # pass the ripple on to the next QPU in the quad
++.else
++  nop                   # last QPU in the quad: nothing to pass on, pad the delay slot
++.endif
++.endif
++.endif
++.endm
++
++.set v_quads8, N_QPU_8 / 4
++
++::mc_sync_q0
++  m_sync_q 0, v_quads8
++::mc_sync_q1
++  m_sync_q 1, v_quads8
++::mc_sync_q2
++  m_sync_q 2, v_quads8
++::mc_sync_q3
++  m_sync_q 3, v_quads8
++::mc_sync_q4
++  m_sync_q 4, v_quads8
++::mc_sync_q5
++  m_sync_q 5, v_quads8
++::mc_sync_q6
++  m_sync_q 6, v_quads8
++::mc_sync_q7
++  m_sync_q 7, v_quads8
++::mc_sync_q8
++  m_sync_q 8, v_quads8
++::mc_sync_q9
++  m_sync_q 9, v_quads8
++::mc_sync_q10
++  m_sync_q 10, v_quads8
++::mc_sync_q11
++  m_sync_q 11, v_quads8
++
++# mc_exit_c_qn / mc_exit_y_qn: run m_exit_drain, then end the thread
++# Chroma & Luma the same now
++
++.macro m_exit_qn
++  m_exit_drain
++  nop                   ; nop           ; thrend        # signal thread end
++  nop                                                   # still executed after thrend
++  nop                                                   # still executed after thrend
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_qn
++::mc_exit_y_qn
++  m_exit_qn
++
++
++
++# mc_exit_c_q0 / mc_exit_y_q0: as m_exit_qn, plus sacq of sem 12 and an interrupt write
++
++.macro m_exit_q0
++  m_exit_drain
++  sacq -, 12                                            # take sem 12 back (see m_sync_q notes: must be 0 on exit)
++  nop                   ; nop           ; thrend        # signal thread end
++  mov interrupt, 1                                      # still executed after thrend: write 1 to interrupt
++  nop                                                   # still executed after thrend
++# >>> thrend <<<
++.endm
++
++::mc_exit_c_q0
++::mc_exit_y_q0
++  m_exit_q0
++
++# LUMA CODE
++
++# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
++# For P frames we make the second x,y coordinates offset by +8
++
++
++################################################################################
++# mc_setup_y - one-off luma setup: load constants, read picture limits, issue PREREAD fetches
++#
++# typedef struct qpu_mc_pred_y_s_s {
++#    qpu_mc_src_t next_src1;
++#    qpu_mc_src_t next_src2;
++#    uint16_t pic_h;
++#    uint16_t pic_w;
++#    uint32_t stride2;
++#    uint32_t stride1;
++#    uint32_t wdenom;
++#    uint32_t next_fn;
++# } qpu_mc_pred_y_s_t;
++
++.macro m_setup_y, v_bit_depth
++
++# Cannot use mul24 on x as x might be -ve, so must use shift
++.if v_bit_depth <= 8
++.set v_x_shift,         0
++.set v_pmask,           0xff
++.set v_blk_height,      Y_BLK_HEIGHT_8
++.else
++.set v_x_shift,         1
++.set v_pmask,           0xffff
++.set v_blk_height,      Y_BLK_HEIGHT_16
++.endif
++
++
++  # Need to save these because we need to know the frame dimensions before computing texture coordinates
++  mov tmurs, 1                  ; mov ra0, unif                 # No TMU swap ; x_y
++  mov ra9, unif                                                 # ref_y_base
++  mov ra1, unif                                                 # x2_y2
++
++
++# load constants
++  mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
++  shl rb_ef, r0, i_shift30      ; mov ra11, unif                # ; ref_y2_base
++
++  mov ra_kff800100, 0xff800100
++  mov rb_pmask, v_pmask
++  mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
++  mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
++  mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
++  mov rb_y_coeffs_2, 0x050b0a00
++  mov rb_y_coeffs_3, 0x11283a40
++  mov rb_y_coeffs_5, 0x0a0b0500
++
++# Compute part of VPM to use
++
++# Read image dimensions
++  mov ra3, unif                                                 # width_height
++  mov ra_ef, rb_ef              ; mov rb_xpitch, unif           # [ra3 delay] ; stride2
++.if v_x_shift == 0
++  sub rb_max_x, ra3.16b, 1
++.else
++  sub r0, ra3.16b, 1
++  shl rb_max_x, r0, v_x_shift
++.endif
++  sub rb_max_y, ra3.16a, 1
++  mov r3, elem_num              ; mov rb_pitch, unif            # stride1
++
++# get destination pitch
++  mov r1, vdw_setup_1(0)                                        # [rb_pitch delay]
++  or  rb_dma1_base, r1, rb_pitch
++
++# Compute base address for first and second access
++  add r0, ra0.16b, r3                                           # Load x + elem_num
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, 0
++  min r0, r0, rb_max_x
++  shl ra_xshift_next, r0, 3                                     # Compute shifts
++
++# X is byte offset - we can only load words - mask
++
++  and r0, r0, -4                ; v8subs r2, r2, r2             # ; r2 = 0
++  sub r2, r2, rb_pitch                                          # r2 = -rb_pitch, used as the mask below
++  and r1, r0, r2
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                                                # Add stripe offsets
++  add ra_base, ra9, r0
++
++  # r3 still contains elem_num
++  add r0, ra1.16b, r3                                           # Load x2 + elem_num
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, 0
++  min r0, r0, rb_max_x
++  shl rb_xshift2_next, r0, 3                                    # Compute shifts
++
++  # r2 still contains mask
++  and r0, r0, -4
++  and r1, r0, r2
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                                                # Add stripe offsets
++  add rb_base2, ra11, r0
++
++# Do preloads
++  nop                           ; mov r0, ra0.16a               # ; r0 = y
++  mov r3, PREREAD               ; mov r2, ra1.16a               # ; r2 = y2
++
++:1
++  sub.setf r3, r3, 1
++  max r1, r0, 0
++  min r1, r1, rb_max_y
++  add r0, r0, ra_k1             ; mul24 r1, r1, rb_pitch
++  add t0s, ra_base, r1          ; mov ra_y, r0
++
++  max r1, r2, 0
++  brr.anynz -, r:1b
++  min r1, r1, rb_max_y
++  add r2, r2, ra_k1             ; mul24 r1, r1, rb_pitch
++  add t1s, rb_base2, r1         ; mov ra_y2, r2
++# >>> .anynz 1b
++
++  m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
++
++  mov ra_link, unif                                             # Next fn
++
++# touch vertical context to keep simulator happy
++  mov ra8,  0                   ; mov rb8,  0                   # [ra_link delay]
++  bra -, ra_link
++  mov ra9,  0                   ; mov rb9,  0
++  mov ra10, 0                   ; mov rb10, 0
++  mov ra11, 0                   ; mov rb11, 0
++# >>> ra_link
++.endm
++
++::mc_setup_y_q0
++  m_setup_q0
++::mc_setup_y_qn
++  m_setup_y 8
++
++################################################################################
++#
++# Start of per-block setup code
++# P and B blocks share the same setup code to save on Icache space
++
++# get base addresses and per-channel shifts for *next* invocation
++# per-channel shifts were calculated on the *previous* invocation
++
++# 1st 3 instructions of per_block_setup are in the branch delay slots of m_luma_setup
++#
++# typedef struct qpu_mc_pred_y_p_s {
++#    qpu_mc_src_t next_src1;
++#    qpu_mc_src_t next_src2;
++#    uint16_t h;
++#    uint16_t w;
++#    uint32_t mymx21;
++#    uint32_t wo1;
++#    uint32_t wo2;
++#    uint32_t dst_addr;
++#    uint32_t next_fn;
++# } qpu_mc_pred_y_p_t;
++#
++
++.macro m_luma_setup, v_bit_depth
++# Hack - QASM may well have label pasting but I have no idea how...
++.if v_bit_depth == 8
++  brr ra_link, r:per_block_setup_8                              # call; per_block_setup comes back via bra -, ra_link
++.elif v_bit_depth == 10
++  brr ra_link, r:per_block_setup_10                             # call; 10-bit variant (not in this part of the file)
++.endif
++  mov ra0, unif                 ; mov r3, elem_num              # y_x ; elem_num has implicit unpack??
++  add.setf -, rb_ef, rb_ef      ; v8subs r5rep, r2, r2          # [ra0 delay] ; r5 = 0
++  add r0, ra0.16b, r3           ; mov rb_xshift2, rb_xshift2_next # r0 = x + elem_num
++.endm
++
++.macro m_per_block_setup, v_bit_depth
++# Shared P/B luma per-block setup: entered by brr from m_luma_setup, returns via bra -, ra_link
++.if v_bit_depth <= 8
++.set v_x_shift,         0
++.set v_x_mul,           1
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift,     7
++.set v_dma_wh_shift,    i_shift16
++.else
++.set v_x_shift,         1
++.set v_x_mul,           2
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift,     8
++.set v_dma_wh_shift,    15
++.endif
++
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, r5                ; mov ra_xshift, ra_xshift_next
++  min r0, r0, rb_max_x
++
++  shl ra_xshift_next, r0, 3                                     # Compute shifts
++  and r0, r0, -4
++  sub r2, r5, rb_pitch          ; mov ra_base_next, unif        # ; src1.base
++  and r1, r0, r2                ; mov ra_y_next, ra0.16a
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mov ra1, unif                 # Add stripe offsets ; src2.x_y
++  add ra_base_next, ra_base_next, r0                            # [ra1 delay]
++
++  add r0, ra1.16b, r3                                           # Load x2 + r3 (elem_num, set in m_luma_setup)
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++  max r0, r0, r5                ; mov ra_y2_next, ra1.16a
++  min r0, r0, rb_max_x          ; mov rb_base2_next, unif       # ; src2.base
++  shl rb_xshift2_next, r0, 3                                    # Compute shifts
++  and r0, r0, -4                ; mov ra_width_height, unif     # ; width_height
++  and r1, r0, r2                ; mov vw_setup, rb_vpm_init     # ; set up VPM write
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mul24 r1, ra_width, v_x_mul   # Add stripe offsets ; r1 = width in bytes
++  add rb_base2_next, rb_base2_next, r0
++
++# get width,height of block (unif load above), r1 = width * pel_size
++  sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height             # Compute vdw_setup1(dst_pitch-width)
++  add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
++  add rb_lcount, r0, (7-8)
++  shl r0, r0, v_dma_h_shift     ; mov r3, ra_kmul_add           # ; r3 return val
++  add r0, r0, r1                                                # Combine width and height of destination area
++  shl r0, r0, v_dma_wh_shift    ; mov r2, ra_fir_off_val        # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
++  add ra_dma0, r0, rb_dma0_base ; mov r0, unif                  # ; Packed filter offsets
++
++# get filter coefficients and discard unused B frame values
++  shl.ifnn r0, r0, i_shift16    ; mov ra_wt_off_mul_l0, unif    #  Pick half to use ; L0 offset/weight
++  shl ra8, r0, 3                ; mov rb5, ra_k255
++
++# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
++
++# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
++# but I can't see a way of doing that that is cheap enough to be worth it
++
++# Picked out in a slightly random order to space out uniform loads
++
++  # 1
++  mov r1, 0x01040400            # [ra8 delay]
++  ror ra2.8b, r1, ra8.8d
++  ror ra0.8b, r1, ra8.8c
++  # 2
++  ror ra2.8c, rb_y_coeffs_2, ra8.8d
++  ror ra0.8c, rb_y_coeffs_2, ra8.8c
++  # 0
++  mov r1,0x00010100             # -ve  [ra8 delay]
++  ror r0, r1, ra8.8d            ; mov ra_wt_off_mul_l1, unif    # ; L1 Wt/Offset
++  ror ra0.8a, r1, ra8.8c        ; v8min rb4, r0, rb5
++  # 7
++  shl r1, r1, 8                 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000
++  ror r0, r1, ra8.8d            ; mov ra_dest, unif             # ; Destination address
++  ror ra1.8d, r1, ra8.8c        ; v8min rb11, r0, rb5
++  # 3
++  ror ra2.8d, rb_y_coeffs_3, ra8.8d
++  ror ra0.8d, rb_y_coeffs_3, ra8.8c
++  # 5
++  ror ra3.8b, rb_y_coeffs_5, ra8.8d
++  ror ra1.8b, rb_y_coeffs_5, ra8.8c
++  # 6
++  mov r1,0x04040100
++  ror ra3.8c, r1, ra8.8d
++  ror ra1.8c, r1, ra8.8c        ; mov r5rep, -8                 # ; r5 return val
++
++  bra -, ra_link
++  # 4
++  mov r1,0x3a281100
++  ror r0, r1, ra8.8d            ; mov ra_link, unif             # ; link - load after we've used its previous val
++  ror ra1.8a, r1, ra8.8c        ; v8min rb8, r0, rb5
++# >>> branch ra_link
++
++# r5 = -8
++# r2 = fir_off_val
++# r3 = ra_kmul_add (128)
++.endm
++
++:per_block_setup_8
++  m_per_block_setup 8
++
++
++
++################################################################################
++#
++# mc_filter_y_pxx
++#
++# Setup (& therefore uniform struct) shared with _bxx
++# Struct in m_luma_setup
++#
++# We can have 2 separate P reqs here as long as they mate to generate a
++# rectangular output block (i.e. h0 = h1, w0 = 8)
++#
++# At this point we have already issued PREREAD pairs of texture requests for the current block
++
++.macro m_filter_y_pxx, v_bit_depth
++
++# denom shift values
++.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
++
++  m_luma_setup v_bit_depth
++
++  shl r1, ra_wt_off_l0, i_wt_den_p5
++  add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul
++  sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
++
++# retrieve texture results and pick out bytes
++# then submit two more texture requests
++
++# This loop is identical to the B loop from here --->
++:1
++  add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
++
++  max r2, ra_y, 0               ; mov r1, 0
++  min r2, r2, rb_max_y          ; mov r3, ra_k1
++  add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
++  add t0s, ra_base, r2          ; mov rb5,  rb6
++  shr r0, r4, ra_xshift         ; mov rb6,  rb7
++
++  max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1 # ; masks out all but wanted bytes
++  shr r1, r4, rb_xshift2        ; mov rb7, ra8
++  min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
++  add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
++  add t1s, rb_base2, r2         ; mov ra8,  ra9
++
++# apply horizontal filter
++  add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++  mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
++  sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++  sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++  sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
++  add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++  brr.anyn -, r:1b
++  sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
++  mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
++  asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++  # >>> .anyn 1b (r5 + r5)
++
++  # apply vertical filter and write to VPM
++  # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
++
++  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++  sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
++  add r1, r1, r0                ; mul24 r0, ra8,  rb8
++  add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++  add r1, r1, r0                ; mul24 r0, ra11, rb11
++# <--- to here
++  sub.setf -, r5, rb_i_tmu      ; mov r3, ra_blk_height                 # ; NxtLoop: r3 = block height
++  sub r1, r1, ra4               ; mov.ifz rb_base2, rb_base2_next
++  sub r1, r1, r0                ; mov.ifz ra_base, ra_base_next
++
++  asr r1, r1, 6                 ; mov.ifz ra_y_y2, ra_y_y2_next
++  sub.setf -, r5, rb_lcount     ; mul24 r0, r1, ra_wt_mul_l0
++  add r0, r0, rb_wt_off         ; mul24 r1, r1, ra_kmul_add
++  sub r1, r0, r1                ; v8subs r0, ra_height, r3              # ; NxtLoop: r0 = remaining height (0 saturate)
++
++  brr.anyn -, r:1b
++  asr r1, r1, i_wt_den_p6
++  min r1, r1, ra_pmax           ; mov -, vw_wait
++  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch                # ; NxtLoop
++# >>> branch.anyn 1b (r5 - rb_lcount)
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3) last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3                ; mov vw_setup, rb_dma1 # Stride
++  sub r1, r0, r3                ; mov vw_addr, ra_dest  # start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  brr -, r:1b
++  add rb_lcount, rb_lcount, r0
++  add ra_dma0, ra_dma0, r1
++  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_pxx
++  m_filter_y_pxx 8
++
++
++################################################################################
++
++# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
++#
++# Setup (& therefore uniform struct) shared with _pxx
++# Struct in m_luma_setup
++#
++# l0 calc in els 0-7, L1 in 8-15
++# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh)
++#
++# At this point we have already issued PREREAD pairs of texture requests for the current block
++
++.macro m_filter_y_bxx, v_bit_depth
++
++# denom shift values
++.set i_wt_den_p5,                  (DENOM + 13 - v_bit_depth)
++.set i_wt_den_p6,                  (DENOM + 14 - v_bit_depth)
++
++  m_luma_setup v_bit_depth
++
++  shl r1, ra_wt_off_l0, i_wt_den_p6
++  add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
++  sub r1, r1, r0                ; mul24 r0, r2, ra_wt_mul_l1
++  sub rb_wt_off, r1, r0         ; mov ra_ef.8a, rb4
++
++# This loop is identical to the P loop from here --->
++:1
++  add.setf -, ra_ef, ra_ef      ; mul24 ra4, rb5, ra_ef
++
++  max r2, ra_y, 0               ; mov r1, 0
++  min r2, r2, rb_max_y          ; mov r3, ra_k1
++  add ra_y, ra_y, r3            ; mul24 r2, r2, rb_pitch        ; ldtmu0
++  add t0s, ra_base, r2          ; mov rb5,  rb6
++  shr r0, r4, ra_xshift         ; mov rb6,  rb7
++
++  max r2, ra_y2, r1             ; v8min r0, r0, rb_pmask        ; ldtmu1 # ; masks out all but wanted bytes
++  shr r1, r4, rb_xshift2        ; mov rb7, ra8
++  min r2, r2, rb_max_y          ; v8min r1, r1, ra_pmax
++  add ra_y2, ra_y2, r3          ; mul24 r2, r2, rb_pitch
++  add t1s, rb_base2, r2         ; mov ra8,  ra9
++
++# apply horizontal filter
++  add r5rep, r5, r3     ; mul24      r2, ra0.8a << 8,  r1 << 8  @ "mul_used", 0
++  mov r3, rb_fir_off_h  ; mul24.ifnn r2, ra0.8a,       r0
++  sub r2, r3, r2        ; mul24      r3, ra0.8b << 1,  r0 << 1  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra0.8b << 9,  r1 << 9  @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra0.8c << 2,  r0 << 2  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
++  sub r2, r2, r3        ; mul24      r3, ra0.8d << 3,  r0 << 3  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra1.8a << 4,  r0 << 4  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra1.8b << 5,  r0 << 5  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
++  sub r2, r2, r3        ; mul24      r3, ra1.8c << 6,  r0 << 6  @ "mul_used", 0
++  nop                   ; mul24.ifn  r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
++  add r2, r2, r3        ; mul24      r3, ra1.8d << 7,  r0 << 7  @ "mul_used", 0
++  add.setf -, r5, r5    ; mul24.ifn  r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
++
++  brr.anyn -, r:1b
++  sub r2, r2, r3                ; mul24 r1, rb5,  ra2.8b
++  mov ra9,  rb10                ; mul24 r0, rb10, ra3.8b
++  asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
++  # >>> .anyn 1b (r5 + r5)
++
++  # apply vertical filter and write to VPM
++  # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
++
++  sub r1, r1, r0                ; mul24 r0, rb6,  ra2.8c
++  sub r1, r1, r0                ; mul24 r0, rb7,  ra2.8d
++  add r1, r1, r0                ; mul24 r0, ra8,  rb8
++  add r1, r1, r0                ; mul24 r0, rb10, ra3.8c
++  add r1, r1, r0                ; mul24 r0, ra11, rb11
++# <--- to here
++  sub r1, r1, ra4
++  sub r1, r1, r0                ; mov r2, rb_wt_off
++
++  asr r1, r1, 6
++  sub.setf -, r5, rb_i_tmu      ; mul24 r0, r1, ra_wt_mul_l0
++  mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
++  sub r1, r0, r1                ; mov.ifz ra_y_y2, ra_y_y2_next
++  sub.setf -, r5, rb_lcount     ; mov.ifz ra_base, ra_base_next
++  add r1, r1, r2                ; mov r0, r1 << 8
++  add r1, r1, r0                ; mov r3, ra_blk_height         # ; NxtLoop: r3 = block height
++
++  brr.anyn -, r:1b
++  asr r1, r1, ra_wt_den_p7      ; mul24 r2, r3, rb_pitch        # ; NxtLoop
++  min r1, r1, ra_pmax           ; mov -, vw_wait
++  max vpm, r1, 0                ; v8subs r0, ra_height, r3      # ; NxtLoop: r0 = remaining height (0 saturate)
++# >>> branch.anyn 1b (r5 - rb_lcount)
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3                ; mov vw_setup, rb_dma1         # Stride
++  sub r1, r0, r3                ; mov vw_addr, ra_dest          # start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link (ra_height - remaining height)
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  brr -, r:1b
++  add rb_lcount, rb_lcount, r0
++  add ra_dma0, ra_dma0, r1
++  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VDM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_bxx
++  m_filter_y_bxx 8
++
++################################################################################
++#
++# typedef struct qpu_mc_pred_y_p00_s {
++#    qpu_mc_src_t next_src1;
++#    uint16_t h;
++#    uint16_t w;
++#    uint32_t wo1;
++#    uint32_t dst_addr;
++#    uint32_t next_fn;
++# } qpu_mc_pred_y_p00_t;
++
++.macro m_filter_y_p00, v_bit_depth
++# Unfiltered single-source luma: each fetched pixel is weighted (ra_wt_mul_l0), offset (rb_wt_off), shifted and clamped, then written to the VPM
++.if v_bit_depth <= 8
++.set v_x_shift,         0
++.set v_x_mul,           1
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift,     7
++.set v_dma_wh_shift,    i_shift16
++.else
++.set v_x_shift,         1
++.set v_x_mul,           2
++# Shifts to get width & height in the right place in ra_dma0
++.set v_dma_h_shift,     8
++.set v_dma_wh_shift,    15
++.endif
++
++  mov ra0, unif                 ; mov r0, elem_num              # y_x
++  mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5          # [ra0 delay] ; r5 = 0
++  add r0, ra0.16b, r0           ; mov ra_base_next, unif        # ; src1.base
++.if v_x_shift != 0
++  shl r0, r0, v_x_shift
++.endif
++
++  max r0, r0, r5                ; mov ra_y_next, ra0.16a        # ; width_height
++  min r0, r0, rb_max_x          ; mov ra_width_height, unif
++
++  shl ra_xshift_next, r0, 3                                     # Compute shifts
++  and r0, r0, -4
++  sub r2, r5, rb_pitch          ; mov ra_wt_off_mul_l0, unif    # ; weight_offset
++  and r1, r0, r2
++  xor r0, r0, r1                ; mul24 r1, r1, rb_xpitch
++  add r0, r0, r1                ; mov ra_dest, unif             # Add stripe offsets ; dest addr
++  add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init  # [ra_width delay] ; set up VPM write
++
++# get width,height of block (unif load above)
++# Compute vdw_setup1(dst_pitch-width)
++  shl r1, ra_width, v_x_shift
++  sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
++  sub rb_i_tmu, r0, PREREAD     ; v8min r0, r0, ra_blk_height
++  shl r0, r0, v_dma_h_shift     ; mov rb_lcount, r0
++  add r0, r0, r1                                                # Combine width and height of destination area
++  shl rb_wt_off, ra_wt_off_l0, DENOM + 7
++  shl r0, r0, v_dma_wh_shift    ; mov ra_link, unif             # Shift into bits 16 upwards of the vdw_setup0 register ; link
++  add ra_dma0, r0, rb_dma0_base
++
++:1
++  sub.setf -, r5, rb_i_tmu      ; v8adds r5rep, r5, ra_k1
++  nop                           ; mov.ifz ra_y, ra_y_next       ; ldtmu0
++  shr r0, r4, ra_xshift         ; mov r3, rb_pitch
++
++  max r2, ra_y, 0  # y, clamped to 0..rb_max_y by this max and the following min
++  min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
++  add ra_y, ra_y, 1             ; mul24 r2, r2, r3
++  add t0s, ra_base, r2          ; v8min r0, r0, rb_pmask
++
++  sub.setf -, r5, rb_lcount     ; mul24 r1, r0, ra_wt_mul_l0
++  shl r1, r1, 8                 ; mov r3, ra_blk_height
++  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++
++  brr.anyn -, r:1b
++  asr r1, r1, DENOM + 8
++  min r1, r1, ra_pmax           ; mov -, vw_wait
++  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3) last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3        ; mov vw_setup, rb_dma1 # Stride
++  sub r1, r0, r3        ; mov vw_addr, ra_dest  # start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  brr -, r:1b
++  add rb_lcount, rb_lcount, r0
++  add ra_dma0, ra_dma0, r1
++  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_p00
++  m_filter_y_p00 8
++
++################################################################################
++
++.macro m_filter_y_b00, v_bit_depth
++# luma setup does a fair bit more than we need calculating filter coeffs
++# that we will never use but it saves I-cache to use it (also simple!)
++  m_luma_setup v_bit_depth
++
++# Fix up vals that were expecting a filter (somewhat icky)
++  mov r2, 1
++  add rb_i_tmu, rb_i_tmu, r2    ; mov r1, ra_wt_off_mul_l0      # Need in rX rather than raX for <<8 to do what we want
++  shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero
++  nop                           ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
++
++:1
++  sub.setf -, r5, rb_i_tmu      ; nop                           ; ldtmu1
++  shr r1, r4, rb_xshift2        ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
++  shr r0, r4, ra_xshift         ; mov r3, rb_pitch
++
++  max r2, ra_y, 0  # y, clamped to 0..rb_max_y by this max and the following min
++  min r2, r2, rb_max_y          ; mov.ifz ra_base, ra_base_next
++  add ra_y, ra_y, 1             ; mul24 r2, r2, r3
++  add t0s, ra_base, r2          ; mov.ifz rb_base2, rb_base2_next
++
++  max r2, ra_y2, 0
++  min r2, r2, rb_max_y
++  add ra_y2, ra_y2, 1           ; mul24 r2, r2, r3
++  add t1s, rb_base2, r2         ; v8min r0, r0, ra_pmax         # v8min (comment used to say v8subs) - presumably masks out all but the wanted bytes; TODO confirm
++  and r1, r1, rb_pmask          ; mul24 r0, r0, ra_wt_mul_l0
++
++  sub.setf -, r5, rb_lcount     ; mul24 r1, r1, ra_wt_mul_l1
++  add r1, r0, r1                ; v8adds r5rep, r5, ra_k1
++
++  shl r1, r1, 8                 ; mov r3, ra_blk_height
++  add r1, r1, rb_wt_off         ; v8subs r0, ra_height, r3
++
++  brr.anyn -, r:1b
++  asr r1, r1, (DENOM + 9) - 32                                  # -32 to get valid shift immediate
++  min r1, r1, ra_pmax           ; mov -, vw_wait
++  max vpm, r1, ra_k0            ; mul24 r2, r3, rb_pitch
++# >>> branch.anyn 1b
++
++# r0 = remaining height (min 0)
++# r2 = r3 * rb_pitch
++# r3 = block_height
++
++# If looping again then we consumed block_height (r3) last loop
++# rb_dma1 (stride) remains constant
++# rb_i_tmu remains const (based on total height)
++# recalc ra_dma0, rb_lcount based on new segment height
++
++  mov.setf ra_height, r0        ; mov vw_setup, ra_dma0         # ; VDW setup 0
++
++# DMA out
++  bra.anyz -, ra_link
++  min r0, r0, r3                ; mov vw_setup, rb_dma1         # ; Stride
++  sub r1, r0, r3                ; mov vw_addr, ra_dest          # ; start the VDW
++  shl r1, r1, i_shift23
++# >>> .anyz ra_link
++
++# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
++# We add to dma0 to reduce the number of output lines in the final block
++  brr -, r:1b
++  add rb_lcount, rb_lcount, r0
++  add ra_dma0, ra_dma0, r1
++  add ra_dest, ra_dest, r2      ; mov vw_setup, rb_vpm_init     # ; Reset our VPM write pointer
++# >>> 1b
++.endm
++
++::mc_filter_y_b00
++  m_filter_y_b00 8
++
++################################################################################
++################################################################################
++# 10 BIT
++
++::mc_setup_c10_q0
++  m_setup_q0
++::mc_setup_c10_qn
++  m_setup_c 10
++
++::mc_filter_c10_p
++  m_filter_c_p 0, 10
++
++::mc_filter_c10_p_l1
++  m_filter_c_p 1, 10
++
++
++::mc_filter_c10_b
++  m_filter_c_b 10
++
++# Even if these fns are the same as for other bit depths we want our own copy
++# to keep the code we are using in a single lump to avoid (direct map) cache
++# thrashing
++.set v_quads10, N_QPU_16 / 4
++
++::mc_sync10_q0
++  m_sync_q 0, v_quads10
++::mc_sync10_q1
++  m_sync_q 1, v_quads10
++::mc_sync10_q2
++  m_sync_q 2, v_quads10
++::mc_sync10_q3
++  m_sync_q 3, v_quads10
++::mc_sync10_q4
++  m_sync_q 4, v_quads10
++::mc_sync10_q5
++  m_sync_q 5, v_quads10
++::mc_sync10_q6
++  m_sync_q 6, v_quads10
++::mc_sync10_q7
++  m_sync_q 7, v_quads10
++::mc_sync10_q8
++  m_sync_q 8, v_quads10
++::mc_sync10_q9
++  m_sync_q 9, v_quads10
++::mc_sync10_q10
++  m_sync_q 10, v_quads10
++::mc_sync10_q11
++  m_sync_q 11, v_quads10
++
++::mc_exit_y10_q0
++::mc_exit_c10_q0
++  m_exit_q0
++
++::mc_exit_y10_qn
++::mc_exit_c10_qn
++  m_exit_qn
++
++::mc_setup_y10_q0
++  m_setup_q0
++::mc_setup_y10_qn
++  m_setup_y 10
++
++:per_block_setup_10
++  m_per_block_setup 10
++
++::mc_filter_y10_pxx
++  m_filter_y_pxx 10
++
++::mc_filter_y10_p00
++  m_filter_y_p00 10
++
++::mc_filter_y10_bxx
++  m_filter_y_bxx 10
++
++::mc_filter_y10_b00
++  m_filter_y_b00 10
++
++
++
++::mc_end
++# Do not add code here because mc_end must appear after all other code.
+diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h
+new file mode 100644
+index 0000000000..89711d776b
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_cmd.h
+@@ -0,0 +1,165 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#ifndef RPI_SHADER_CMD_H
++#define RPI_SHADER_CMD_H
++
++#pragma pack(push, 4)
++
++#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
++// If mixed then we are just confused and get a lot of warnings....
++typedef const uint8_t * qpu_mc_src_addr_t;
++typedef uint8_t * qpu_mc_dst_addr_t;
++#else
++typedef uint32_t qpu_mc_src_addr_t;
++typedef uint32_t qpu_mc_dst_addr_t;
++#endif
++
++typedef struct qpu_mc_src_s
++{
++    int16_t y;
++    int16_t x;
++    qpu_mc_src_addr_t base;
++} qpu_mc_src_t;
++
++
++typedef struct qpu_mc_pred_c_p_s {
++    qpu_mc_src_t next_src;
++    uint16_t h;
++    uint16_t w;
++    uint32_t coeffs_x;
++    uint32_t coeffs_y;
++    uint32_t wo_u;
++    uint32_t wo_v;
++    qpu_mc_dst_addr_t dst_addr_c;
++    uint32_t next_fn;
++} qpu_mc_pred_c_p_t;
++
++typedef struct qpu_mc_pred_c_b_s {
++    qpu_mc_src_t next_src1;
++    uint16_t h;
++    uint16_t w;
++    uint32_t coeffs_x1;
++    uint32_t coeffs_y1;
++    int16_t weight_u1;
++    int16_t weight_v1;
++    qpu_mc_src_t next_src2;
++    uint32_t coeffs_x2;
++    uint32_t coeffs_y2;
++    uint32_t wo_u2;
++    uint32_t wo_v2;
++    qpu_mc_dst_addr_t dst_addr_c;
++    uint32_t next_fn;
++} qpu_mc_pred_c_b_t;
++
++typedef struct qpu_mc_pred_c_s_s {
++    qpu_mc_src_t next_src1;
++    uint32_t pic_cw;            // C Width (== Y width / 2)
++    uint32_t pic_ch;            // C Height (== Y Height / 2)
++    uint32_t stride2;
++    uint32_t stride1;
++    qpu_mc_src_t next_src2;
++    uint32_t next_fn;
++} qpu_mc_pred_c_s_t;
++
++typedef struct qpu_mc_pred_c_s {
++    union {
++        qpu_mc_pred_c_p_t p;
++        qpu_mc_pred_c_b_t b;
++        qpu_mc_pred_c_s_t s;
++    };
++} qpu_mc_pred_c_t;
++
++
++typedef struct qpu_mc_pred_y_p_s {
++    qpu_mc_src_t next_src1;
++    qpu_mc_src_t next_src2;
++    uint16_t h;
++    uint16_t w;
++    uint32_t mymx21;
++    uint32_t wo1;
++    uint32_t wo2;
++    qpu_mc_dst_addr_t dst_addr;
++    uint32_t next_fn;
++} qpu_mc_pred_y_p_t;
++
++typedef struct qpu_mc_pred_y_p00_s {
++    qpu_mc_src_t next_src1;     // y_x word then base; QPU p00 code loads them into its *_next registers
++    uint16_t h;                 // h and w are fetched by the QPU as one "width_height" word
++    uint16_t w;
++    uint32_t wo1;               // fetched by the QPU as "weight_offset"
++    qpu_mc_dst_addr_t dst_addr; // fetched by the QPU as "dest addr"
++    uint32_t next_fn;           // link: last word of the cmd, the one qpu_mc_link_set() overwrites
++} qpu_mc_pred_y_p00_t;
++
++typedef struct qpu_mc_pred_y_s_s {
++    qpu_mc_src_t next_src1;
++    qpu_mc_src_t next_src2;
++    uint16_t pic_h;
++    uint16_t pic_w;
++    uint32_t stride2;
++    uint32_t stride1;
++    uint32_t next_fn;
++} qpu_mc_pred_y_s_t;
++
++typedef struct qpu_mc_pred_sync_s {
++    uint32_t next_fn;
++} qpu_mc_pred_sync_t;
++
++// Only a useful structure in that it allows us to return something other than a void *
++typedef struct qpu_mc_pred_y_s {
++    union {
++        qpu_mc_pred_y_p_t p;
++        qpu_mc_pred_y_p00_t p00;
++        qpu_mc_pred_y_s_t s;
++    };
++} qpu_mc_pred_y_t;
++
++typedef union qpu_mc_pred_cmd_u {
++    qpu_mc_pred_y_t y;
++    qpu_mc_pred_c_t c;
++    qpu_mc_pred_sync_t sync;
++} qpu_mc_pred_cmd_t;
++
++static inline void qpu_mc_link_set(qpu_mc_pred_cmd_t * const cmd, const uint32_t fn)
++{
++    // Link is last el of previous cmd, i.e. the uint32_t immediately before cmd; store fn there
++    ((uint32_t *)cmd)[-1] = fn;
++}
++
++#define QPU_MC_PRED_N_Y8        12
++#define QPU_MC_PRED_N_C8        12
++
++#define QPU_MC_PRED_N_Y10       12
++#define QPU_MC_PRED_N_C10       12
++
++#define QPU_MC_DENOM            7
++
++#pragma pack(pop)
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c
+new file mode 100644
+index 0000000000..77d8366eb8
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.c
+@@ -0,0 +1,88 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include "hevc.h"
++#include "rpi_hevcdec.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++
++typedef struct shader_track_s
++{
++    const union qpu_mc_pred_cmd_u *qpu_mc_curr;
++    const struct qpu_mc_src_s *last_l0;
++    const struct qpu_mc_src_s *last_l1;
++    uint32_t width;  // pic_width * PW
++    uint32_t height;
++    uint32_t stride2;
++    uint32_t stride1;
++} shader_track_t;
++
++static int wtoidx(const unsigned int w)
++{
++    static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++    return pel_weight[w];
++}
++
++static int fctom(uint32_t x)
++{
++    int rv;
++    // As it happens we can take the 2nd filter term & divide it by 8
++    // (dropping fractions) to get the fractional move; returned by value so no const on the type
++    rv = 8 - ((x >> 11) & 0xf);
++    av_assert2(rv >= 0 && rv <= 7);
++    return rv;
++}
++
++static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr)
++{
++    return (int32_t)((uint32_t)x << shl) >> shr; // shift left as unsigned (signed overflow is UB), then arithmetic shift right to sign-extend
++}
++
++static inline int woff_p(HEVCRpiContext *const s, int32_t x)
++{
++    return ext(x, 0, 17 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int woff_b(HEVCRpiContext *const s, int32_t x)
++{
++    return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8);
++}
++
++static inline int wweight(int32_t x)
++{
++    return ext(x, 16, 16);
++}
++
++
++#define PW 1
++#include "rpi_hevc_shader_template_fn.h"
++
++#undef PW
++#define PW 2
++#include "rpi_hevc_shader_template_fn.h"
++
+diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h
+new file mode 100644
+index 0000000000..0fc5a45e9f
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template.h
+@@ -0,0 +1,49 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
++
++struct HEVCRpiContext;
++struct HEVCRpiInterPredEnv;
++
++void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s,
++                  const struct HEVCRpiInterPredEnv *const ipe_y,
++                  const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s,
++                  const struct HEVCRpiInterPredEnv *const ipe_y,
++                  const struct HEVCRpiInterPredEnv *const ipe_c);
++
++void rpi_sand_dump8(const char * const name,
++                    const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++void rpi_sand_dump16(const char * const name,
++                     const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
++
++#endif
++
+diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h
+new file mode 100644
+index 0000000000..10c163a4b9
+--- /dev/null
++++ b/libavcodec/rpi_hevc_shader_template_fn.h
+@@ -0,0 +1,502 @@
++/*
++Copyright (c) 2017 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++#define PATCH_STRIDE (16 * PW)
++
++static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++    // For each of h rows: replicate the pixel at the start of the src row across w bytes of the dst row
++    for (unsigned int row = 0; row < h; ++row) {
++        const pixel fill = *(const pixel *)(src + row * stride);
++        pixel * const out = (pixel *)(dst + row * stride);
++        for (unsigned int off = 0; off < w; off += PW)
++            out[off / PW] = fill;
++    }
++}
++
++static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride)
++{
++    // Copy the single w-byte row at src into h successive rows starting at dst
++    for (; h != 0; --h, dst += stride)
++        memcpy(dst, src, w);
++}
++
++static void FUNC(get_patch_y)(const shader_track_t * const st,
++                         uint8_t * dst, const unsigned int dst_stride,
++                         const qpu_mc_src_t *src,
++                         unsigned int _w, unsigned int _h)
++{
++    int x = src->x * PW;
++    int y = src->y;
++    int w = _w * PW;
++    int h = _h;
++    int dl = 0;
++    int dr = 0;
++    int dt = 0;
++    int db = 0;
++
++    if (x < 0) {
++        if (-x >= w)
++            x = PW - w;
++        dl = -x;
++        w += x;
++        x = 0;
++    }
++    if (x + w > st->width) {
++        if (x >= st->width)
++            x = st->width - PW;
++        dr = (x + w) - st->width;
++        w = st->width - x;
++    }
++
++    // Y
++    if (y < 0) {
++        if (-y >= h)
++            y = 1 - h;
++        dt = -y;
++        h += y;
++        y = 0;
++    }
++    if (y + h > st->height) {
++        if (y >= st->height)
++            y = st->height - 1;
++        db = (y + h) - st->height;
++        h = st->height - y;
++    }
++
++    dst += dl + dt * dst_stride;
++    FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++    // Edge dup
++    if (dl != 0)
++        FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
++    if (dr != 0)
++        FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
++    w += dl + dr;
++    dst -= dl;
++
++    if (dt != 0)
++        FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
++    if (db != 0)
++        FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
++}
++
++
++// Read a _w x _h patch at (src->x, src->y) into planar dst_u / dst_v via av_rpi_sand_to_planar_c; parts outside st->width x st->height are filled by dup_lr / dup_tb
++static void FUNC(get_patch_c)(const shader_track_t * const st,
++                         uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
++                         const qpu_mc_src_t *src,
++                         unsigned int _w, unsigned int _h)
++{
++    int x = src->x * PW; // horizontal values are scaled by PW from here on
++    int y = src->y;
++    int w = _w * PW;
++    int h = _h;
++    int dl = 0; // amount to fill left of the picture (same units as w)
++    int dr = 0; // amount to fill right of the picture
++    int dt = 0; // rows to fill above the picture
++    int db = 0; // rows to fill below the picture
++    const int width = st->width;
++    const int height = st->height;
++
++    if (x < 0) { // patch starts left of the picture
++        if (-x >= w)
++            x = PW - w; // wholly outside: clamp so that w ends up as PW below
++        dl = -x;
++        w += x;
++        x = 0;
++    }
++    if (x + w > width) { // patch runs past the right edge
++        if (x >= width)
++            x = width - PW; // wholly outside: start PW before the edge
++        dr = (x + w) - width;
++        w = width - x;
++    }
++
++    // Y: same clipping vertically, always keeping at least one row
++    if (y < 0) {
++        if (-y >= h)
++            y = 1 - h;
++        dt = -y;
++        h += y;
++        y = 0;
++    }
++    if (y + h > height) {
++        if (y >= height)
++            y = height - 1;
++        db = (y + h) - height;
++        h = height - y;
++    }
++
++    dst_u += dl + dt * dst_stride; // skip the border area; the in-picture part is written first
++    dst_v += dl + dt * dst_stride;
++    FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
++
++    // Edge dup: left/right first, then top/bottom across the widened rows
++    if (dl != 0)
++    {
++        FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
++        FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
++    }
++    if (dr != 0)
++    {
++        FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
++        FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
++    }
++    w += dl + dr;
++    dst_u -= dl;
++    dst_v -= dl;
++
++    if (dt != 0)
++    {
++        FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
++        FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
++    }
++    if (db != 0)
++    {
++        FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
++        FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
++    }
++}
++
++// x, y, w, h in pixels
++// stride1, stride2 in bytes
++void FUNC(rpi_sand_dump)(const char * const name,
++                         const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
++{
++    const int mask = stride2 == 0 ? ~0 : stride1 - 1; // stride2 == 0: mask keeps the whole x offset, so the stride2 term of p is 0
++
++    printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
++
++    if (is_c) { // chroma: two entries per pixel, so double the horizontal range
++        x *= 2;
++        w *= 2;
++    }
++
++    for (int i = y; i != y + h; ++i) {
++        for (int j = x; j != x + w; ++j) {
++            const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2;
++            char sep = is_c && (j & 1) == 0 ? ':' : ' '; // ':' joins the two entries of a chroma pair
++#if PW == 1
++            if (j < 0 || i < 0) // negative coordinates are shown as dots, never read
++                printf("..%c", sep);
++            else
++                printf("%02x%c", *(const pixel*)p, sep);
++#else
++            if (j < 0 || i < 0) // negative coordinates are shown as dots, never read
++                printf("...%c", sep);
++            else
++                printf("%03x%c", *(const pixel*)p, sep);
++#endif
++        }
++        printf("\n");
++    }
++}
++
++// Walk each queue's qpu_mc_pred_cmd_t list (luma from ipe_y, then chroma from ipe_c) and execute every command with the s->hevcdsp prediction functions
++void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s,
++                  const HEVCRpiInterPredEnv *const ipe_y,
++                  const HEVCRpiInterPredEnv *const ipe_c)
++{
++    for (int c_idx = 0; c_idx < 2; ++c_idx) // 0 = luma (ipe_y), 1 = chroma (ipe_c)
++    {
++        const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
++        shader_track_t tracka[QPU_N_MAX] = {{NULL}}; // per-queue walk state, kept across sync points
++        unsigned int exit_n = 0;
++
++        if (ipe == NULL || !ipe->used) {
++            continue;
++        }
++
++        do {
++            for (unsigned int i = 0; i != ipe->n; ++i) {
++                const HEVCRpiInterPredQ * const q = ipe->q + i;
++                shader_track_t * const st = tracka + i;
++                const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; // resume where this queue stopped, else start at its base
++
++                for (;;) {
++                    const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1]; // link word sits just before the command; the first command is always setup
++
++                    if (link == q->code_setup) {
++                        if (c_idx == 0) {
++                            // Luma
++                            const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
++
++                            st->height = c->pic_h;
++                            st->width = c->pic_w * PW;
++                            st->stride1 = c->stride1;
++                            st->stride2 = c->stride2;
++                            st->last_l0 = &c->next_src1;
++                            st->last_l1 = &c->next_src2;
++                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                        }
++                        else {
++                            // Chroma
++                            const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
++
++                            st->height = c->pic_ch;
++                            st->width = c->pic_cw * PW;
++                            st->stride1 = c->stride1;
++                            st->stride2 = c->stride2;
++                            st->last_l0 = &c->next_src1;
++                            st->last_l1 = &c->next_src2;
++                            cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                        }
++                    }
++                    else if (link == s->qpu.y_pxx) { // luma uni-weighted qpel; columns beyond 8 come from the second source (last_l1)
++                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++                        const int w1 = FFMIN(c->w, 8);
++                        const int w2 = c->w - w1;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++                        FUNC(get_patch_y)(st,
++                                    patch_y1, PATCH_STRIDE,
++                                    st->last_l0,
++                                    16, c->h + 7);
++                        if (w2 > 0) {
++                            FUNC(get_patch_y)(st,
++                                        patch_y2, PATCH_STRIDE,
++                                        st->last_l1,
++                                        16, c->h + 7);
++                        }
++
++                        // wo[offset] = offset*2+1
++                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++                            c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
++                        if (w2 > 0) {
++                            s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++                                (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++                                c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
++                        }
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.y_bxx) { // luma bi-pred: qpel on source 1 into patch_y3, then qpel_bi_w with source 2
++                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++                        FUNC(get_patch_y)(st,
++                                    patch_y1, PATCH_STRIDE,
++                                    st->last_l0,
++                                    16, c->h + 7);
++                        FUNC(get_patch_y)(st,
++                                    patch_y2, PATCH_STRIDE,
++                                    st->last_l1,
++                                    16, c->h + 7);
++
++                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
++                           patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
++                           c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
++
++                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
++                            c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
++                            0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w);
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.y_p00) { // luma uni-weighted with mx = my = 0; the patch is used without the 3-sample offset
++                        const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++
++                        FUNC(get_patch_y)(st,
++                                    patch_y1, PATCH_STRIDE,
++                                    st->last_l0,
++                                    16, c->h + 7);
++
++                        // wo[offset] = offset*2+1
++                        s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
++                            c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
++
++                        st->last_l0 = &c->next_src1;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.y_b00) { // luma bi-pred with mx = my = 0
++                        const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
++
++                        uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
++
++                        av_assert0(c->w <= 16 && c->h <= 64);
++
++                        FUNC(get_patch_y)(st,
++                                    patch_y1, PATCH_STRIDE,
++                                    st->last_l0,
++                                    16, c->h);
++                        FUNC(get_patch_y)(st,
++                                    patch_y2, PATCH_STRIDE,
++                                    st->last_l1,
++                                    16, c->h);
++
++                        s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
++                           patch_y3, patch_y1, PATCH_STRIDE,
++                           c->h, 0, 0, c->w);
++
++                        s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
++                            (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
++                            c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
++                            0, woff_b(s, c->wo2), 0, 0, c->w);
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.c_pxx) { // chroma uni-weighted epel from last_l0; U and V are filtered separately, then written via av_rpi_planar_to_sand_c
++                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++                        const int mx = fctom(c->coeffs_x);
++                        const int my = fctom(c->coeffs_y);
++
++                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_u3[8 * 16 * PW];
++                        uint8_t patch_v3[8 * 16 * PW];
++
++                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++                        st->last_l0 = &c->next_src;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.c_pxx_l1) { // as c_pxx, but reads and updates last_l1
++                        const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
++                        const int mx = fctom(c->coeffs_x);
++                        const int my = fctom(c->coeffs_y);
++
++                        uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
++                        uint8_t patch_u3[8 * 16 * PW];
++                        uint8_t patch_v3[8 * 16 * PW];
++
++                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
++                        s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
++                            patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                            c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
++
++                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++                        st->last_l1 = &c->next_src;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == s->qpu.c_bxx) { // chroma bi-pred: epel on source 1 into patch_u4/v4, then epel_bi_w with source 2
++                        const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
++                        const int mx1 = fctom(c->coeffs_x1);
++                        const int my1 = fctom(c->coeffs_y1);
++                        const int mx2 = fctom(c->coeffs_x2);
++                        const int my2 = fctom(c->coeffs_y2);
++
++                        uint8_t patch_u1[PATCH_STRIDE * 72];
++                        uint8_t patch_v1[PATCH_STRIDE * 72];
++                        uint8_t patch_u2[PATCH_STRIDE * 72];
++                        uint8_t patch_v2[PATCH_STRIDE * 72];
++                        uint8_t patch_u3[8 * 16 * PW];
++                        uint8_t patch_v3[8 * 16 * PW];
++                        uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE];
++                        uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
++
++                        FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
++                        FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
++
++                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++                           patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                           c->h, mx1, my1, c->w);
++                        s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
++                           patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
++                           c->h, mx1, my1, c->w);
++
++                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++                            patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
++                            c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2),
++                            0, woff_b(s, c->wo_u2), mx2, my2, c->w);
++                        s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
++                            patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
++                            c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2),
++                            0, woff_b(s, c->wo_v2), mx2, my2, c->w);
++
++                        FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
++
++                        st->last_l0 = &c->next_src1;
++                        st->last_l1 = &c->next_src2;
++                        cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
++                    }
++                    else if (link == q->code_sync) { // step over the sync word, then move on to the next queue
++                        cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1);
++                        break;
++                    }
++                    else if (link == q->code_exit) {
++                        // We expect exit to occur without other sync
++                        av_assert0(i == exit_n);
++                        ++exit_n;
++                        break;
++                    }
++                    else {
++                        av_assert0(0);
++                    }
++                }
++
++                st->qpu_mc_curr = cmd;
++            }
++        } while (exit_n == 0); // another round over all queues until one reports exit (all are asserted to exit in the same round)
++    }
++}
++
++#undef FUNC
++#undef pixel
++
+diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
+new file mode 100644
+index 0000000000..3caef20137
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform.s
+@@ -0,0 +1,444 @@
++# ******************************************************************************
++# Argon Design Ltd.
++# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
++#
++# Module : HEVC
++# Author : Peter de Rivaz
++# ******************************************************************************
++
++# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
++# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
++.set USE_STACK, 0
++
++# Lines that fail to assemble start with #:
++# The script insert_magic_opcodes.sh inserts the machine code directly for these.
++# HEVC VPU Transform
++#
++# Transform matrix can be thought of as
++#   output row vector = input row vector * transMatrix2
++#
++# The even rows of the matrix are symmetric
++# The odd rows of the matrix are antisymmetric
++#
++# So only need to compute the first half of the results, then can compute the remainder with a butterfly
++#
++# EXAMPLE
++#   (a b c d) (1 2  2  1)
++#             (3 4 -4 -3)
++#             (5 6  6  5)
++#             (7 8 -8 -7)
++#
++#  x=(a c)(1 2) = 1a+5c 2a+6c
++#         (5 6)
++#
++#  y=(b d)(3 4) = 3b+7d 4b+8d
++#         (7 8)
++#
++#  u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
++#  v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
++#
++#  Final results are (u , v[::-1])
++#
++#
++#  For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
++#  Apply the even matrix first and stop before rounding
++#  Then apply the odd matrix in a full manner:
++#
++#   First step is to compute partial products with the first input (16 cycles)
++#   1a 3b 5c 7d   16x1 input coefficients produce 16x16 output
++#   2a 4b 6c 8d
++#   2a -4b 6c -8d
++#   1a -3b 5c -7d
++#
++#   Second step is to sum partial products into final position (8 cycles)
++#   1a+3b+5c+7d
++#   2a+4b+6c+8d
++#   2a-4b+6c-8d
++#   1a-3b+5c-7d
++#
++#   Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
++#
++#   For 16x16 no butterfly is required and can store final results in original location  (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
++#
++#   For 8x8 we could compute two in parallel.
++#
++#
++
++# Columns are transformed first
++#
++# Store top left half of transMatrix2 in HX(32,0)
++# Store bottom left half of transMatrix2 in HX(32,32)
++#
++# For 16x16
++# HX(0:15,0) contains input data before transform
++# HY(0:15,0) contains 32bit output data after transform
++# HX(32,0) contains even rows of left half of transMatrix2
++# HX(32,32) contains odd rows of left half of transMatrix2
++# HY(48,0) contains partial products ready for summing
++#
++
++
++# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 16x16 transforms to be done
++# coeffs32
++# num32: number of 32x32 transforms
++# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
++#
++
++.equ TRANS_SHIFT, 20 - BIT_DEPTH # BIT_DEPTH is not defined in this file; it must be supplied from outside
++.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) # Rounding constant added in the second pass
++.equ TRANS_ASL2, 16 - TRANS_SHIFT # Left shift applied after the second-pass rounding
++
++
++hevc_trans_16x16:
++  push r6-r15, lr # TODO cut down number of used registers
++  mov r14,r3 # coeffs32
++  mov r15,r4 # num32
++  mov r3, 16*2 # Stride of transMatrix2 in bytes
++  vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
++
++  add r0, 16*16*2 # For 32x32 transforms we also need this matrix
++  vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++  # Now use r0 to describe which matrix we are working on.
++  # Allows us to prefetch the next block of coefficients for efficiency.
++  mov r0,0 # This describes the location where we read our coefficients from
++  mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
++  mov r7,16*16*2 # Total block size
++  mov r8,64*16 # Value used to swap from current to next VRF location
++  mov r4,64 # Constant used for rounding first pass
++  mov r5,TRANS_RND2 # Constant used for rounding second pass
++
++  sub sp,sp,64+16*16*2 # Move on stack pointer in case interrupt occurs and uses stack
++
++  add r11,sp,64 # Space for 32 bytes before, and rounding
++  lsr r11,5
++  lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32
++
++  lsr r10, r2, 16 # Number of compressed blocks stored in top short
++  extu r2,16 # Keep the low short: number of 16x16 transforms
++  # At start of block r0,r1 point to the current block (that has already been loaded)
++  # r0 VRF location of current block
++  # r1 address of current block
++  # r2 number of 16*16 transforms to do
++  # r3 Stride of coefficients (==32)
++  # r4 TRANS_RND1 (64)
++  # r5 TRANS_RND2
++  # r6 temporary used inside col_trans16
++  # r7 16*16*2 total bytes in block
++  # r8 64*16 VRF switch locations
++  # r9 temporary in unpack_coeff for index
++  # r10 number of 16x16 transforms using compression
++  # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer)
++  # r12 temporary counter in unpack_coeff
++  # r13
++  # r14 Save information for 32 bit transform (coeffs location)
++  # r15 Save information for 32 bit transform (number of transforms)
++  cmp r2,0
++  beq done16x16s
++block_loop:
++  # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests
++  cmp r10,0
++  mov r6, r1
++  beq not_compressed
++  sub r10, 1
++  bl unpack16x16
++not_compressed:
++  #mov r6,r1 # DEBUG without compress
++  vldh HX(0++,0)+r0,(r6 += r3) REP 16
++  #eor r0,r8
++  #add r1,r7
++  # Prefetch the next block
++  #bl unpack16x16
++  #vldh HX(0++,0)+r0,(r6 += r3) REP 16
++  #vmov HX(0++,0)+r0,0 REP 16  # DEBUG
++  #eor r0,r8
++  #sub r1,r7
++
++  # Transform the current block
++  bl col_trans_16
++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16   # Now add on rounding, shift down by 7, and saturate
++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
++  vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16    # This should be saturating, but the instruction above does not assemble?
++  vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16    # For simplicity transpose this back to the original position
++
++  bl col_trans_16
++  vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16   # Second pass: add on rounding (TRANS_RND2); the shift below leaves the result in the top half
++  #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
++  vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16    # This should be saturating, but the instruction above does not assemble?  (Probably because it ends with ls which is interpreted as a condition flag)
++
++  # Save results - note there has been a transposition during the processing so we save columns
++  vsth VX(0,32++)+r0, (r1 += r3) REP 16
++
++  # Move onto next block
++  eor r0,r8
++  add r1,r7
++
++  addcmpbgt r2,-1,0,block_loop
++done16x16s:
++
++  add sp,sp,64+16*16*2 # Restore the stack pointer (undoes the sub before the loop)
++  # Now go and do any 32x32 transforms
++  b hevc_trans_32x32
++
++  pop r6-r15, pc # Not reached: the branch above is unconditional and hevc_trans_32x32 ends with the matching pop
++# This returns a value in r6 that says where to load the data from.
++# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it.
++unpack16x16:
++# Clear out destination
++  vmov HX(0,0)+r0,0
++  mov r6, r11
++  vsth HX(0,0)+r0,(r6 += r3) REP 16 # Write the zeroed row 16 times over the unpacked buffer at r11
++  mov r5, r1 # Moving pointer to input coefficients (r5 is reloaded with TRANS_RND2 on exit)
++unpack_outer_loop:
++  # Loop until we find the end
++  vldh HX(0,0)+r0,(r5)  # TODO would prefetch help here while unpacking previous?
++  sub r6,r11,32
++  #add r6,pc,packed_data-$ # Packed data
++  vsth HX(0,0)+r0,(r6)  # Store into packed data
++  mov r12,0
++unpack_loop:
++  ld r4,(r6) # Next packed 32-bit entry
++  add r6,r6,4
++  lsr r9,r4,16 # r9 is destination value
++  cmp r4,0 # {value,index}
++  extu r4,8
++  beq done_unpack
++  sth r9,(r11, r4)
++  addcmpblt r12,1,8,unpack_loop # 8 entries per 32-byte chunk
++#  # Read next 16
++  add r5,32
++  b unpack_outer_loop
++done_unpack:
++#  # Set new load location
++  mov r6, r11
++  #add r6,pc,unpacked_data-$
++#  # Restore constants
++  mov r4,64
++  mov r5,TRANS_RND2
++#  pop r6-r15, pc
++  b lr
++
++# r1,r2,r3 r7,r8 should be preserved
++# HX(0++,0)+r0 is the block to be transformed
++# HX(32++,0) is the 16x16 matrix of transform coefficients (r6 is clobbered: it holds the loop bound)
++# Use HY(48,0) for intermediate results
++# r0 can be used, but should be returned to its original value at the end
++col_trans_16:
++  add r6,r0,16 # Final value for this loop
++col_trans_16_loop:
++  # First compute partial products for a single column
++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
++  # Then sum up the results and place back
++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++  addcmpblt r0,1,r6,col_trans_16_loop
++  sub r0,16  # put r0 back to its original value
++  b lr
++# Same as col_trans_16, but multiplies by the odd matrix held at VX(32,32++)
++col_trans_odd_16:
++  add r6,r0,16 # Final value for this loop
++col_trans_odd_16_loop:
++  # First compute partial products for a single column
++  vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
++  # Then sum up the results and place back
++  vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
++  addcmpblt r0,1,r6,col_trans_odd_16_loop
++  sub r0,16  # put r0 back to its original value
++  b lr
++
++# r1/r10 input pointer
++# r0,r4,r5,r6 free
++# r8/r9 output storage
++#
++# Store packed coefficients at r9-32
++# Store unpacked at r9+32*32*2 bytes (because transform works on even/odd rows on input, but writes all rows)
++unpack32x32:
++# Clear out destination
++  vmov HX(0,0),0
++  add r0, r9, 32*32*2 # Unpacked buffer
++  mov r4, 32
++  vsth HX(0,0),(r0 += r4) REP 64
++unpack_outer_loop32:
++  # Loop until we find the end
++  vldh HX(0,0),(r1)  # TODO would prefetch help here while unpacking previous?
++  sub r6,r9,32
++  #add r6,pc,packed_data-$ # Packed data
++  vsth HX(0,0),(r6)  # Store into packed data
++  mov r8,0
++unpack_loop32:
++  ld r4,(r6)
++  add r6,r6,4
++  lsr r5,r4,16 # r5 is destination value
++  cmp r4,0 # {value,index}
++  extu r4,10
++  beq done_unpack32 # Leave through this routine's own return
++  sth r5,(r0, r4)
++  addcmpblt r8,1,8,unpack_loop32
++#  # Read next 16
++  add r1,32
++  b unpack_outer_loop32
++done_unpack32:
++  b lr
++# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
++# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
++# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
++# num: number of 32x32 transforms to be done in low 16, number of packed in high 16
++# Reached by branch from hevc_trans_16x16 with coeffs in r14 and num in r15 (matrices already loaded into the VRF there)
++# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first!
++hevc_trans_32x32:
++  mov r1,r14 # coeffs
++  mov r2,r15 # num
++  lsr r15,r15,16 # Number that are packed
++  extu r2,16 # Total number
++
++  # Fetch odd transform matrix
++  #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
++  #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
++  #add r0, 16*16*2
++  #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
++
++  mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
++  mov r7, 16*16*2 # Total block size
++
++.if USE_STACK
++  # Stack base allocation
++  sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking
++  # set r8 to 32byte aligned stack pointer with 32 bytes of space before it
++  add r8,sp,63
++  lsr r8,5
++  lsl r8,5
++.else
++#:version r8
++  .half 0x00e8 #AUTOINSERTED
++  btst r8,16
++#:add r8,pc,intermediate_results-$
++  .half 0xbfe8
++  .half intermediate_results-($-2)
++  beq on_vpu1
++  add r8,r8,32*32*2*2+16*2 # Move to secondary storage
++on_vpu1:
++.endif
++  mov r9,r8  # Backup of the temporary storage
++  mov r10,r1 # Backup of the coefficient buffer
++
++  cmp r2,0
++  beq done32x32s
++block_loop32:
++
++  # Transform the first 16 columns
++  mov r1,r10  # Input Coefficient buffer
++  mov r8,r9   # Output temporary storage
++  # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed)
++  cmp r2,r15
++  bgt not_compressed_32
++  bl unpack32x32
++  add r1,r9,32*32*2   # Uncompressed into temporary storage
++  mov r8,r9           # Transform into here
++not_compressed_32:
++  # COLUMN TRANSFORM
++  mov r4, 64 # Constant used for rounding first pass
++  mov r5, 9 # left shift used for rounding first pass
++
++  bl trans32
++  # Transform the second 16 columns
++  add r8,32*16*2
++  add r1,32
++  bl trans32
++
++  # ROW TRANSFORM
++  mov r4, TRANS_RND2 # Constant used for rounding second pass
++  mov r5, TRANS_ASL2 # left shift used for rounding second pass
++
++  mov r1,r9  # Input temporary storage
++  mov r8,r10   # Output Coefficient buffer
++  bl trans32
++  # Transform the second 16 columns
++  add r8,32*16*2
++  add r1,32
++  bl trans32
++
++  add r10, 32*32*2 # move onto next block of coefficients
++  addcmpbgt r2,-1,0,block_loop32
++done32x32s:
++
++.if USE_STACK
++  add sp,sp,32*32*4+64# Restore stack
++.endif
++
++  pop r6-r15, pc # Pairs with the push at the top of hevc_trans_16x16
++# One pass over 16 columns. In: r1 = input, r3 = stride between alternate rows, r4 = rounding constant, r5 = left shift, r8 = output. Clobbers r0 and r6
++trans32:
++  push lr
++  # We can no longer afford the VRF space to do prefetching when doing 32x32
++  # Fetch the even rows
++  vldh HX(0++,0),(r1 += r3) REP 16
++  # Fetch the odd rows
++  vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
++
++  # Transform the even rows using even matrix
++  mov r0, 0 # Even rows
++  bl col_trans_16
++
++  # Now transform the odd rows using odd matrix
++  mov r0, 64*16 # Odd rows
++  bl col_trans_odd_16
++
++  # Now apply butterfly to compute the first 16 results
++  vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # then shift left by r5 so the 16bit result sits in the top half
++  # 16bit results now in HX(48,32)
++  mov r0,r8
++  mov r6,32*2
++  vsth VX(48,32++),(r0+=r6) REP 16
++
++  # Now apply butterfly to compute the second 16 results (in reverse order)
++  vsub HY(63,0),HY(0 ,0),HY(16,0)
++  vsub HY(62,0),HY(1 ,0),HY(17,0)
++  vsub HY(61,0),HY(2 ,0),HY(18,0)
++  vsub HY(60,0),HY(3 ,0),HY(19,0)
++  vsub HY(59,0),HY(4 ,0),HY(20,0)
++  vsub HY(58,0),HY(5 ,0),HY(21,0)
++  vsub HY(57,0),HY(6 ,0),HY(22,0)
++  vsub HY(56,0),HY(7 ,0),HY(23,0)
++  vsub HY(55,0),HY(8 ,0),HY(24,0)
++  vsub HY(54,0),HY(9 ,0),HY(25,0)
++  vsub HY(53,0),HY(10,0),HY(26,0)
++  vsub HY(52,0),HY(11,0),HY(27,0)
++  vsub HY(51,0),HY(12,0),HY(28,0)
++  vsub HY(50,0),HY(13,0),HY(29,0)
++  vsub HY(49,0),HY(14,0),HY(30,0)
++  vsub HY(48,0),HY(15,0),HY(31,0)
++  vadd HY(48++,0),HY(48++,0),r4 REP 16   # add on rounding,
++  vasl HY(48++,0),HY(48++,0),r5 REP 16    # then shift left by r5 so the 16bit result sits in the top half
++  add r0,r8,32
++  vsth VX(48,32++),(r0+=r6) REP 16
++  pop pc
++
++.if USE_STACK == 0
++  .balign 32
++
++# .space directives generate 0's in the bin so avoid unnecessary padding by
++# just setting to appropriate value
++.equ intermediate_results, $+16*2
++
++# Layout goes:
++#
++#packed_buffer:
++#  .space 16*2
++#intermediate_results:
++#  .space 32*32*2
++#unpacked_buffer:
++#  .space 32*32*2
++#
++#packed_buffer2:
++#  .space 16*2
++#intermediate_results2:
++#  .space 32*32*2
++#unpacked_buffer2:
++#  .space 32*32*2
++.endif
++
++
+diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
+new file mode 100644
+index 0000000000..1c364492d0
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform10.h
+@@ -0,0 +1,94 @@
++static const unsigned char rpi_hevc_transform10 [] = {
++0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
++0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
++0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
++0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
++0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
++0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
++0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x02,   // 0030
++0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
++0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
++0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
++0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
++0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
++0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
++0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
++0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
++0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
++0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
++0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
++0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x06,  0x04,   // 0090
++0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
++0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
++0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
++0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
++0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
++0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
++0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
++0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
++0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
++0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
++0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
++0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
++0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
++0x00,  0x02,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
++0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
++0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
++0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
++0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
++0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
++0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
++0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
++0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
++0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
++0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
++0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
++0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
++0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
++0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
++0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
++0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
++0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
++0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
++0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
++0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
++0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
++0x04,  0xb0,  0x00,  0x02,  0x65,  0x60,  0x91,  0x40,   // 01d8
++0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
++0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
++0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
++0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
++0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
++0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
++0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
++0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
++0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
++0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
++0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
++0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
++0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
++0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
++0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
++0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
++0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
++0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
++0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
++0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
++0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
++0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
++0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
++0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
++0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
++0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
++0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
++0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
++0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
++};
+diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
+new file mode 100644
+index 0000000000..1128a2c054
+--- /dev/null
++++ b/libavcodec/rpi_hevc_transform8.h
+@@ -0,0 +1,94 @@
++static const unsigned char rpi_hevc_transform8 [] = {
++0xa9,  0x03,  0x3e,  0x40,  0x4f,  0x40,  0x03,  0xb0,   // 0000
++0x20,  0x00,  0x0c,  0xf8,  0x38,  0x88,  0x80,  0x03,   // 0008
++0xc0,  0xf8,  0x00,  0x00,  0x40,  0xb0,  0x00,  0x02,   // 0010
++0x0c,  0xf8,  0x38,  0xa8,  0x80,  0x03,  0xc0,  0xf8,   // 0018
++0x00,  0x00,  0x00,  0x60,  0x03,  0xb0,  0x20,  0x00,   // 0020
++0x07,  0xb0,  0x00,  0x02,  0x08,  0xb0,  0x00,  0x04,   // 0028
++0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,  0x00,  0x08,   // 0030
++0x59,  0xb0,  0xc0,  0xfd,  0x0b,  0x12,  0x5b,  0x7a,   // 0038
++0x5b,  0x7c,  0x4a,  0xc3,  0x50,  0x17,  0x02,  0x6f,   // 0040
++0x02,  0x6a,  0x32,  0x18,  0x0a,  0x6a,  0x16,  0x40,   // 0048
++0x04,  0x18,  0x1a,  0x66,  0x80,  0x90,  0x32,  0x00,   // 0050
++0x0c,  0xf8,  0x38,  0x80,  0x80,  0x03,  0xc0,  0x08,   // 0058
++0x18,  0x00,  0x80,  0x90,  0x51,  0x00,  0x04,  0xff,   // 0060
++0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,  0x10,  0x00,   // 0068
++0x4c,  0xfe,  0x30,  0xc0,  0x09,  0x04,  0x20,  0x08,   // 0070
++0x00,  0x00,  0x04,  0xfc,  0x38,  0x90,  0x80,  0x02,   // 0078
++0xc0,  0x0b,  0x02,  0x00,  0x80,  0x90,  0x40,  0x00,   // 0080
++0x04,  0xff,  0x30,  0xc0,  0x80,  0x03,  0x20,  0x08,   // 0088
++0x14,  0x00,  0x4c,  0xfe,  0x30,  0xc0,  0x04,  0x04,   // 0090
++0x20,  0x08,  0x00,  0x00,  0x8c,  0xf8,  0x2c,  0xe0,   // 0098
++0x80,  0x03,  0x20,  0x30,  0x04,  0x00,  0x80,  0x45,   // 00a0
++0x71,  0x42,  0xf2,  0x8c,  0xd1,  0xc0,  0x59,  0xb0,   // 00a8
++0x40,  0x02,  0x00,  0x9e,  0x6d,  0x00,  0x29,  0x03,   // 00b0
++0x00,  0xf4,  0x38,  0x80,  0x00,  0x0c,  0xb6,  0x40,   // 00b8
++0x8c,  0xf8,  0x20,  0xe0,  0x80,  0x03,  0x00,  0x30,   // 00c0
++0x18,  0x00,  0x15,  0x40,  0x08,  0xf0,  0x38,  0x80,   // 00c8
++0x85,  0x0b,  0x66,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 00d0
++0x24,  0xe0,  0x86,  0x03,  0x0c,  0x60,  0x64,  0x08,   // 00d8
++0x46,  0x62,  0x49,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 00e0
++0x84,  0x6e,  0x07,  0x18,  0x69,  0xa0,  0x04,  0x5f,   // 00e8
++0x1c,  0x8b,  0xf7,  0xc8,  0x45,  0x76,  0x6b,  0x1f,   // 00f0
++0xb6,  0x40,  0x04,  0xb0,  0x40,  0x00,  0x05,  0xb0,   // 00f8
++0x00,  0x08,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0100
++0xa4,  0xff,  0x24,  0xcc,  0x60,  0x02,  0x00,  0xf8,   // 0108
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0110
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0118
++0x00,  0x67,  0x5a,  0x00,  0x06,  0xb4,  0x10,  0x00,   // 0120
++0xa4,  0xff,  0x24,  0xcc,  0xe0,  0x02,  0x00,  0xf8,   // 0128
++0x3e,  0x00,  0x03,  0xff,  0x37,  0xd0,  0x78,  0x03,   // 0130
++0xe0,  0x03,  0xbe,  0x0b,  0x10,  0x8b,  0xf6,  0x5b,   // 0138
++0x00,  0x67,  0x5a,  0x00,  0x00,  0xf4,  0x38,  0x80,   // 0140
++0x00,  0x04,  0x20,  0xb5,  0x00,  0x08,  0x04,  0xb0,   // 0148
++0x20,  0x00,  0x8e,  0xf8,  0x20,  0xe0,  0x80,  0x03,   // 0150
++0xc0,  0x43,  0x00,  0x00,  0x08,  0xf0,  0x38,  0x80,   // 0158
++0x81,  0x03,  0x26,  0xb5,  0xe0,  0xff,  0x88,  0xf0,   // 0160
++0x20,  0xe0,  0x86,  0x03,  0x08,  0x60,  0x64,  0x08,   // 0168
++0x46,  0x62,  0x45,  0xc3,  0x50,  0x27,  0x04,  0x6a,   // 0170
++0xa4,  0x6e,  0x7f,  0x90,  0xbf,  0xff,  0x65,  0xa0,   // 0178
++0x04,  0x07,  0x18,  0x8b,  0xf6,  0xc8,  0x41,  0x76,   // 0180
++0x6a,  0x1f,  0x5a,  0x00,  0xe1,  0x40,  0xf2,  0x40,   // 0188
++0x0f,  0x7b,  0x02,  0x6f,  0x03,  0xb0,  0x80,  0x00,   // 0190
++0x07,  0xb0,  0x00,  0x02,  0xe8,  0x00,  0x08,  0x6d,   // 0198
++0xe8,  0xbf,  0x60,  0x01,  0x03,  0x18,  0x48,  0xb0,   // 01a0
++0x20,  0x10,  0x89,  0x40,  0x1a,  0x40,  0x02,  0x6a,   // 01a8
++0x24,  0x18,  0xa1,  0x40,  0x98,  0x40,  0xf2,  0x4a,   // 01b0
++0x06,  0x1e,  0xff,  0x9f,  0xc5,  0xff,  0x21,  0xb5,   // 01b8
++0x00,  0x08,  0x98,  0x40,  0x04,  0xb0,  0x40,  0x00,   // 01c0
++0x95,  0x60,  0x80,  0x90,  0x18,  0x00,  0x48,  0xb0,   // 01c8
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x13,  0x00,   // 01d0
++0x04,  0xb0,  0x00,  0x08,  0x45,  0x60,  0x91,  0x40,   // 01d8
++0xa8,  0x40,  0x80,  0x90,  0x0c,  0x00,  0x48,  0xb0,   // 01e0
++0x00,  0x04,  0x41,  0x76,  0x80,  0x90,  0x07,  0x00,   // 01e8
++0x4a,  0xb0,  0x00,  0x08,  0xf2,  0x8c,  0xdf,  0xc0,   // 01f0
++0x29,  0x03,  0xef,  0x03,  0x0c,  0xf8,  0x38,  0x80,   // 01f8
++0x80,  0x03,  0xc0,  0xf8,  0x04,  0x00,  0x0c,  0xf8,   // 0200
++0x38,  0x84,  0xc0,  0x03,  0xc0,  0xf8,  0x04,  0x00,   // 0208
++0x00,  0x60,  0xff,  0x9f,  0x79,  0xff,  0x00,  0xb0,   // 0210
++0x00,  0x04,  0xff,  0x9f,  0x85,  0xff,  0x04,  0xff,   // 0218
++0x30,  0xcc,  0x10,  0x03,  0xe0,  0xfb,  0x3e,  0x00,   // 0220
++0x04,  0xff,  0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,   // 0228
++0x10,  0x00,  0x4c,  0xfe,  0x33,  0xcc,  0x80,  0x03,   // 0230
++0xe0,  0xfb,  0x14,  0x00,  0x80,  0x40,  0x06,  0xb0,   // 0238
++0x40,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,  0x80,  0x03,   // 0240
++0xe0,  0x63,  0x00,  0x00,  0x20,  0xf7,  0xf0,  0xcf,   // 0248
++0x10,  0x03,  0x20,  0xf7,  0xb0,  0xcf,  0x11,  0x13,   // 0250
++0x20,  0xf7,  0x70,  0xcf,  0x12,  0x23,  0x20,  0xf7,   // 0258
++0x30,  0xcf,  0x13,  0x33,  0x20,  0xf7,  0xf0,  0xce,   // 0260
++0x14,  0x43,  0x20,  0xf7,  0xb0,  0xce,  0x15,  0x53,   // 0268
++0x20,  0xf7,  0x70,  0xce,  0x16,  0x63,  0x20,  0xf7,   // 0270
++0x30,  0xce,  0x17,  0x73,  0x20,  0xf7,  0xf0,  0xcd,   // 0278
++0x18,  0x83,  0x20,  0xf7,  0xb0,  0xcd,  0x19,  0x93,   // 0280
++0x20,  0xf7,  0x70,  0xcd,  0x1a,  0xa3,  0x20,  0xf7,   // 0288
++0x30,  0xcd,  0x1b,  0xb3,  0x20,  0xf7,  0xf0,  0xcc,   // 0290
++0x1c,  0xc3,  0x20,  0xf7,  0xb0,  0xcc,  0x1d,  0xd3,   // 0298
++0x20,  0xf7,  0x70,  0xcc,  0x1e,  0xe3,  0x20,  0xf7,   // 02a0
++0x30,  0xcc,  0x1f,  0xf3,  0x04,  0xff,  0x33,  0xcc,   // 02a8
++0x80,  0x03,  0xe0,  0xfb,  0x10,  0x00,  0x4c,  0xfe,   // 02b0
++0x33,  0xcc,  0x80,  0x03,  0xe0,  0xfb,  0x14,  0x00,   // 02b8
++0x00,  0xb5,  0x20,  0x00,  0x8c,  0xf8,  0x2f,  0xe0,   // 02c0
++0x80,  0x03,  0xe0,  0x63,  0x00,  0x00,  0x6f,  0x03,   // 02c8
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d0
++0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,   // 02d8
++};
+diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
+new file mode 100644
+index 0000000000..e651e5c565
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.c
+@@ -0,0 +1,6134 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2012 - 2013 Mickael Raulet
++ * Copyright (C) 2012 - 2013 Gildas Cocherel
++ * Copyright (C) 2012 - 2013 Wassim Hamidouche
++ * Copyright (C) 2018 John Cox, Ben Avison, Peter de Rivaz for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/attributes.h"
++#include "libavutil/common.h"
++#include "libavutil/display.h"
++#include "libavutil/internal.h"
++#include "libavutil/mastering_display_metadata.h"
++#include "libavutil/md5.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/stereo3d.h"
++
++#include "decode.h"
++#include "bswapdsp.h"
++#include "bytestream.h"
++#include "golomb.h"
++#include "hevc.h"
++#include "rpi_hevc_data.h"
++#include "rpi_hevc_parse.h"
++#include "rpi_hevcdec.h"
++#include "rpi_hevc_cabac_fns.h"
++#include "profiles.h"
++#include "hwconfig.h"
++
++#include "rpi_zc_frames.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_shader_cmd.h"
++#include "rpi_hevc_shader_template.h"
++#include "rpi_zc.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "pthread.h"
++#include <stdatomic.h>
++
++#define DEBUG_DECODE_N 0   // 0 = do all, n = frames idr onwards
++
++#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff))
++
++#ifndef av_mod_uintp2
++static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p)  // fallback: a mod 2^p, i.e. the low p bits of a (p < 32)
++{
++    return a & ((1U << p) - 1);  // 1U keeps the shift unsigned so p == 31 is well defined (1 << 31 overflows int)
++}
++#   define av_mod_uintp2   av_mod_uintp2_c
++#endif
++
++const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 };
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first);
++
++#define MC_DUMMY_X (-32)
++#define MC_DUMMY_Y (-32)
++
++// UV & Y both have min 4x4 pred (no 2x2 chroma)
++// Allow for even spread +1 for setup, +1 for rounding
++// As we have load sharing this can (in theory) be exceeded so we have to
++// check after each CTU, but it is a good base size
++
++// Worst case (all 4x4) commands per CTU
++#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
++#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
++
++#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64)
++
++#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
++#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS)
++
++#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2)
++#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2)
++
++// Total cmds to allocate - allow for slack & setup
++#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
++#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
++
++#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
++#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
++
++// The QPU code for UV blocks only works up to a block width of 8
++#define RPI_CHROMA_BLOCK_WIDTH 8
++
++#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24)
++
++
++// Actual filter goes -ve, +ve, +ve, -ve using these values
++static const uint32_t rpi_filter_coefs[8] = {
++        ENCODE_COEFFS(  0,  64,   0,  0),
++        ENCODE_COEFFS(  2,  58,  10,  2),
++        ENCODE_COEFFS(  4,  54,  16,  2),
++        ENCODE_COEFFS(  6,  46,  28,  4),
++        ENCODE_COEFFS(  4,  36,  36,  4),
++        ENCODE_COEFFS(  4,  28,  46,  6),
++        ENCODE_COEFFS(  2,  16,  54,  4),
++        ENCODE_COEFFS(  2,  10,  58,  2)
++};
++
++// Function arrays by QPU
++
++static const int * const inter_pred_setup_c_qpu[12] = {
++    mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++    mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
++    mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
++};
++
++static const int * const inter_pred_setup_c10_qpu[12] = {
++    mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++    mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
++    mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
++};
++
++static const int * const inter_pred_setup_y_qpu[12] = {
++    mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++    mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
++    mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
++};
++
++static const int * const inter_pred_setup_y10_qpu[12] = {
++    mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++    mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
++    mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
++};
++
++static const int * const inter_pred_sync_qpu[12] = {
++    mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
++    mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
++    mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
++};
++
++static const int * const inter_pred_sync10_qpu[12] = {
++    mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
++    mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
++    mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
++};
++
++static const int * const inter_pred_exit_c_qpu[12] = {
++    mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++    mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
++    mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
++};
++
++static const int * const inter_pred_exit_c10_qpu[12] = {
++    mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++    mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
++    mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
++};
++
++static const int * const inter_pred_exit_y_qpu[12] = {
++    mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++    mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
++    mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
++};
++
++static const int * const inter_pred_exit_y10_qpu[12] = {
++    mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++    mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
++    mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
++};
++
++typedef struct ipe_chan_info_s
++{
++    const uint8_t bit_depth;
++    const uint8_t n;
++    const int * const * setup_fns;
++    const int * const * sync_fns;
++    const int * const * exit_fns;
++} ipe_chan_info_t;
++
++typedef struct ipe_init_info_s
++{
++    ipe_chan_info_t luma;
++    ipe_chan_info_t chroma;
++} ipe_init_info_t;
++
++static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
++{
++    switch (ln)
++    {
++        default:  // normally 0
++            *b = a;
++            break;
++        case 1:
++            a |= a << 8;
++            *(uint16_t *)b = a;
++            b += stride;
++            *(uint16_t *)b = a;
++            break;
++        case 2:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b = a;
++            b += stride;
++            *(uint32_t *)b = a;
++            b += stride;
++            *(uint32_t *)b = a;
++            b += stride;
++            *(uint32_t *)b = a;
++            break;
++        case 3:
++        {
++            unsigned int i;
++            uint64_t d;
++            a |= a << 8;
++            a |= a << 16;
++            d = ((uint64_t)a << 32) | a;
++            for (i = 0; i != 8; ++i, b += stride)
++                *(uint64_t *)b = d;
++            break;
++        }
++        case 4:
++        {
++            unsigned int i;
++            uint64_t d;
++            a |= a << 8;
++            a |= a << 16;
++            d = ((uint64_t)a << 32) | a;
++            for (i = 0; i != 16; ++i, b += stride)
++            {
++                *(uint64_t *)b = d;
++                *(uint64_t *)(b + 8) = d;
++            }
++            break;
++        }
++    }
++}
++
++// We expect this to be called with ln = (log2_cb_size - 3) so range =  -1..3
++// (4 not required)
++static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
++{
++    switch (ln)
++    {
++        default:  // 0 or -1
++            *b_u = a;
++            *b_l = a;
++            break;
++        case 1:
++            a |= a << 8;
++            *(uint16_t *)b_u = a;
++            *(uint16_t *)b_l = a;
++            break;
++        case 2:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b_u = a;
++            *(uint32_t *)b_l = a;
++            break;
++        case 3:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b_u = a;
++            *(uint32_t *)(b_u + 4) = a;
++            *(uint32_t *)b_l = a;
++            *(uint32_t *)(b_l + 4) = a;
++            break;
++        case 4:
++            a |= a << 8;
++            a |= a << 16;
++            *(uint32_t *)b_u = a;
++            *(uint32_t *)(b_u + 4) = a;
++            *(uint32_t *)(b_u + 8) = a;
++            *(uint32_t *)(b_u + 12) = a;
++            *(uint32_t *)b_l = a;
++            *(uint32_t *)(b_l + 4) = a;
++            *(uint32_t *)(b_l + 8) = a;
++            *(uint32_t *)(b_l + 12) = a;
++            break;
++    }
++}
++
++static void zap_cabac_stash(uint8_t * b, const int ln)
++{
++    switch (ln)
++    {
++        default:  // 0
++            *b = 0;
++            break;
++        case 1:
++            *(uint16_t *)b = 0;
++            break;
++        case 2:
++            *(uint32_t *)b = 0;
++            break;
++        case 3:
++            *(uint32_t *)b = 0;
++            *(uint32_t *)(b + 4) = 0;
++            break;
++    }
++}
++
++
++
++// Set a square block of (1 << ln) x (1 << ln) bits in a bitmap; x = bit column, stride = bytes per row
++// Bits must be aligned on their size boundary (which will be true of all split CBs)
++static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
++{
++    unsigned int n;
++    const unsigned int sh = (x & 7);  // bit offset within the byte
++
++    f += (x >> 3);  // byte that holds bit x
++
++    av_assert2(ln <= 3);
++    av_assert2((x & ((1 << ln) - 1)) == 0);  // alignment precondition from the header comment
++
++    switch (ln)
++    {
++        default:  // ln == 0: 1 bit, 1 row
++            f[0] |= 1 << sh;
++            break;
++        case 1:  // 3 * 2: 2-bit mask on 2 rows
++            n = 3 << sh;
++            f[0] |= n;
++            f[stride] |= n;
++            break;
++        case 2:  // 0xf * 4: 4-bit mask on 4 rows
++            n = 0xf << sh;
++            f[0] |= n;
++            f[stride] |= n;
++            f[stride * 2] |= n;
++            f[stride * 3] |= n;
++            break;
++        case 3:  // 0xff * 8: whole byte on 8 rows
++            for (n = 0; n != 8; ++n, f += stride)
++                *f = 0xff;  // plain store: with 8-bit alignment sh == 0, so the block covers the entire byte
++            break;
++    }
++}
++
++static const ipe_init_info_t ipe_init_infos[9] = {  // Alloc for bit depths of 8-16
++   {  // 8
++      .luma =   {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
++      .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
++   },
++   {  // 9
++      .luma =   {0},
++      .chroma = {0}
++   },
++   {  // 10
++      .luma =   {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
++      .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
++   }
++
++};
++
++static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)  // split ipe's buffer into ici->n queues and set their QPU code
++{
++    const unsigned int n = ici->n;  // NOTE(review): n == 0 (e.g. a zeroed ipe_chan_info_t) divides by zero below - confirm callers reject it
++    const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3;  // Bytes per queue, rounded down to word
++
++    ipe->n = n;
++    ipe->max_fill = q1_size - ipe->min_gap;  // per-queue size less the min_gap reserve
++    for(unsigned int i = 0; i < n; i++) {
++        HEVCRpiInterPredQ * const q = ipe->q + i;
++        q->qpu_mc_curr = q->qpu_mc_base =
++            (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);  // queue i starts i * q1_size bytes into the buffer
++        q->code_setup = qpu_fn(ici->setup_fns[i]);
++        q->code_sync = qpu_fn(ici->sync_fns[i]);
++        q->code_exit = qpu_fn(ici->exit_fns[i]);
++    }
++}
++
++static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
++{
++    av_assert0(bit_depth >= 8 && bit_depth <= 16);
++
++    rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
++}
++
++// Unsigned Trivial MOD: a single conditional subtract, so it equals x % n only when x < 2n
++static inline unsigned int utmod(const unsigned int x, const unsigned int n)
++{
++    return x >= n ? x - n : x;
++}
++
++// Post-increments pq->job_n, wrapping at RPI_MAX_JOBS; returns the value before the increment
++static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
++{
++    unsigned int const x2 = pq->job_n;  // value handed back to the caller
++    pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);  // x2 + 1 wraps to 0 when it reaches RPI_MAX_JOBS
++    return x2;
++}
++
++static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
++{
++    pq->terminate = 0;
++    pq->job_n = 0;
++    pq->context = s;
++    pq->worker = worker;
++    pq->psem_out = psem_out;
++    pq->pass_n = n;
++    pq->started = 0;
++    sem_init(&pq->sem_in, 0, 0);
++}
++
++static void pass_queue_kill(HEVCRpiPassQueue * const pq)
++{
++    sem_destroy(&pq->sem_in);
++}
++
++static inline void rpi_sem_wait(sem_t * const sem)  // sem_wait() that retries until it succeeds
++{
++    while (sem_wait(sem) != 0) {
++        av_assert0(errno == EINTR);  // EINTR is the only failure tolerated; loop round and wait again
++    }
++}
++
++static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
++{
++    sem_post(&pq->sem_in);
++}
++
++static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    // Do the various passes - common with the worker code
++    for (unsigned int i = 0; i != RPI_PASSES; ++i) {
++        s->passq[i].worker(s, jb);
++    }
++}
++
++
++#if 0
++static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
++{
++    int x;
++    sem_getvalue((sem_t *)&jbc->sem_out, &x);
++    printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
++}
++#endif
++
++
++static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)  // get a job: local slot, then global free list, else queue lc and block
++{
++    HEVCRpiJob * jb;
++    HEVCRpiJobGlobal * const jbg = jbc->jbg;
++
++    pthread_mutex_lock(&jbg->lock);  // the global lock covers both the local and the global lists
++    // Check local 1st
++    if ((jb = jbc->jb1) != NULL)
++    {
++        // Only 1 local job slot - very easy :-)
++        jbc->jb1 = NULL;
++    }
++    else
++    {
++        // Now look at the global free chain
++        if ((jb = jbg->free1) != NULL)
++        {
++            // Found one - unlink it
++            jbg->free1 = jb->next;
++            jb->next = NULL;
++        }
++        else
++        {
++            // Out of places to look - wait for one to become free - add to Qs
++
++            // Global
++            // If "good" lc then add after the last "good" el in the chain
++            // otherwise add to the tail
++            if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
++            {
++                // Add to end: Q is empty, its tail is already "good", or this lc is not "good"
++                if ((lc->jw_prev = jbg->wait_tail) == NULL)
++                    jbg->wait_head = lc;
++                else
++                    lc->jw_prev->jw_next = lc;
++                lc->jw_next = NULL;
++                jbg->wait_tail = lc;
++            }
++            else
++            {
++                // This is a "good" lc that we need to poke into the middle
++                // of the Q
++                // We know that the Q isn't empty and there is at least one
++                // !last_progress_good el in it from the previous test
++
++                HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
++
++                if (p == NULL)
++                {
++                    // No current good els - add to head
++                    lc->jw_next = jbg->wait_head;
++                    jbg->wait_head = lc;
++                }
++                else
++                {
++                    lc->jw_next = p->jw_next;
++                    p->jw_next = lc;
++                }
++
++                lc->jw_next->jw_prev = lc;  // jw_next is non-NULL here: the tail el is not "good", so lc is never appended
++                lc->jw_prev = p;
++            }
++
++            // If "good" then we are now the last good waiting el
++            if (lc->last_progress_good)
++                jbg->wait_good = lc;
++
++            // Local: always append to the tail of this jbc's own wait list
++            if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
++                jbc->lcw_head = lc;
++            else
++                lc->ljw_prev->ljw_next = lc;
++            lc->ljw_next = NULL;
++            jbc->lcw_tail = lc;
++        }
++    }
++
++    pthread_mutex_unlock(&jbg->lock);
++
++    if (jb == NULL)  // Need to wait - lc was queued above
++    {
++        rpi_sem_wait(&lc->jw_sem);
++        jb = lc->jw_job;  // Set by job_free() before it posts jw_sem
++    }
++
++    return jb;
++}
++
++
++static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)  // release jb: hand it to a waiter if there is one, else park it
++{
++    HEVCRpiJobGlobal * const jbg = jbc0->jbg;  // This jbc only used to find jbg so we can get the lock
++    HEVCRpiJobCtl * jbc = jb->jbc_local;  // non-NULL if jb belongs to a specific jbc
++    HEVCRpiLocalContext * lc = NULL;  // waiter that will receive jb, if any
++
++    pthread_mutex_lock(&jbg->lock);
++
++    if (jbc != NULL)
++    {
++        av_assert1(jbc->jb1 == NULL);
++
++        // Release to Local if nothing waiting there
++        if ((lc = jbc->lcw_head) == NULL)
++            jbc->jb1 = jb;
++    }
++    else
++    {
++        // Release to global if nothing waiting there
++        if ((lc = jbg->wait_head) == NULL)
++        {
++            jb->next = jbg->free1;
++            jbg->free1 = jb;
++        }
++        else
++        {
++            // ? seems somehow mildly ugly... (need the waiter's jbc to unlink it from its local Q below)
++            jbc = lc->context->jbc;
++        }
++    }
++
++    if (lc != NULL)
++    {
++        // Something was waiting
++
++        // Unlink
++        // Global
++        if (lc->jw_next == NULL)
++            jbg->wait_tail = lc->jw_prev;
++        else
++            lc->jw_next->jw_prev = lc->jw_prev;
++
++        if (lc->jw_prev == NULL)
++            jbg->wait_head = lc->jw_next;
++        else
++            lc->jw_prev->jw_next = lc->jw_next;
++
++        // Local
++        if (lc->ljw_next == NULL)
++            jbc->lcw_tail = lc->ljw_prev;
++        else
++            lc->ljw_next->ljw_prev = lc->ljw_prev;
++
++        if (lc->ljw_prev == NULL)
++            jbc->lcw_head = lc->ljw_next;
++        else
++            lc->ljw_prev->ljw_next = lc->ljw_next;
++
++        // Update good if required
++        if (jbg->wait_good == lc)
++            jbg->wait_good = lc->jw_prev;
++
++        // Prod: publish the job, then wake the waiter blocked in job_alloc()
++        lc->jw_job = jb;
++        sem_post(&lc->jw_sem);
++    }
++
++    pthread_mutex_unlock(&jbg->lock);
++}
++// Destroy the job-wait semaphore of lc (counterpart of job_lc_init)
++static void job_lc_kill(HEVCRpiLocalContext * const lc)
++{
++    sem_destroy(&lc->jw_sem);
++}
++// Clear lc's wait-list links and pending job, and create its job-wait semaphore with a count of 0
++static void job_lc_init(HEVCRpiLocalContext * const lc)
++{
++    lc->jw_next = NULL;   // jw_*: links in the global wait list
++    lc->jw_prev = NULL;
++    lc->ljw_next = NULL;  // ljw_*: links in the local wait list
++    lc->ljw_prev = NULL;
++    lc->jw_job = NULL;
++    sem_init(&lc->jw_sem,  0, 0);
++}
++
++// Check whether job jb looks able to run without blocking. Returns:
++//  0 if we have waited for MV or expect to wait for recon
++//  1 if we haven't waited for MV & do not need to wait for recon
++static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
++{
++    if (jb->waited) // reset by rpi_begin
++        return 0;
++    for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
++    {
++        if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
++                ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
++            return 0;  // DPB[i] has not yet progressed as far as this job requires
++    }
++    return 1;
++}
++
++// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
++static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJobCtl *const jbc = s->jbc;
++    HEVCRpiJob * const jb = lc->jb0;
++
++    av_assert1(jb != NULL);
++
++    if (jb->ctu_ts_last < 0) {
++        return;  // Not full yet - keep filling this job
++    }
++
++    lc->last_progress_good = progress_good(s, jb);  // Remembered on lc; job_alloc uses it when lc has to queue
++    jb->waited = !lc->last_progress_good;
++    lc->jb0 = NULL;  // lc no longer owns the job
++
++    if (s->offload_recon)
++    {
++        pthread_mutex_lock(&jbc->in_lock);
++        jbc->offloadq[jbc->offload_in] = jb;
++        jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);  // presumably wraps the index at RPI_MAX_JOBS - utmod not visible here
++        pthread_mutex_unlock(&jbc->in_lock);
++
++        pass_queue_submit_job(s->passq + 0);  // Consumes job eventually
++    }
++    else
++    {
++        pass_queue_do_all(s, jb);  // Consumes job before return
++    }
++}
++
++
++// Call worker_pass0_ready to make sure lc has a job (lc->jb0) to fill,
++// waiting for one to become available if necessary.
++//
++// Now safe against multiple callers - needed for tiles
++// "normal" and WPP will only call here one at a time
++static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJobCtl * const jbc = s->jbc;
++
++    // It is legit for us to already have a job allocated - do nothing in this case
++    if (lc->jb0 != NULL)
++        return;
++
++    if (s->offload_recon)
++        rpi_sem_wait(&jbc->sem_out);  // This sem will stop this frame grabbing too much
++
++    lc->jb0 = job_alloc(jbc, lc);  // May block until a job is freed
++
++    rpi_begin(s, lc->jb0, lc->ts);
++}
++
++// Free up a job without submission
++static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJobCtl * const jbc = s->jbc;
++    HEVCRpiJob * const jb = lc->jb0;
++
++    if (jb == NULL) {
++        return;  // lc holds no job - nothing to do
++    }
++
++    lc->jb0 = NULL;
++
++    job_free(jbc, jb);
++
++    // If offload then poke sem_out too (worker_pass0_ready waited on it before allocating)
++    if (s->offload_recon) {
++        sem_post(&jbc->sem_out);
++    }
++}
++
++
++// Call this to wait for all jobs to have completed at the end of a frame
++// Slightly icky as there is no clean way to wait for a sem to count up
++// Not reentrant - call on main thread only
++static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
++{
++    HEVCRpiJobCtl * const jbc = s->jbc;
++    int i = 0;
++
++    // We shouldn't reach here with an unsubmitted job
++    av_assert1(lc->jb0 == NULL);
++
++    // If no offload then there can't be anything to wait for
++    if (!s->offload_recon) {
++        return;
++    }
++
++    if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)  // Skip the drain if all RPI_MAX_JOBS counts are already there
++    {
++        for (i = 0; i != RPI_MAX_JOBS; ++i) {
++            rpi_sem_wait(&jbc->sem_out);  // Take every count ...
++        }
++        for (i = 0; i != RPI_MAX_JOBS; ++i) {
++            sem_post(&jbc->sem_out);  // ... then put them all back
++        }
++    }
++}
++// Thread body for one pass queue (arg is its HEVCRpiPassQueue): run pq->worker per sem_in post until terminated
++static void * pass_worker(void *arg)
++{
++    HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
++    HEVCRpiContext *const s = pq->context;
++
++    for (;;)
++    {
++        rpi_sem_wait(&pq->sem_in);
++
++        if (pq->terminate)  // Set by pass_queues_term_all before it posts sem_in
++            break;
++
++        pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);
++        // * should really set jb->passes_done here
++
++        sem_post(pq->psem_out);
++    }
++    return NULL;
++}
++// Start one pass_worker thread for each of the RPI_PASSES pass queues; av_assert0 fires if pthread_create fails
++static void pass_queues_start_all(HEVCRpiContext *const s)
++{
++    unsigned int i;
++    HEVCRpiPassQueue * const pqs = s->passq;
++
++    for (i = 0; i != RPI_PASSES; ++i)
++    {
++        av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
++        pqs[i].started = 1;  // Tells pass_queues_term_all there is a thread to wake and join
++    }
++}
++// Stop all started pass threads: set terminate on every queue, wake each thread via sem_in, then join it
++static void pass_queues_term_all(HEVCRpiContext *const s)
++{
++    unsigned int i;
++    HEVCRpiPassQueue * const pqs = s->passq;
++
++    for (i = 0; i != RPI_PASSES; ++i)
++        pqs[i].terminate = 1;  // Flag every queue before any thread is woken
++    for (i = 0; i != RPI_PASSES; ++i)
++    {
++        if (pqs[i].started)
++            sem_post(&pqs[i].sem_in);  // Wake the thread so it sees terminate
++    }
++    for (i = 0; i != RPI_PASSES; ++i)
++    {
++        if (pqs[i].started) {
++            pthread_join(pqs[i].thread, NULL);
++            pqs[i].started = 0;
++        }
++    }
++}
++// Call pass_queue_kill on each of the RPI_PASSES pass queues
++static void pass_queues_kill_all(HEVCRpiContext *const s)
++{
++    unsigned int i;
++    HEVCRpiPassQueue * const pqs = s->passq;
++
++    for (i = 0; i != RPI_PASSES; ++i)
++        pass_queue_kill(pqs + i);
++}
++
++// Release the coefficient buffers of job jb (as set up by worker_pic_alloc_one) and zero its coeffs state
++static void worker_pic_free_one(HEVCRpiJob * const jb)
++{
++    // Free coeff stuff - allocation not the same for all buffers
++    HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++    if (cf->s[0].buf != NULL)  // s[0].buf is the aligned view of mptr, so free mptr itself
++        av_freep(&cf->mptr);
++    if (cf->s[2].buf != NULL)  // s[2].buf points into the gptr allocation
++        gpu_free(&cf->gptr);
++    memset(cf, 0, sizeof(*cf));
++}
++// Allocate jb's coefficient buffers: s[2]/s[3] from gpu_malloc_cached, s[0] 64-byte aligned from av_malloc; 0 or -1
++static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
++{
++    HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++
++    if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)  // reason for the extra 32*32 not visible here
++        goto fail;
++    cf->s[2].buf = (int16_t *)cf->gptr.arm;
++    cf->s[3].buf = cf->s[2].buf + coeff_count;  // rpi_alloc_coeff_buf fills buffer 3 downwards from here
++
++    // Must be 64 byte aligned for our zero zapping code so over-allocate &
++    // round
++    if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
++        goto fail;
++    cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
++    return 0;
++
++fail:
++    av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
++    worker_pic_free_one(jb);  // Frees whatever was allocated and zeroes cf
++    return -1;
++}
++// Mark all four coefficient buffers in cf as empty (buffer pointers are left untouched)
++static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
++{
++    unsigned int i;
++    for (i = 0; i != 4; ++i) {
++        cf->s[i].n = 0;
++#if RPI_COMPRESS_COEFFS        
++        cf->s[i].packed = 1;
++        cf->s[i].packed_n = 0;
++#endif
++    }
++}
++// Reserve n coefficients in buffer buf_no of jb and return their start; NOTE(review): no capacity check is made here
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
++{
++    HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
++    int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);  // Buffer 3 grows downwards from buf, the others upwards
++    cfe->n += n;
++    return coeffs;
++}
++// Block until the decode progress of ref for the given field reaches val; returns at once if it already has
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++                                     const HEVCRpiFrame * const ref, const int val, const int field)
++{
++    if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
++        HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;  // Context of the thread that owns (decodes) ref
++        HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
++        sem_t * sem = NULL;
++
++        av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++        if (((volatile int *)ref->tf.progress->data)[field] < val) {  // Re-check now that we hold the lock
++            HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
++
++            av_assert1(pwait->req == -1 && pwait->next == NULL);
++            jb->waited = 1;  // Remember that we had to wait for later scheduling
++
++            // Append our wait record to the owner's list
++            pwait->req = val;
++            pwait->next = NULL;
++            if (pstate->first == NULL)
++                pstate->first = pwait;
++            else
++                pstate->last->next = pwait;
++            pstate->last = pwait;
++            sem = &pwait->sem;
++        }
++        pthread_mutex_unlock(&pstate->lock);
++
++        if (sem != NULL) {
++            rpi_sem_wait(sem);  // Posted by ff_hevc_rpi_progress_signal_field once progress >= val
++        }
++    }
++}
++// Record progress val for field of the current frame (s->ref) and wake every waiter whose request is now met
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
++{
++    HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
++
++    ((int *)s->ref->tf.progress->data)[field] = val;  // Stored before the lock is taken
++
++    av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
++    {
++        HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
++        HEVCRpiFrameProgressWait * pwait;
++
++        while ((pwait = *ppwait) != NULL) {
++            if (pwait->req > val)
++            {
++                // Not satisfied yet - keep it; it is the last kept entry so far
++                ppwait = &pwait->next;
++                pstate->last = pwait;
++            }
++            else
++            {
++                *ppwait = pwait->next;  // Satisfied - unlink, reset and wake
++                pwait->req = -1;
++                pwait->next = NULL;
++                sem_post(&pwait->sem);
++            }
++        }
++    }
++    pthread_mutex_unlock(&pstate->lock);
++}
++// Initialise a progress state: empty waiter list and a default-attribute mutex
++static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
++{
++    pstate->first = NULL;
++    pstate->last = NULL;
++    pthread_mutex_init(&pstate->lock, NULL);
++}
++// Initialise a wait record as idle (req == -1, not linked) and create its semaphore with a count of 0
++static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++    pwait->req = -1;
++    pwait->next = NULL;
++    sem_init(&pwait->sem, 0, 0);
++}
++// Destroy a progress state's mutex; no waiter may still be queued
++static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
++{
++    av_assert1(pstate->first == NULL);
++    pthread_mutex_destroy(&pstate->lock);
++}
++// Destroy the semaphore of a wait record (counterpart of ff_hevc_rpi_progress_init_wait)
++static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
++{
++    sem_destroy(&pwait->sem);
++}
++
++
++/**
++ * NOTE: Each function hls_foo corresponds to the function foo in the
++ * specification (HLS stands for High Level Syntax).
++ */
++
++/**
++ * Section 5.7
++ */
++
++// Realloc the entry point arrays of sh to hold >= n entries (n == 0 frees them); returns 0 or AVERROR(ENOMEM)
++static int alloc_entry_points(RpiSliceHeader * const sh, const int n)
++{
++    if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0)
++    {
++        // Round up alloc to multiple of 32
++        int a = (n + 31) & ~31;
++
++        // We don't care about the previous contents so probably fastest to simply discard
++        av_freep(&sh->entry_point_offset);
++        av_freep(&sh->offset);
++        av_freep(&sh->size);
++
++        if (a != 0)
++        {
++            sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned));
++            sh->offset = av_malloc_array(a, sizeof(int));
++            sh->size = av_malloc_array(a, sizeof(int));
++
++            if (!sh->entry_point_offset || !sh->offset || !sh->size) {
++                sh->num_entry_point_offsets = 0;
++                sh->offsets_allocated = 0;  // Any array that was obtained is released by the av_freep calls of the next call
++                return AVERROR(ENOMEM);
++            }
++        }
++
++        sh->offsets_allocated = a;
++    }
++
++    return 0;
++}
++
++/* free everything allocated  by pic_arrays_init() and clear the pointers derived from those allocations */
++static void pic_arrays_free(HEVCRpiContext *s)
++{
++    av_freep(&s->sao);
++    av_freep(&s->deblock);
++
++    av_freep(&s->cabac_stash_up);
++    s->cabac_stash_left = NULL;  // freed with _up
++
++    av_freep(&s->mvf_up);
++    av_freep(&s->mvf_left);
++
++    av_freep(&s->is_pcm);
++    av_freep(&s->is_intra_store);
++    s->is_intra = NULL;  // not set in pic_arrays_init; presumably points into is_intra_store - confirm
++    av_freep(&s->rpl_tab);
++    s->rpl_tab_size = 0;
++
++    av_freep(&s->qp_y_tab);
++    av_freep(&s->tab_slice_address);
++    av_freep(&s->filter_slice_edges);
++
++    av_freep(&s->bs_horizontal);
++    s->bs_vertical = NULL;  // freed with H
++    av_freep(&s->bsf_stash_left);
++    av_freep(&s->bsf_stash_up);
++
++    av_freep(&s->rpl_up);
++    av_freep(&s->rpl_left);
++
++    alloc_entry_points(&s->sh, 0);  // n == 0 just frees the entry point arrays
++
++    av_buffer_pool_uninit(&s->col_mvf_pool);
++}
++
++/* allocate arrays that depend on frame dimensions; returns 0, or AVERROR(ENOMEM) after freeing everything */
++static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
++{
++    const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
++    const unsigned int width            = sps->width;
++    const unsigned int height           = sps->height;
++    const unsigned int pic_size_in_cb   = ((width  >> log2_min_cb_size) + 1) *
++                           ((height >> log2_min_cb_size) + 1);
++    const unsigned int ctb_count        = sps->ctb_size;
++
++    {
++        unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
++        unsigned int h = ((height + 15) & ~15);
++
++        s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
++        s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
++    }
++
++    s->sao           = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
++    s->deblock       = av_mallocz_array(ctb_count, sizeof(*s->deblock));
++    if (!s->sao || !s->deblock)
++        goto fail;
++
++    s->cabac_stash_up  = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3));
++    s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3);
++    if (s->cabac_stash_up == NULL)
++        goto fail;
++
++    // Round width up to max ctb size
++    s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
++    // * Only needed if we have H tiles
++    s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_left));
++
++    // We can overread by 1 line & one byte in deblock so alloc & zero
++    // We don't need to zero the extra @ start of frame as it will never be
++    // written
++    s->is_pcm   = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
++    s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
++    if (s->mvf_up == NULL || s->mvf_left == NULL || s->is_pcm == NULL || s->is_intra_store == NULL)
++        goto fail;
++
++    s->filter_slice_edges = av_mallocz(ctb_count);
++    s->tab_slice_address  = av_malloc_array(ctb_count,
++                                      sizeof(*s->tab_slice_address));
++    s->qp_y_tab           = av_malloc_array(pic_size_in_cb,
++                                      sizeof(*s->qp_y_tab));
++    if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
++        goto fail;
++
++    s->bs_horizontal = av_mallocz(s->bs_size * 2);  // One lump: horizontal half then vertical half
++    s->bs_vertical   = s->bs_horizontal + s->bs_size;
++    if (s->bs_horizontal == NULL)
++        goto fail;
++
++    s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
++    s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
++    if (s->rpl_left == NULL || s->rpl_up == NULL)
++        goto fail;
++
++    if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
++        (s->bsf_stash_up   = av_mallocz(((width + 63) & ~63) >> 4)) == NULL)
++        goto fail;
++
++    s->col_mvf_stride = (width + 15) >> 4;
++    s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
++                                          av_buffer_allocz);
++    if (s->col_mvf_pool == NULL)
++        goto fail;
++
++    return 0;
++
++fail:
++    pic_arrays_free(s);  // Also resets the derived pointers (cabac_stash_left, bs_vertical)
++    return AVERROR(ENOMEM);
++}
++// No explicit weights: zero both log2 denominators; every L0/L1 ref gets weight 1 << QPU_MC_DENOM and offset 0
++static void default_pred_weight_table(HEVCRpiContext * const s)
++{
++  unsigned int i;
++  const unsigned int wt = 1 << QPU_MC_DENOM;
++  s->sh.luma_log2_weight_denom = 0;
++  s->sh.chroma_log2_weight_denom = 0;
++  for (i = 0; i < s->sh.nb_refs[L0]; i++) {
++      s->sh.luma_weight_l0[i] = wt;
++      s->sh.luma_offset_l0[i] = 0;
++      s->sh.chroma_weight_l0[i][0] = wt;
++      s->sh.chroma_weight_l0[i][1] = wt;
++      s->sh.chroma_offset_l0[i][0] = 0;
++      s->sh.chroma_offset_l0[i][1] = 0;
++  }
++  for (i = 0; i < s->sh.nb_refs[L1]; i++) {
++      s->sh.luma_weight_l1[i] = wt;
++      s->sh.luma_offset_l1[i] = 0;
++      s->sh.chroma_weight_l1[i][0] = wt;
++      s->sh.chroma_weight_l1[i][1] = wt;
++      s->sh.chroma_offset_l1[i][0] = 0;
++      s->sh.chroma_offset_l1[i][1] = 0;
++  }
++}
++// Read weights/offsets for `refs` refs of one list, rescaled to denom QPU_MC_DENOM; 0 or AVERROR_INVALIDDATA
++static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb,
++                       const unsigned int refs,
++                       int16_t * luma_weight,   int16_t * luma_offset,
++                       int16_t * chroma_weight, int16_t * chroma_offset)
++{
++    unsigned int luma_flags;
++    unsigned int chroma_flags;
++    unsigned int i;
++    const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8);
++    const int wp_offset_half_range = s->ps.sps->wp_offset_half_range;
++    const unsigned int luma_weight_base    = 1 << QPU_MC_DENOM;
++    const unsigned int chroma_weight_base  = 1 << QPU_MC_DENOM;
++    const unsigned int luma_weight_shift   = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom);
++    const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom);
++
++    if (refs == 0)
++        return 0;
++
++    // One flag bit per ref, first ref in the most significant bit
++    luma_flags = get_bits(gb, refs);
++    chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs);
++    i = 1 << (refs - 1);
++
++    do
++    {
++        if ((luma_flags & i) != 0)
++        {
++            const int delta_weight = get_se_golomb(gb);
++            const int offset = get_se_golomb(gb);
++            if (delta_weight < -128 || delta_weight > 127 ||
++                offset < -wp_offset_half_range || offset >= wp_offset_half_range)
++            {
++                return AVERROR_INVALIDDATA;
++            }
++            // Values may be negative: scale by multiplication as << on a negative int is undefined
++            *luma_weight++ = luma_weight_base + delta_weight * (1 << luma_weight_shift);
++            *luma_offset++ = offset * (1 << wp_offset_bd_shift);
++        }
++        else
++        {
++            *luma_weight++ = luma_weight_base;
++            *luma_offset++ = 0;
++        }
++
++        if ((chroma_flags & i) != 0)
++        {
++            unsigned int j;
++            for (j = 0; j != 2; ++j)
++            {
++                const int delta_weight = get_se_golomb(gb);
++                const int delta_offset = get_se_golomb(gb);
++
++                if (delta_weight < -128 || delta_weight > 127 ||
++                    delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range)
++                {
++                    return AVERROR_INVALIDDATA;
++                }
++
++                *chroma_weight++ = chroma_weight_base + delta_weight * (1 << chroma_weight_shift);
++                *chroma_offset++ = av_clip(
++                    wp_offset_half_range + delta_offset -
++                        ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom),
++                    -wp_offset_half_range, wp_offset_half_range - 1) * (1 << wp_offset_bd_shift);
++            }
++        }
++        else
++        {
++            *chroma_weight++ = chroma_weight_base;
++            *chroma_weight++ = chroma_weight_base;
++            *chroma_offset++ = 0;
++            *chroma_offset++ = 0;
++        }
++    } while ((i >>= 1) != 0);
++
++    return 0;
++}
++// Parse the prediction weight table: both log2 denominators, then weights/offsets for L0 and L1; 0 or error
++static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
++{
++    int err;
++    const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb);
++    const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb);  // unsigned: a negative sum wraps and fails the > 7 test
++
++    if (luma_log2_weight_denom > 7 ||
++        chroma_log2_weight_denom > 7)
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n",
++               luma_log2_weight_denom, chroma_log2_weight_denom);
++        return AVERROR_INVALIDDATA;
++    }
++
++    s->sh.luma_log2_weight_denom = luma_log2_weight_denom;  // get_weights reads these from s->sh
++    s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom;
++
++    if ((err = get_weights(s, gb, s->sh.nb_refs[L0],
++                s->sh.luma_weight_l0,      s->sh.luma_offset_l0,
++                s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 ||
++        (err = get_weights(s, gb, s->sh.nb_refs[L1],
++                s->sh.luma_weight_l1,      s->sh.luma_offset_l1,
++                s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0)
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n");
++        return err;
++    }
++
++    return 0;
++}
++// Parse the long-term reference picture set of a slice header into rps; returns 0 or AVERROR_INVALIDDATA
++static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
++{
++    const HEVCRpiSPS *sps = s->ps.sps;
++    int max_poc_lsb    = 1 << sps->log2_max_poc_lsb;
++    int prev_delta_msb = 0;
++    unsigned int nb_sps = 0, nb_sh;  // Entries taken from the SPS tables / coded directly in the slice header
++    int i;
++
++    rps->nb_refs = 0;
++    if (!sps->long_term_ref_pics_present_flag)
++        return 0;
++
++    if (sps->num_long_term_ref_pics_sps > 0)
++        nb_sps = get_ue_golomb_long(gb);
++    nb_sh = get_ue_golomb_long(gb);
++
++    if (nb_sps > sps->num_long_term_ref_pics_sps)
++        return AVERROR_INVALIDDATA;
++    if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))  // 64-bit sum so the addition cannot wrap
++        return AVERROR_INVALIDDATA;
++
++    rps->nb_refs = nb_sh + nb_sps;
++
++    for (i = 0; i < rps->nb_refs; i++) {
++        uint8_t delta_poc_msb_present;
++
++        if (i < nb_sps) {
++            uint8_t lt_idx_sps = 0;
++
++            if (sps->num_long_term_ref_pics_sps > 1)
++                lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
++
++            rps->poc[i]  = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
++            rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
++        } else {
++            rps->poc[i]  = get_bits(gb, sps->log2_max_poc_lsb);
++            rps->used[i] = get_bits1(gb);
++        }
++
++        delta_poc_msb_present = get_bits1(gb);
++        if (delta_poc_msb_present) {
++            int64_t delta = get_ue_golomb_long(gb);
++            int64_t poc;
++
++            if (i && i != nb_sps)  // Deltas accumulate, restarting at the first entry of each of the two groups
++                delta += prev_delta_msb;
++
++            poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
++            if (poc != (int32_t)poc)  // Reject a POC that does not fit in 32 bits
++                return AVERROR_INVALIDDATA;
++            rps->poc[i] = poc;
++            prev_delta_msb = delta;
++        }
++    }
++
++    return 0;
++}
++// Copy stream parameters (size, profile/level, SAR, colour info, frame rate) from sps and its VPS into avctx
++static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
++                                 const HEVCRpiSPS *sps)
++{
++    const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
++    const HEVCRpiWindow *ow = &sps->output_window;
++    unsigned int num = 0, den = 0;
++
++    avctx->pix_fmt             = sps->pix_fmt;
++    avctx->coded_width         = sps->width;
++    avctx->coded_height        = sps->height;
++    avctx->width               = sps->width  - ow->left_offset - ow->right_offset;
++    avctx->height              = sps->height - ow->top_offset  - ow->bottom_offset;
++    avctx->has_b_frames        = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
++    avctx->profile             = sps->ptl.general_ptl.profile_idc;
++    avctx->level               = sps->ptl.general_ptl.level_idc;
++
++    ff_set_sar(avctx, sps->vui.sar);
++
++    if (sps->vui.video_signal_type_present_flag)
++        avctx->color_range = sps->vui.video_full_range_flag ? AVCOL_RANGE_JPEG
++                                                            : AVCOL_RANGE_MPEG;
++    else
++        avctx->color_range = AVCOL_RANGE_MPEG;
++
++    if (sps->vui.colour_description_present_flag) {
++        avctx->color_primaries = sps->vui.colour_primaries;
++        avctx->color_trc       = sps->vui.transfer_characteristic;
++        avctx->colorspace      = sps->vui.matrix_coeffs;
++    } else {
++        avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
++        avctx->color_trc       = AVCOL_TRC_UNSPECIFIED;
++        avctx->colorspace      = AVCOL_SPC_UNSPECIFIED;
++    }
++
++    if (vps->vps_timing_info_present_flag) {  // VPS timing info takes precedence over the VUI's
++        num = vps->vps_num_units_in_tick;
++        den = vps->vps_time_scale;
++    } else if (sps->vui.vui_timing_info_present_flag) {
++        num = sps->vui.vui_num_units_in_tick;
++        den = sps->vui.vui_time_scale;
++    }
++
++    if (num != 0 && den != 0)  // framerate = time_scale / units_in_tick, hence den/num are passed swapped
++        av_reduce(&avctx->framerate.den, &avctx->framerate.num,
++                  num, den, 1 << 30);
++}
++// Offer only the SPS's own pixel format to ff_thread_get_format and return the result (a negative value is treated as an error by the caller)
++static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
++{
++    enum AVPixelFormat pix_fmts[4], *fmt = pix_fmts;
++
++    // Admit to no h/w formats
++
++    *fmt++ = sps->pix_fmt;
++    *fmt = AV_PIX_FMT_NONE;  // List terminator
++
++    return pix_fmts[0] == AV_PIX_FMT_NONE ? AV_PIX_FMT_NONE: ff_thread_get_format(s->avctx, pix_fmts);
++}
++// Non-zero if this decoder can handle sps: sand pixel format, size within HEVC_RPI_MAX_WIDTH x HEVC_RPI_MAX_HEIGHT
++static int is_sps_supported(const HEVCRpiSPS * const sps)
++{
++    return av_rpi_is_sand_format(sps->pix_fmt) &&
++           sps->width <= HEVC_RPI_MAX_WIDTH &&
++           sps->height <= HEVC_RPI_MAX_HEIGHT;
++}
++// Switch the decoder to sps (NULL just drops the current one): rebuild arrays, DSP fns & SAO buffers; 0 or AVERROR
++static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
++                   const enum AVPixelFormat pix_fmt)
++{
++    int ret;
++
++    pic_arrays_free(s);
++    s->ps.sps = NULL;
++    s->ps.vps = NULL;
++
++    if (sps == NULL)
++        return 0;
++
++    if (!is_sps_supported(sps))
++        return AVERROR_DECODER_NOT_FOUND;
++
++    ret = pic_arrays_init(s, sps);
++    if (ret < 0)
++        goto fail;
++
++    export_stream_params(s->avctx, &s->ps, sps);
++
++    s->avctx->pix_fmt = pix_fmt;  // Overrides the pix_fmt just set by export_stream_params
++
++    ff_hevc_rpi_pred_init(&s->hpc,     sps->bit_depth);
++    ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
++
++    // * We don't support cross_component_prediction_enabled_flag but as that
++    //   must be 0 unless we have 4:4:4 there is no point testing for it as we
++    //   only deal with sand which is never 4:4:4  [support wouldn't be hard]
++
++    rpi_hevc_qpu_set_fns(s, sps->bit_depth);
++
++    av_freep(&s->sao_pixel_buffer_h[0]);
++    av_freep(&s->sao_pixel_buffer_v[0]);
++
++    if (sps->sao_enabled)
++    {
++        const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;  // NOTE(review): s->ps.sps is still NULL here - confirm ctx_*() never read it
++        unsigned int c_idx;
++        size_t hsize[3] = {0}, vsize[3] = {0};
++
++        for(c_idx = 0; c_idx < c_count; c_idx++) {
++            int w = sps->width >> ctx_hshift(s, c_idx);
++            int h = sps->height >> ctx_vshift(s, c_idx);
++            // ctb height & width are a min of 8 so this must be a multiple of 16 - no point rounding up!
++            hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
++            vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
++        }
++
++        // One lump per direction so h[1] & v[1] can extend into h[2] & v[2] when we have plaited chroma
++        s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
++        s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
++        if (s->sao_pixel_buffer_h[0] == NULL || s->sao_pixel_buffer_v[0] == NULL) {
++            ret = AVERROR(ENOMEM);  // Fail here rather than leave a NULL SAO buffer in place
++            goto fail;
++        }
++        s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
++        s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
++        s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
++        s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
++    }
++
++    s->ps.sps = sps;
++    s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
++
++    return 0;
++
++fail:
++    pic_arrays_free(s);
++    s->ps.sps = NULL;
++    return ret;
++}
++// Non-zero if qp_offset lies in the inclusive range [-12, 12]
++static inline int qp_offset_valid(const int qp_offset)
++{
++    return qp_offset >= -12 && qp_offset <= 12;
++}
++
++static int hls_slice_header(HEVCRpiContext * const s)
++{
++    GetBitContext * const gb = &s->HEVClc->gb;
++    RpiSliceHeader * const sh   = &s->sh;
++    int i, ret;
++
++    // Coded parameters
++    sh->first_slice_in_pic_flag = get_bits1(gb);
++    if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra     = INT_MAX;
++        if (IS_IDR(s))
++            ff_hevc_rpi_clear_refs(s);
++    }
++    sh->no_output_of_prior_pics_flag = 0;
++    if (IS_IRAP(s))
++        sh->no_output_of_prior_pics_flag = get_bits1(gb);
++
++    sh->pps_id = get_ue_golomb_long(gb);
++    if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
++        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
++        return AVERROR_INVALIDDATA;
++    }
++    if (!sh->first_slice_in_pic_flag &&
++        s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
++        av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
++        return AVERROR_INVALIDDATA;
++    }
++    s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
++    if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
++        sh->no_output_of_prior_pics_flag = 1;
++
++    if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
++        const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
++        const HEVCRpiSPS *last_sps = s->ps.sps;
++        enum AVPixelFormat pix_fmt;
++
++        if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
++            if (sps->width != last_sps->width || sps->height != last_sps->height ||
++                sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
++                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
++                sh->no_output_of_prior_pics_flag = 0;
++        }
++        ff_hevc_rpi_clear_refs(s);
++
++        ret = set_sps(s, sps, sps->pix_fmt);
++        if (ret < 0)
++            return ret;
++
++        pix_fmt = get_format(s, sps);
++        if (pix_fmt < 0)
++            return pix_fmt;
++
++//        ret = set_sps(s, sps, pix_fmt);
++//        if (ret < 0)
++//            return ret;
++
++        s->avctx->pix_fmt = pix_fmt;
++
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra     = INT_MAX;
++    }
++
++    sh->dependent_slice_segment_flag = 0;
++    if (!sh->first_slice_in_pic_flag) {
++        int slice_address_length;
++
++        if (s->ps.pps->dependent_slice_segments_enabled_flag)
++            sh->dependent_slice_segment_flag = get_bits1(gb);
++
++        slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
++        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
++        if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
++            av_log(s->avctx, AV_LOG_ERROR,
++                   "Invalid slice segment address: %u.\n",
++                   sh->slice_segment_addr);
++            return AVERROR_INVALIDDATA;
++        }
++
++        if (!sh->dependent_slice_segment_flag) {
++            sh->slice_addr = sh->slice_segment_addr;
++            s->slice_idx++;
++        }
++    } else {
++        sh->slice_segment_addr = sh->slice_addr = 0;
++        s->slice_idx           = 0;
++        s->slice_initialized   = 0;
++    }
++
++    if (!sh->dependent_slice_segment_flag) {
++        s->slice_initialized = 0;
++
++        for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
++            skip_bits(gb, 1);  // slice_reserved_undetermined_flag[]
++
++        sh->slice_type = get_ue_golomb_long(gb);
++        if (!(sh->slice_type == HEVC_SLICE_I ||
++              sh->slice_type == HEVC_SLICE_P ||
++              sh->slice_type == HEVC_SLICE_B)) {
++            av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
++                   sh->slice_type);
++            return AVERROR_INVALIDDATA;
++        }
++        if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
++            av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
++            return AVERROR_INVALIDDATA;
++        }
++
++        // when flag is not present, picture is inferred to be output
++        sh->pic_output_flag = 1;
++        if (s->ps.pps->output_flag_present_flag)
++            sh->pic_output_flag = get_bits1(gb);
++
++        if (s->ps.sps->separate_colour_plane_flag)
++            sh->colour_plane_id = get_bits(gb, 2);
++
++        if (!IS_IDR(s)) {
++            int poc, pos;
++
++            sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
++            poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
++            if (!sh->first_slice_in_pic_flag && poc != s->poc) {
++                av_log(s->avctx, AV_LOG_WARNING,
++                       "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
++                if (s->avctx->err_recognition & AV_EF_EXPLODE)
++                    return AVERROR_INVALIDDATA;
++                poc = s->poc;
++            }
++            s->poc = poc;
++
++            sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
++            pos = get_bits_left(gb);
++            if (!sh->short_term_ref_pic_set_sps_flag) {
++                ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
++                if (ret < 0)
++                    return ret;
++
++                sh->short_term_rps = &sh->slice_rps;
++            } else {
++                int numbits, rps_idx;
++
++                if (!s->ps.sps->nb_st_rps) {
++                    av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
++                    return AVERROR_INVALIDDATA;
++                }
++
++                numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
++                rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
++                sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
++            }
++            sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++            pos = get_bits_left(gb);
++            ret = decode_lt_rps(s, &sh->long_term_rps, gb);
++            if (ret < 0) {
++                av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
++                if (s->avctx->err_recognition & AV_EF_EXPLODE)
++                    return AVERROR_INVALIDDATA;
++            }
++            sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
++
++            if (s->ps.sps->sps_temporal_mvp_enabled_flag)
++                sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
++            else
++                sh->slice_temporal_mvp_enabled_flag = 0;
++        } else {
++            s->sh.short_term_rps = NULL;
++            s->poc               = 0;
++        }
++
++        /* 8.3.1 */
++        if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
++            s->nal_unit_type != HEVC_NAL_TRAIL_N &&
++            s->nal_unit_type != HEVC_NAL_TSA_N   &&
++            s->nal_unit_type != HEVC_NAL_STSA_N  &&
++            s->nal_unit_type != HEVC_NAL_RADL_N  &&
++            s->nal_unit_type != HEVC_NAL_RADL_R  &&
++            s->nal_unit_type != HEVC_NAL_RASL_N  &&
++            s->nal_unit_type != HEVC_NAL_RASL_R)
++            s->pocTid0 = s->poc;
++
++        if (s->ps.sps->sao_enabled) {
++            sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
++            if (ctx_cfmt(s) != 0) {
++                sh->slice_sample_adaptive_offset_flag[1] =
++                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
++            }
++        } else {
++            sh->slice_sample_adaptive_offset_flag[0] = 0;
++            sh->slice_sample_adaptive_offset_flag[1] = 0;
++            sh->slice_sample_adaptive_offset_flag[2] = 0;
++        }
++
++        sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
++        if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
++            int nb_refs;
++
++            sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
++            if (sh->slice_type == HEVC_SLICE_B)
++                sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
++
++            if (get_bits1(gb)) { // num_ref_idx_active_override_flag
++                sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
++                if (sh->slice_type == HEVC_SLICE_B)
++                    sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
++            }
++            if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
++                av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
++                       sh->nb_refs[L0], sh->nb_refs[L1]);
++                return AVERROR_INVALIDDATA;
++            }
++
++            sh->rpl_modification_flag[0] = 0;
++            sh->rpl_modification_flag[1] = 0;
++            nb_refs = ff_hevc_rpi_frame_nb_refs(s);
++            if (!nb_refs) {
++                av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
++                return AVERROR_INVALIDDATA;
++            }
++
++            if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
++                sh->rpl_modification_flag[0] = get_bits1(gb);
++                if (sh->rpl_modification_flag[0]) {
++                    for (i = 0; i < sh->nb_refs[L0]; i++)
++                        sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
++                }
++
++                if (sh->slice_type == HEVC_SLICE_B) {
++                    sh->rpl_modification_flag[1] = get_bits1(gb);
++                    if (sh->rpl_modification_flag[1] == 1)
++                        for (i = 0; i < sh->nb_refs[L1]; i++)
++                            sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
++                }
++            }
++
++            if (sh->slice_type == HEVC_SLICE_B)
++                sh->mvd_l1_zero_flag = get_bits1(gb);
++
++            if (s->ps.pps->cabac_init_present_flag)
++                sh->cabac_init_flag = get_bits1(gb);
++            else
++                sh->cabac_init_flag = 0;
++
++            sh->collocated_ref_idx = 0;
++            if (sh->slice_temporal_mvp_enabled_flag) {
++                sh->collocated_list = L0;
++                if (sh->slice_type == HEVC_SLICE_B)
++                    sh->collocated_list = !get_bits1(gb);
++
++                if (sh->nb_refs[sh->collocated_list] > 1) {
++                    sh->collocated_ref_idx = get_ue_golomb_long(gb);
++                    if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
++                        av_log(s->avctx, AV_LOG_ERROR,
++                               "Invalid collocated_ref_idx: %d.\n",
++                               sh->collocated_ref_idx);
++                        return AVERROR_INVALIDDATA;
++                    }
++                }
++            }
++
++            if ((s->ps.pps->weighted_pred_flag   && sh->slice_type == HEVC_SLICE_P) ||
++                (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
++            {
++                if ((ret = pred_weight_table(s, gb)) != 0)
++                    return ret;
++            }
++            else
++            {
++                // Give us unit weights
++                default_pred_weight_table(s);
++            }
++
++            sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
++            if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
++                av_log(s->avctx, AV_LOG_ERROR,
++                       "Invalid number of merging MVP candidates: %d.\n",
++                       sh->max_num_merge_cand);
++                return AVERROR_INVALIDDATA;
++            }
++        }
++
++        sh->slice_qp_delta = get_se_golomb(gb);
++
++        if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
++            sh->slice_cb_qp_offset = get_se_golomb(gb);
++            sh->slice_cr_qp_offset = get_se_golomb(gb);
++            if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
++                !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
++                !qp_offset_valid(sh->slice_cr_qp_offset) ||
++                !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
++            {
++                av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d\n",
++                       sh->slice_cr_qp_offset, sh->slice_cr_qp_offset,
++                       s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset);
++                return AVERROR_INVALIDDATA;
++            }
++        } else
++        {
++            sh->slice_cb_qp_offset = 0;
++            sh->slice_cr_qp_offset = 0;
++        }
++
++        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
++            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
++        else
++            sh->cu_chroma_qp_offset_enabled_flag = 0;
++
++        if (s->ps.pps->deblocking_filter_control_present_flag) {
++            int deblocking_filter_override_flag = 0;
++
++            if (s->ps.pps->deblocking_filter_override_enabled_flag)
++                deblocking_filter_override_flag = get_bits1(gb);
++
++            if (deblocking_filter_override_flag) {
++                sh->disable_deblocking_filter_flag = get_bits1(gb);
++                if (!sh->disable_deblocking_filter_flag) {
++                    int beta_offset_div2 = get_se_golomb(gb);
++                    int tc_offset_div2   = get_se_golomb(gb) ;
++                    if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
++                        tc_offset_div2   < -6 || tc_offset_div2   > 6) {
++                        av_log(s->avctx, AV_LOG_ERROR,
++                            "Invalid deblock filter offsets: %d, %d\n",
++                            beta_offset_div2, tc_offset_div2);
++                        return AVERROR_INVALIDDATA;
++                    }
++                    sh->beta_offset = beta_offset_div2 * 2;
++                    sh->tc_offset   =   tc_offset_div2 * 2;
++                }
++            } else {
++                sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
++                sh->beta_offset                    = s->ps.pps->beta_offset;
++                sh->tc_offset                      = s->ps.pps->tc_offset;
++            }
++        } else {
++            sh->disable_deblocking_filter_flag = 0;
++            sh->beta_offset                    = 0;
++            sh->tc_offset                      = 0;
++        }
++
++        if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
++            (sh->slice_sample_adaptive_offset_flag[0] ||
++             sh->slice_sample_adaptive_offset_flag[1] ||
++             !sh->disable_deblocking_filter_flag)) {
++            sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
++        } else {
++            sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
++        }
++        sh->no_dblk_boundary_flags =
++            (sh->slice_loop_filter_across_slices_enabled_flag ? 0 :
++                BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) |
++            (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 :
++                BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE);
++
++
++    } else if (!s->slice_initialized) {
++        av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    sh->num_entry_point_offsets = 0;
++    sh->offload_wpp = 0;
++    sh->offload_tiles = 0;
++
++    if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
++        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
++        // It would be possible to bound this tighter but this here is simpler
++        if (num_entry_point_offsets > get_bits_left(gb)) {
++            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %d is invalid\n", num_entry_point_offsets);
++            return AVERROR_INVALIDDATA;
++        }
++
++        sh->num_entry_point_offsets = num_entry_point_offsets;
++        if (sh->num_entry_point_offsets > 0) {
++            int offset_len = get_ue_golomb_long(gb) + 1;
++
++            if (offset_len < 1 || offset_len > 32) {
++                sh->num_entry_point_offsets = 0;
++                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
++                return AVERROR_INVALIDDATA;
++            }
++
++            if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0)
++            {
++                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
++                return ret;
++            }
++
++            for (i = 0; i < sh->num_entry_point_offsets; i++) {
++                uint32_t val_minus1 = get_bits_long(gb, offset_len);
++                if (val_minus1 > (1 << 28))
++                {
++                    // We can declare offsets of > 2^28 bad without loss of generality
++                    // Will check actual bounds wrt NAL later, but this keeps
++                    // the values within bounds we can deal with easily
++                    av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %d invalid\n", val_minus1);
++                    return AVERROR_INVALIDDATA;
++                }
++                sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
++            }
++
++            // Do we want to offload this
++            if (s->threads_type != 0)
++            {
++                sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
++                    s->ps.pps->num_tile_columns > 1;
++                // * We only cope with WPP in a single column
++                //   Probably want to deal with that case as tiles rather than WPP anyway
++                // ?? Not actually sure that the main code deals with WPP + multi-col correctly
++                sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
++                    s->ps.pps->num_tile_columns == 1;
++            }
++        }
++    }
++
++    if (s->ps.pps->slice_header_extension_present_flag) {
++        unsigned int length = get_ue_golomb_long(gb);
++        if (length*8LL > get_bits_left(gb)) {
++            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
++            return AVERROR_INVALIDDATA;
++        }
++        for (i = 0; i < length; i++)
++            skip_bits(gb, 8);  // slice_header_extension_data_byte
++    }
++
++    // Inferred parameters
++    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
++    if (sh->slice_qp > 51 ||
++        sh->slice_qp < -s->ps.sps->qp_bd_offset) {
++        av_log(s->avctx, AV_LOG_ERROR,
++               "The slice_qp %d is outside the valid range "
++               "[%d, 51].\n",
++               sh->slice_qp,
++               -s->ps.sps->qp_bd_offset);
++        return AVERROR_INVALIDDATA;
++    }
++
++    if (get_bits_left(gb) < 0) {
++        av_log(s->avctx, AV_LOG_ERROR,
++               "Overread slice header by %d bits\n", -get_bits_left(gb));
++        return AVERROR_INVALIDDATA;
++    }
++
++    s->slice_initialized = 1;
++    return 0;
++}
++// Decode the SAO parameters of the CTB at (rx, ry) into s->sao[rx + ry * ctb_width].
++static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
++{
++    RpiSAOParams * const sao = s->sao + rx + ry * s->ps.sps->ctb_width;
++    int c_idx, i;
++
++    if (s->sh.slice_sample_adaptive_offset_flag[0] ||
++        s->sh.slice_sample_adaptive_offset_flag[1]) {
++        if ((lc->ctb_avail & AVAIL_L) != 0)  // merge-left flag is only decoded when AVAIL_L is set
++        {
++            const int sao_merge_left_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++            if (sao_merge_left_flag) {
++                *sao = sao[-1];  // take the previous entry (rx - 1) wholesale
++                return;
++            }
++        }
++        if ((lc->ctb_avail & AVAIL_U) != 0)  // merge-up flag is only decoded when AVAIL_U is set
++        {
++            const int sao_merge_up_flag = ff_hevc_rpi_sao_merge_flag_decode(lc);
++            if (sao_merge_up_flag) {
++                *sao = sao[-(int)s->ps.sps->ctb_width];  // take the entry one CTB row up (ry - 1) wholesale
++                return;
++            }
++        }
++    }
++
++    for (c_idx = 0; c_idx < (ctx_cfmt(s) != 0 ? 3 : 1); c_idx++) {  // only index 0 when ctx_cfmt(s) == 0
++        const unsigned int log2_sao_offset_scale = c_idx == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
++                                                 s->ps.pps->log2_sao_offset_scale_chroma;
++        int offset_abs[4];
++        char offset_sign[4] = {0};  // stays 0 unless a band-offset sign is decoded below
++
++        if (!s->sh.slice_sample_adaptive_offset_flag[c_idx]) {
++            sao->type_idx[c_idx] = SAO_NOT_APPLIED;
++            continue;
++        }
++
++        if (c_idx == 2) {  // the second chroma plane reuses the type and EO class of index 1
++            sao->type_idx[2] = sao->type_idx[1];
++            sao->eo_class[2] = sao->eo_class[1];
++        } else {
++            sao->type_idx[c_idx] = ff_hevc_rpi_sao_type_idx_decode(lc);
++        }
++
++        // ** Could use BY22 here quite plausibly - this is all bypass stuff
++        //    though only per CTB so not very timing critical
++
++        if (sao->type_idx[c_idx] == SAO_NOT_APPLIED)
++            continue;
++
++        for (i = 0; i < 4; i++)
++            offset_abs[i] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
++
++        if (sao->type_idx[c_idx] == SAO_BAND) {
++            for (i = 0; i < 4; i++) {
++                if (offset_abs[i] != 0)  // a sign is only decoded for non-zero offsets
++                    offset_sign[i] = ff_hevc_rpi_sao_offset_sign_decode(lc);
++            }
++            sao->band_position[c_idx] = ff_hevc_rpi_sao_band_position_decode(lc);
++        } else if (c_idx != 2) {  // non-band type: EO class is decoded for indices 0 and 1 only
++            sao->eo_class[c_idx] = ff_hevc_rpi_sao_eo_class_decode(lc);
++        }
++
++        // Inferred parameters: offset_val[][0] is always 0, entries 1..4 hold the scaled, signed offsets
++        sao->offset_val[c_idx][0] = 0;
++        for (i = 0; i < 4; i++) {
++            sao->offset_val[c_idx][i + 1] = offset_abs[i] << log2_sao_offset_scale;
++            if (sao->type_idx[c_idx] == SAO_EDGE) {
++                if (i > 1)  // edge offsets: the last two of the four are negated
++                    sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++            } else if (offset_sign[i]) {
++                sao->offset_val[c_idx][i + 1] = -sao->offset_val[c_idx][i + 1];
++            }
++        }
++    }
++}
++
++#if 0  // Compiled out: this function is never built
++static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
++    int log2_res_scale_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx);  // 0..4
++
++    if (log2_res_scale_abs_plus1 !=  0) {
++        int res_scale_sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
++        lc->tu.res_scale_val = (1 << (log2_res_scale_abs_plus1 - 1)) *
++                               (1 - 2 * res_scale_sign_flag);  // magnitude 2^(n - 1); sign flag 1 makes it negative
++    } else {
++        lc->tu.res_scale_val = 0;
++    }
++
++
++    return 0;
++}
++#endif
++// Return the command slot at index jb->intra.n and advance n; no capacity check is made here.
++static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
++{
++    return jb->intra.cmds + jb->intra.n++;
++}
++// tb_flags[] builder: A0 writes one entry at index x + y*16 holding the AVAIL_* bits whose argument is non-zero.
++#define A0(x, y, U, L, UL, UR, DL) \
++    [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0))
++// A1..A4: each fills a square as four quadrants of the next smaller macro, listed top-left, top-right, bottom-left, bottom-right.
++#define A1(x, y, U, L, UL, UR, DL) \
++    A0((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A0((x) + 1, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
++    A0((x) + 0, (y) + 1,  1,   (L),  (L),   1,   (DL)),  A0((x) + 1, (y) + 1,  1,    1,    1,    0,    0  )
++
++#define A2(x, y, U, L, UL, UR, DL) \
++    A1((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A1((x) + 2, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
++    A1((x) + 0, (y) + 2,  1,   (L),  (L),   1,   (DL)),  A1((x) + 2, (y) + 2,  1,    1,    1,    0,    0  )
++
++#define A3(x, y, U, L, UL, UR, DL) \
++    A2((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A2((x) + 4, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
++    A2((x) + 0, (y) + 4,  1,   (L),  (L),   1,   (DL)),  A2((x) + 4, (y) + 4,  1,    1,    1,    0,    0  )
++
++#define A4(x, y, U, L, UL, UR, DL) \
++    A3((x) + 0, (y) + 0, (U),  (L),  (UL), (U),  (L) ),  A3((x) + 8, (y) + 0, (U),   1,   (U),  (UR),  0  ),\
++    A3((x) + 0, (y) + 8,  1,   (L),  (L),   1,   (DL)),  A3((x) + 8, (y) + 8,  1,    1,    1,    0,    0  )
++// 16x16 table (looked up below in 4-pixel steps); the outermost call passes 0 for all five neighbour flags.
++static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
++// Compute the AVAIL_* neighbour flags for the w x h block at (x, y), combining lc->ctb_avail with tb_flags[].
++unsigned int ff_hevc_rpi_tb_avail_flags(
++    const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++    const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
++{
++    const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size;
++    const unsigned int tb_x = x & ~ctb_mask;  // x offset within the CTB
++    const unsigned int tb_y = y & ~ctb_mask;  // y offset within the CTB
++    const unsigned int ctb_avail = lc->ctb_avail;
++
++    const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;  // entry for the block's top-left 4x4
++
++    unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
++
++    // This deals with both the U & L edges
++    if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0)
++        f |= AVAIL_UL;
++    // UR: on the CTB's top row take the CTB's U bit shifted into the UR position, else the entry of the block's last column
++    if (x + w < lc->end_of_ctb_x)
++        f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR;
++    else if (tb_y == 0)  // block reaches end_of_ctb_x on the CTB's top row: use the CTB's own UR bit
++        f |= (ctb_avail & AVAIL_UR);
++#if AVAIL_S_U - AVAIL_S_UR < 0
++#error Shift problem
++#endif
++    // DL: on the CTB's left column take the CTB's L bit shifted into the DL position, else the entry of the block's last row
++    // Never any D if Y beyond eoctb
++    if (y + h < lc->end_of_ctb_y)
++        f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL;
++#if AVAIL_S_DL - AVAIL_S_L < 0
++#error Shift problem
++#endif
++
++//    printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
++//           lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
++//           lc->end_of_ctb_x, lc->end_of_ctb_y);
++
++    return f;
++}
++
++#undef A0  // A0..A4 are only used to build tb_flags[]
++#undef A1
++#undef A2
++#undef A3
++#undef A4
++// Queue an intra-prediction command on lc->jb0 for the block at (x0, y0); does nothing when c_idx > 1.
++static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
++                          unsigned int avail)
++{
++    // Sand format: U & V are both done on the c_idx == 1 call, so c_idx == 2 queues nothing
++    if (c_idx <= 1)
++    {
++        HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
++        cmd->type = RPI_PRED_INTRA + c_idx;  // command type is offset by the component index
++        cmd->size = log2_trafo_size;  // stored as a log2 size
++        cmd->avail = avail;
++        cmd->i_pred.x = x0;
++        cmd->i_pred.y = y0;
++        cmd->i_pred.mode = c_idx ? lc->tu.intra_pred_mode_c :  lc->tu.intra_pred_mode;  // chroma mode for c_idx 1, luma mode for 0
++
++//        printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail);
++    }
++}
++// Bit positions (_S) and masks of the chroma coded-block flags carried in cbf_chroma / cbf_c0 / cbf_c1.
++#define CBF_CB0_S 0
++#define CBF_CB1_S 1 // CB1 must be CB0 + 1
++#define CBF_CR0_S 2
++#define CBF_CR1_S 3
++
++#define CBF_CB0 (1 << CBF_CB0_S)
++#define CBF_CR0 (1 << CBF_CR0_S)
++#define CBF_CB1 (1 << CBF_CB1_S) // the *1 flags are only decoded in hls_transform_tree when ctx_cfmt(s) == 2
++#define CBF_CR1 (1 << CBF_CR1_S)
++// Handle one transform unit: queue intra prediction, parse pending QP adjustments, decode residuals; returns 0 or AVERROR_INVALIDDATA.
++// * Only good for chroma_idx == 1
++static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                              const unsigned int x0, const unsigned int y0,
++                              const unsigned int log2_cb_size, const unsigned int log2_trafo_size,
++                              const unsigned int blk_idx, const int cbf_luma,
++                              const unsigned int cbf_chroma)
++{
++    const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1);  // one step smaller than luma, never below log2 size 2
++    const unsigned int x0_c = x0 & ~7;  // chroma calls use the position rounded down to a multiple of 8
++    const unsigned int y0_c = y0 & ~7;
++
++    enum ScanType scan_idx   = SCAN_DIAG;
++    enum ScanType scan_idx_c = SCAN_DIAG;
++
++    if (lc->cu.pred_mode == MODE_INTRA)
++    {
++        const unsigned int trafo_size = 1 << log2_trafo_size;
++        const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size);
++
++        do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail);
++        // Chroma: queued with luma when log2_trafo_size > 2; otherwise queued once, with blk_idx 3, using 8x8 availability
++        if (log2_trafo_size > 2)
++            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail);
++        else if (blk_idx == 3)
++            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1,
++                          ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8));
++
++        if (log2_trafo_size < 4) {  // the scan depends on the prediction mode only below log2 size 4
++            if (lc->tu.intra_pred_mode >= 6 &&
++                lc->tu.intra_pred_mode <= 14) {
++                scan_idx = SCAN_VERT;
++            } else if (lc->tu.intra_pred_mode >= 22 &&
++                       lc->tu.intra_pred_mode <= 30) {
++                scan_idx = SCAN_HORIZ;
++            }
++
++            if (lc->tu.intra_pred_mode_c >=  6 &&
++                lc->tu.intra_pred_mode_c <= 14) {
++                scan_idx_c = SCAN_VERT;
++            } else if (lc->tu.intra_pred_mode_c >= 22 &&
++                       lc->tu.intra_pred_mode_c <= 30) {
++                scan_idx_c = SCAN_HORIZ;
++            }
++        }
++    }
++
++    if (!cbf_luma && cbf_chroma == 0)  // nothing coded: skip the QP and residual parsing below
++        return 0;
++
++    if (lc->tu.is_cu_qp_delta_wanted)
++    {
++        const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc);
++        const unsigned int cb_mask = ~0U << log2_cb_size;  // rounds a coordinate down to a multiple of the CB size
++
++        if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) ||
++            qp_delta >  (25 + (s->ps.sps->qp_bd_offset >> 1)))
++        {
++            av_log(s->avctx, AV_LOG_ERROR,
++                   "The cu_qp_delta %d is outside the valid range "
++                   "[%d, %d].\n",
++                   qp_delta,
++                   -(26 + (s->ps.sps->qp_bd_offset >> 1)),
++                    (25 + (s->ps.sps->qp_bd_offset >> 1)));
++            return AVERROR_INVALIDDATA;
++        }
++
++        lc->tu.is_cu_qp_delta_wanted = 0;  // clear the request so the delta is not parsed again by this path
++        lc->tu.cu_qp_delta = qp_delta;
++        ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask);
++    }
++
++    // * Not main profile & untested due to no conform streams
++    if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma &&
++        !lc->cu.cu_transquant_bypass_flag) {
++        int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
++        if (cu_chroma_qp_offset_flag) {
++            int cu_chroma_qp_offset_idx  = 0;  // index 0 is used when the list has a single entry
++            if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
++                cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
++            }
++            lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
++            lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
++        }
++        lc->tu.cu_chroma_qp_offset_wanted = 0;
++    }
++
++    if (cbf_luma)
++        ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
++
++    if (log2_trafo_size > 2 || blk_idx == 3)  // same gating as the chroma intra prediction above
++    {
++        if ((cbf_chroma & CBF_CB0) != 0)
++            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
++                                        log2_trafo_size_c, scan_idx_c, 1);
++        if ((cbf_chroma & CBF_CR0) != 0)
++            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
++                                        log2_trafo_size_c, scan_idx_c, 2);
++    }
++
++    return 0;
++}
++// Mark the coding block at (x0, y0) in s->is_pcm via set_bits(); the map is addressed in units of 8 pixels.
++static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
++{
++    set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width, x0 >> 3, s->ps.sps->pcm_width, log2_cb_size - 3);
++}
++
++// Recursively parse one transform-tree node; returns 0, or the negative error code of the first failing child or leaf.
++static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                              const unsigned int x0, const unsigned int y0,
++                              const unsigned int log2_trafo_size,
++                              const unsigned int trafo_depth, const unsigned int blk_idx,
++                              const unsigned int cbf_c0)
++{
++    // Start from the parent's flags: when nothing is decoded below (log2_trafo_size == 2 and ctx_cfmt(s) != 3) they pass on as-is
++    unsigned int cbf_c1 = cbf_c0;
++    int split_transform_flag;
++    int ret;
++
++    if (lc->cu.intra_split_flag) {
++        if (trafo_depth == 1) {  // split intra CU: each depth-1 block picks its own prediction mode by blk_idx
++            lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[blk_idx];
++            if (ctx_cfmt(s) == 3) {
++                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
++                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[blk_idx];
++            } else {
++                lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++                lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
++            }
++        }
++    } else {
++        lc->tu.intra_pred_mode   = lc->pu.intra_pred_mode[0];
++        lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
++        lc->tu.chroma_mode_c     = lc->pu.chroma_mode_c[0];
++    }
++    // Split flag: decoded when the size and depth limits leave a choice, otherwise inferred
++    if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
++        log2_trafo_size >  s->ps.sps->log2_min_tb_size    &&
++        trafo_depth     < lc->cu.max_trafo_depth       &&
++        !(lc->cu.intra_split_flag && trafo_depth == 0))
++    {
++        split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
++    } else {
++        int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
++                          lc->cu.pred_mode == MODE_INTER &&
++                          lc->cu.part_mode != PART_2Nx2N &&
++                          trafo_depth == 0;
++
++        split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
++                               (lc->cu.intra_split_flag && trafo_depth == 0) ||
++                               inter_split;
++    }
++    // Chroma cbfs: a flag is only decoded here if the parent had the corresponding *0 flag set
++    if (log2_trafo_size > 2 || ctx_cfmt(s) == 3)
++    {
++        const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3);  // second flag per plane
++        cbf_c1 = 0;
++
++        if ((cbf_c0 & CBF_CB0) != 0)
++        {
++            cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S;
++            if (wants_c1)
++                cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S;
++        }
++
++        if ((cbf_c0 & CBF_CR0) != 0)
++        {
++            cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S;
++            if (wants_c1)
++                cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S;
++        }
++    }
++
++    if (split_transform_flag) {
++        const int trafo_size_split = 1 << (log2_trafo_size - 1);  // side length of each of the four quadrants
++        const int x1 = x0 + trafo_size_split;
++        const int y1 = y0 + trafo_size_split;
++
++#define SUBDIVIDE(x, y, idx)                                                    \
++do {                                                                            \
++    ret = hls_transform_tree(s, lc, x, y,                                       \
++                             log2_trafo_size - 1, trafo_depth + 1, idx,         \
++                             cbf_c1);                                           \
++    if (ret < 0)                                                                \
++        return ret;                                                             \
++} while (0)
++
++        SUBDIVIDE(x0, y0, 0);
++        SUBDIVIDE(x1, y0, 1);
++        SUBDIVIDE(x0, y1, 2);
++        SUBDIVIDE(x1, y1, 3);
++
++#undef SUBDIVIDE
++    } else {
++        // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have
++        // trafo_size == 2 with depth == 0 the issue is moot
++        const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) ||  // then 1 without decoding
++            ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth));
++
++        ret = hls_transform_unit(s, lc, x0, y0,
++                                 log2_trafo_size + trafo_depth, log2_trafo_size,  // the sum is passed as log2_cb_size
++                                 blk_idx, cbf_luma, cbf_c1);
++        if (ret < 0)
++            return ret;
++
++        if (!s->sh.disable_deblocking_filter_flag) {
++            ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
++        }
++    }
++    return 0;
++}
++
++// Unpack raw PCM samples from pcm (length is the size handed to init_get_bits) into s->frame at (x0, y0); returns 0 or the init_get_bits error
++static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
++{
++    GetBitContext gb;
++    int ret;
++
++    ret = init_get_bits(&gb, pcm, length);
++    if (ret < 0)
++        return ret;
++    // Luma: cb_size x cb_size block read at pcm.bit_depth
++    s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
++                       frame_stride1(s->frame, 0),
++                       cb_size, cb_size, &gb, s->ps.sps->pcm.bit_depth);
++    // Chroma: same bit reader carries on; position and block size are scaled down by the chroma h/v shifts
++    s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame, x0 >> ctx_hshift(s, 1), y0 >> ctx_vshift(s, 1)),
++                       s->frame->linesize[1],
++                       cb_size >> ctx_hshift(s, 1),
++                       cb_size >> ctx_vshift(s, 1),
++                       &gb, s->ps.sps->pcm.bit_depth_chroma);
++
++    return 0;
++}
++
++
++// Returns x * 2^(y*2), i.e. x * 4^y, evaluated as a left shift of x by 2*y bits
++static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
++{
++    return x << (y * 2);
++}
++// Queue an I_PCM block at (x0, y0): fetch its bytes via ff_hevc_rpi_cabac_skip_bytes, copy them into the job's coeff buffer and add an RPI_PRED_I_PCM command; always returns 0
++static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
++{
++    // Length in bits: one luma term plus two chroma terms, each bit_depth * 4^(log2 size)
++    const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
++        xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
++
++    const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3);
++
++    if (!s->sh.disable_deblocking_filter_flag)
++        ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++
++    // Copy coeffs
++    {
++        const int blen = (length + 7) >> 3;
++        // Round allocated bytes up to nearest 32 to avoid alignment confusion
++        // Allocation is in int16_t s
++        // As we are only using 1 byte per sample and the coeff buffer allows 2 per
++        // sample this rounding doesn't affect the total size we need to allocate for
++        // the coeff buffer
++        int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
++        memcpy(coeffs, pcm, blen);
++
++        // Our coeff stash assumes that any partially allocated 64byte lump
++        // is zeroed so make that true.
++        {
++            uint8_t * const eopcm = (uint8_t *)coeffs + blen;
++            if ((-(intptr_t)eopcm & 63) != 0)
++                memset(eopcm, 0, -(intptr_t)eopcm & 63);
++        }
++
++        // Add command
++        {
++            HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
++            cmd->type = RPI_PRED_I_PCM;
++            cmd->size = log2_cb_size;
++            cmd->i_pcm.src = coeffs;
++            cmd->i_pcm.x = x0;
++            cmd->i_pcm.y = y0;
++            cmd->i_pcm.src_len = length; // In bits, not bytes
++        }
++        return 0;
++    }
++}
++
++// Note on the current job that ref must be decoded down to row (MV_Y >> 2) + y0 + height + 9 (clamped at 0); does nothing when s->threads_type == 0
++static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref,
++                                const MvXY xy, const int y0, const int height)
++{
++    if (s->threads_type != 0) {
++        const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9);
++
++        // Progress has to be attached to current job as the actual wait
++        // is in worker_core which can't use lc
++        int16_t *const pr = lc->jb0->progress_req + ref->dpb_no;
++        if (*pr < y) { // Only ever raise the requested row for this DPB slot
++            *pr = y;
++        }
++    }
++}
++// Decode explicit (non-merge) motion data for a PU: inter_pred_idc on B slices, then for each list used the ref_idx, MVD and MVP flag; predictor + MVD is written to mv
++static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                  const int x0, const int y0, const int nPbW,
++                                  const int nPbH,
++                                  HEVCRpiMvField * const mv)
++{
++    enum InterPredIdc inter_pred_idc = PRED_L0; // Kept unless the slice is a B slice
++    int mvp_flag;
++    const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
++
++    mv->pred_flag = 0;
++    if (s->sh.slice_type == HEVC_SLICE_B)
++        inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
++
++    if (inter_pred_idc != PRED_L1) {
++        MvXY mvd;
++
++        if (s->sh.nb_refs[L0])
++            mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
++
++        mv->pred_flag = PF_L0;
++        mvd = ff_hevc_rpi_hls_mvd_coding(lc);
++        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++                                 mv, mvp_flag, 0);
++        mv->xy[0] = mvxy_add(mv->xy[0], mvd);
++    }
++
++    if (inter_pred_idc != PRED_L0) {
++        MvXY mvd = 0; // Stays 0 for a bi-pred PU when mvd_l1_zero_flag == 1
++
++        if (s->sh.nb_refs[L1])
++            mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
++
++        if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI)
++            mvd = ff_hevc_rpi_hls_mvd_coding(lc);
++
++        mv->pred_flag += PF_L1;
++        mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
++        ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
++                                 mv, mvp_flag, 1);
++        mv->xy[1] = mvxy_add(mv->xy[1], mvd);
++    }
++}
++
++// Choose the least loaded of the n_grp Qs starting at ipe->curr, add load_val to its load, mark the group used, call qpu_mc_link_set(curr, fn) on it and return it
++static HEVCRpiInterPredQ *
++rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
++{
++    HEVCRpiInterPredQ * yp = NULL;
++    HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr;
++    const unsigned int max_fill = ipe->max_fill;
++    unsigned int load = UINT_MAX;
++
++    for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) {
++        // We will always have enough room between the Qs but if we are
++        // running critically low due to poor scheduling then use fill size
++        // rather than load to determine QPU.  This has obvious dire
++        // performance implications but (a) it is better than crashing
++        // and (b) it should (almost) never happen
++        const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base;
++        const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load;
++
++        if (tload < load)
++        {
++            yp = ypt;
++            load = tload;
++        }
++    }
++    // NOTE(review): yp is still NULL here if n_grp == 0 - confirm callers always set n_grp >= 1
++    yp->load += load_val;
++    ipe->used_grp = 1;
++    qpu_mc_link_set(yp->qpu_mc_curr, fn);
++
++    return yp;
++}
++
++// For every Q in ipe: link in its code_sync, step qpu_mc_curr past the sync command and reseed load from the fill measured before the sync was added
++static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
++{
++    for (unsigned int i = 0; i != ipe->n; ++i) {
++        HEVCRpiInterPredQ * const q = ipe->q + i;
++        const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
++
++        qpu_mc_link_set(q->qpu_mc_curr, q->code_sync);
++        q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(&q->qpu_mc_curr->sync + 1);
++        q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
++    }
++}
++
++// Advance to the next group of Qs at the end of a CTU. Returns 0 on success (currently always)
++// We no longer check for Q fullness as we have emergency code in ctu alloc
++// * However it might be an idea to have some means of spotting that we've used it
++static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
++{
++    if (!ipe->used_grp)
++        return 0; // Nothing was queued on the current group so stay on it
++
++    if ((ipe->curr += ipe->n_grp) >= ipe->n)
++    {
++        ipe->curr = 0; // Stepped past the last group: restart at the first and sync every Q
++        rpi_inter_pred_sync(ipe);
++    }
++    ipe->used = 1;
++    ipe->used_grp = 0;
++
++    return 0;
++}
++// Put ipe back to its empty state: group 0 current, used flags cleared and every Q rewound to its base with zero load and NULL last_l0/last_l1
++static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
++{
++    unsigned int i;
++
++    ipe->curr = 0;
++    ipe->used = 0;
++    ipe->used_grp = 0;
++    for (i = 0; i != ipe->n; ++i) {
++        HEVCRpiInterPredQ * const q = ipe->q + i;
++        q->qpu_mc_curr = q->qpu_mc_base;
++        q->load = 0;
++        q->last_l0 = NULL;
++        q->last_l1 = NULL;
++    }
++}
++
++static int rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
++                                 const unsigned int n_max, const unsigned int n_grp,
++                                 const unsigned int total_size, const unsigned int min_gap)
++{
++    int rv;
++
++    memset(ipe, 0, sizeof(*ipe));
++    if ((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) == NULL)
++        return AVERROR(ENOMEM);
++
++    ipe->n_grp = n_grp;
++    ipe->min_gap = min_gap;
++
++    if ((rv = gpu_malloc_cached(total_size, &ipe->gptr)) != 0)
++        av_freep(&ipe->q);
++    return rv;
++}
++
++// Plane base used in MC commands: (f)->data[n] when the matching RPI_QPU_EMU_* flag is set, otherwise the get_vc_address_* value
++#if RPI_QPU_EMU_Y
++#define get_mc_address_y(f) ((f)->data[0])
++#else
++#define get_mc_address_y(f) get_vc_address_y(f)
++#endif
++#if RPI_QPU_EMU_C
++#define get_mc_address_u(f) ((f)->data[1])
++#else
++#define get_mc_address_u(f) get_vc_address_u(f)
++#endif
++// Weight/offset word for single-list prediction: PACK2(off * 2 + 1, mul)
++static inline uint32_t pack_wo_p(const int off, const int mul)
++{
++    return PACK2(off * 2 + 1, mul);
++}
++// Weight/offset word for bi-prediction: PACK2(off0 + off1 + 1, mul), both list offsets combined with one weight
++static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul)
++{
++    return PACK2(off0 + off1 + 1, mul);
++}
++
++// Queue luma single-list prediction for the nPbW x nPbH block at (x0, y0) from src_frame: y_p00 when both quarter-pel fractions are 0, else y_pxx with the source moved back by 3
++static void
++rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
++           const int x0, const int y0,
++           const int nPbW, const int nPbH,
++           const MvXY mv_xy,
++           const int weight_mul,
++           const int weight_offset,
++           AVFrame *const src_frame)
++{
++    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++    const unsigned int mx          = MV_X(mv_xy) & 3;
++    const unsigned int my          = MV_Y(mv_xy) & 3;
++    const unsigned int my_mx       = (my << 8) | mx;
++    const uint32_t     my2_mx2_my_mx = (my_mx << 16) | my_mx;
++    const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
++    qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
++    const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
++    HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++
++    if (my_mx == 0)
++    {
++        const int x1 = x0 + (MV_X(mv_xy) >> 2);
++        const int y1 = y0 + (MV_Y(mv_xy) >> 2);
++        const int bh = nPbH;
++
++        for (int start_x = 0; start_x < nPbW; start_x += 16)
++        {
++            const int bw = FFMIN(nPbW - start_x, 16);
++            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
++            qpu_mc_src_t *const src1 = yp->last_l0;
++            qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
++
++#if RPI_TSTATS
++            {
++                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++                ++ts->y_pred1_x0y0;
++
++                if (nPbW > 8)
++                    ++ts->y_pred1_wgt8;
++                else
++                    ++ts->y_pred1_wle8;
++
++                if (nPbH > 16)
++                    ++ts->y_pred1_hgt16;
++                else
++                    ++ts->y_pred1_hle16;
++            }
++#endif
++
++            src1->x = x1 + start_x;
++            src1->y = y1;
++            src1->base = src_vc_address_y;
++            cmd_y->w = bw;
++            cmd_y->h = bh;
++            cmd_y->wo1 = wo;
++            cmd_y->dst_addr =  dst_addr + (start_x << xshl);
++            yp->last_l0 = &cmd_y->next_src1;
++            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++        }
++    }
++    else
++    {
++        const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3;
++        const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
++        const unsigned int bh = nPbH;
++        int start_x = 0;
++
++#if 1
++        // As Y-pred operates on two independent 8-wide src blocks we can merge
++        // this pred with the previous one if the previous one is 8 pel wide,
++        // the same height as the current block, immediately to the left of our
++        // current dest block and mono-pred.
++
++        qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
++        if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
++        {
++            const int bw = FFMIN(nPbW, 8);
++            qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
++
++            last_y8_src2->x = x1_m3;
++            last_y8_src2->y = y1_m3;
++            last_y8_src2->base = src_vc_address_y;
++            last_y8_p->w += bw;
++            last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
++            last_y8_p->wo2 = wo;
++
++            jb->last_y8_p = NULL;
++            jb->last_y8_l1 = NULL;
++            start_x = bw;
++#if RPI_TSTATS
++            ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
++#endif
++        }
++#endif
++
++        for (; start_x < nPbW; start_x += 16)
++        {
++            const int bw = FFMIN(nPbW - start_x, 16);
++            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
++            qpu_mc_src_t *const src1 = yp->last_l0;
++            qpu_mc_src_t *const src2 = yp->last_l1;
++            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++            {
++                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++                if (mx == 0 && my == 0)
++                    ++ts->y_pred1_x0y0;
++                else if (mx == 0)
++                    ++ts->y_pred1_x0;
++                else if (my == 0)
++                    ++ts->y_pred1_y0;
++                else
++                    ++ts->y_pred1_xy;
++
++                if (nPbW > 8)
++                    ++ts->y_pred1_wgt8;
++                else
++                    ++ts->y_pred1_wle8;
++
++                if (nPbH > 16)
++                    ++ts->y_pred1_hgt16;
++                else
++                    ++ts->y_pred1_hle16;
++            }
++#endif
++            src1->x = x1_m3 + start_x;
++            src1->y = y1_m3;
++            src1->base = src_vc_address_y;
++            if (bw <= 8)
++            {
++                src2->x = MC_DUMMY_X;
++                src2->y = MC_DUMMY_Y;
++#if RPI_QPU_EMU_Y
++                src2->base = s->qpu_dummy_frame_emu;
++#else
++                src2->base = s->qpu_dummy_frame_qpu;
++#endif
++            }
++            else
++            {
++                src2->x = x1_m3 + start_x + 8;
++                src2->y = y1_m3;
++                src2->base = src_vc_address_y;
++            }
++            cmd_y->w = bw;
++            cmd_y->h = bh;
++            cmd_y->mymx21 = my2_mx2_my_mx;
++            cmd_y->wo1 = wo;
++            cmd_y->wo2 = wo;
++            cmd_y->dst_addr =  dst_addr + (start_x << xshl);
++            yp->last_l0 = &cmd_y->next_src1;
++            yp->last_l1 = &cmd_y->next_src2;
++            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++
++            if (bw == 8) { // Remember an exactly 8-wide cmd (its src2 is the dummy) so the next block can merge into it
++                jb->last_y8_l1 = src2;
++                jb->last_y8_p = cmd_y;
++            }
++        }
++    }
++}
++// Queue luma bi-prediction for the block at (x0, y0): xy[0] reads src_frame, xy[1] reads src_frame2; 16-wide y_b00 cmds if no MV has a fractional part, else 8-wide y_bxx cmds
++static void
++rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++           const int x0, const int y0,
++           const int nPbW, const int nPbH,
++           const struct HEVCRpiMvField *const mv_field,
++           const AVFrame *const src_frame,
++           const AVFrame *const src_frame2)
++{
++    const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
++    const MvXY mv  = mv_field->xy[0];
++    const MvXY mv2 = mv_field->xy[1];
++
++    const unsigned int mx          = MV_X(mv) & 3;
++    const unsigned int my          = MV_Y(mv) & 3;
++    const unsigned int my_mx = (my<<8) | mx;
++    const unsigned int mx2          = MV_X(mv2) & 3;
++    const unsigned int my2          = MV_Y(mv2) & 3;
++    const unsigned int my2_mx2 = (my2<<8) | mx2;
++    const uint32_t     my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
++    const unsigned int ref_idx0 = mv_field->ref_idx[0];
++    const unsigned int ref_idx1 = mv_field->ref_idx[1];
++    const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]);
++    const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]);
++
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
++    qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
++    const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
++    const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
++    HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
++
++    if (my2_mx2_my_mx == 0)
++    {
++        const int x1 = x0 + (MV_X(mv) >> 2);
++        const int y1 = y0 + (MV_Y(mv) >> 2);
++        const int x2 = x0 + (MV_X(mv2) >> 2);
++        const int y2 = y0 + (MV_Y(mv2) >> 2);
++        const int bh = nPbH;
++
++        // Can do chunks a full 16 wide if we don't want the H filter
++        for (int start_x=0; start_x < nPbW; start_x += 16)
++        {
++            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
++            qpu_mc_src_t *const src1 = yp->last_l0;
++            qpu_mc_src_t *const src2 = yp->last_l1;
++            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++            {
++                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++                ++ts->y_pred2_x0y0;
++
++                if (nPbH > 16)
++                    ++ts->y_pred2_hgt16;
++                else
++                    ++ts->y_pred2_hle16;
++            }
++#endif
++            src1->x = x1 + start_x;
++            src1->y = y1;
++            src1->base = src1_base;
++            src2->x = x2 + start_x;
++            src2->y = y2;
++            src2->base = src2_base;
++            cmd_y->w = FFMIN(nPbW - start_x, 16);
++            cmd_y->h = bh;
++            cmd_y->mymx21 = 0;
++            cmd_y->wo1 = wo1;
++            cmd_y->wo2 = wo2;
++            cmd_y->dst_addr =  dst + (start_x << xshl);
++            yp->last_l0 = &cmd_y->next_src1;
++            yp->last_l1 = &cmd_y->next_src2;
++            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++        }
++    }
++    else
++    {
++        // Filter requires a run-up of 3
++        const int x1 = x0 + (MV_X(mv) >> 2) - 3;
++        const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
++        const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
++        const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
++        const int bh = nPbH;
++
++        for (int start_x=0; start_x < nPbW; start_x += 8)
++        { // B blocks work 8 at a time
++            // B weights aren't doubled as the QPU code does the same
++            // amount of work as it does for P
++            HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
++            qpu_mc_src_t *const src1 = yp->last_l0;
++            qpu_mc_src_t *const src2 = yp->last_l1;
++            qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
++#if RPI_TSTATS
++            {
++                HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
++                const unsigned int mmx = mx | mx2;
++                const unsigned int mmy = my | my2;
++                if (mmx == 0 && mmy == 0)
++                    ++ts->y_pred2_x0y0;
++                else if (mmx == 0)
++                    ++ts->y_pred2_x0;
++                else if (mmy == 0)
++                    ++ts->y_pred2_y0;
++                else
++                    ++ts->y_pred2_xy;
++
++                if (nPbH > 16)
++                    ++ts->y_pred2_hgt16;
++                else
++                    ++ts->y_pred2_hle16;
++            }
++#endif
++            src1->x = x1 + start_x;
++            src1->y = y1;
++            src1->base = src1_base;
++            src2->x = x2 + start_x;
++            src2->y = y2;
++            src2->base = src2_base;
++            cmd_y->w = FFMIN(nPbW - start_x, 8);
++            cmd_y->h = bh;
++            cmd_y->mymx21 = my2_mx2_my_mx;
++            cmd_y->wo1 = wo1;
++            cmd_y->wo2 = wo2;
++            cmd_y->dst_addr =  dst + (start_x << xshl);
++            yp->last_l0 = &cmd_y->next_src1;
++            yp->last_l1 = &cmd_y->next_src2;
++            yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
++        }
++    }
++}
++
++// Queue chroma single-list prediction (lx selects list 0 or 1) in RPI_CHROMA_BLOCK_WIDTH chunks; h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++  const unsigned int lx, const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const MvXY mv,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  AVFrame * const src_frame)
++{
++    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++    const int hshift = 1; // = s->ps.sps->hshift[1];
++    const int vshift = 1; // = s->ps.sps->vshift[1];
++
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1; // Vertical component uses vshift (same value as hshift here)
++    const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
++    const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)];
++    const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
++    const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]);
++    const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]);
++    qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++    const unsigned int bh = nPbH_c;
++    const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
++
++    for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
++    {
++        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
++        qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
++        qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
++        qpu_mc_src_t * const last_lx = *plast_lx;
++        const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++        last_lx->x = x1_c + start_x;
++        last_lx->y = y1_c;
++        last_lx->base = src_base_u;
++        cmd_c->h = bh;
++        cmd_c->w = bw;
++        cmd_c->coeffs_x = x_coeffs;
++        cmd_c->coeffs_y = y_coeffs;
++        cmd_c->wo_u = wo_u;
++        cmd_c->wo_v = wo_v;
++        cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
++        *plast_lx = &cmd_c->next_src; // The next cmd on this Q and list fills in this cmd's next_src
++        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
++    }
++    return;
++}
++
++// Queue chroma bi-prediction (xy[0] on src_frame, xy[1] on src_frame2) in RPI_CHROMA_BLOCK_WIDTH chunks; h/v shifts fixed at one as that is all the qasm copes with
++static void
++rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++  const int x0_c, const int y0_c,
++  const int nPbW_c, const int nPbH_c,
++  const struct HEVCRpiMvField * const mv_field,
++  const int16_t * const c_weights,
++  const int16_t * const c_offsets,
++  const int16_t * const c_weights2,
++  const int16_t * const c_offsets2,
++  AVFrame * const src_frame,
++  AVFrame * const src_frame2)
++{
++    const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
++    const int hshift = 1; // s->ps.sps->hshift[1];
++    const int vshift = 1; // s->ps.sps->vshift[1];
++    const MvXY mv = mv_field->xy[0];
++    const MvXY mv2 = mv_field->xy[1];
++
++    const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
++    const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
++    const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
++    const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
++    const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
++    const int y1_c = y0_c + (MV_Y(mv) >> (2 + vshift)) - 1; // Vertical component uses vshift (same value as hshift here)
++
++    const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
++    const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
++    const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
++    const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
++
++    const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1;
++    const int y2_c = y0_c + (MV_Y(mv2) >> (2 + vshift)) - 1; // Vertical component uses vshift (same value as hshift here)
++
++    const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]);
++    const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]);
++
++    const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
++    const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
++    const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
++    HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
++    const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
++    const unsigned int bh = nPbH_c;
++
++    for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
++    {
++        const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
++
++        HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
++        qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
++        qpu_mc_src_t * const src_l0 = cp->last_l0;
++        qpu_mc_src_t * const src_l1 = cp->last_l1;
++
++        src_l0->x = x1_c + start_x;
++        src_l0->y = y1_c;
++        src_l0->base = src1_base;
++        src_l1->x = x2_c + start_x;
++        src_l1->y = y2_c;
++        src_l1->base = src2_base;
++
++        u[0].h = bh;
++        u[0].w = bw;
++        u[0].coeffs_x1 = coefs0_x;
++        u[0].coeffs_y1 = coefs0_y;
++        u[0].weight_u1 = c_weights[0]; // Weight L0 U
++        u[0].weight_v1 = c_weights[1]; // Weight L0 V
++        u[0].coeffs_x2 = coefs1_x;
++        u[0].coeffs_y2 = coefs1_y;
++        u[0].wo_u2 = wo_u2;
++        u[0].wo_v2 = wo_v2;
++        u[0].dst_addr_c = dst_base_u + (start_x << xshl);
++
++        cp->last_l0 = &u[0].next_src1;
++        cp->last_l1 = &u[0].next_src2;
++        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
++    }
++}
++
++// Store this PU's motion (per list: COL_POC_INTRA or the ref's long-term flag + POC, plus xy) in every 16x16 col_mvf cell whose top-left corner lies inside the PU; no-op if there is none or col_mvf is NULL
++static inline void
++col_stash(const HEVCRpiContext * const s,
++          const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
++          const HEVCRpiMvField * const mvf)
++{
++    ColMvField * const col_mvf = s->ref->col_mvf;
++    const unsigned int x = (x0 + 15) >> 4;
++    const unsigned int y = (y0 + 15) >> 4;
++    const unsigned int w = ((x0 + 15 + w0) >> 4) - x;
++    const unsigned int h = ((y0 + 15 + h0) >> 4) - y;
++
++    if (col_mvf != NULL && w != 0 && h != 0)
++    {
++        // Only record MV from the top left of the 16x16 block
++
++        const RefPicList * const rpl = s->refPicList;
++        const ColMvField cmv = {
++            .L = {
++                {
++                    .poc = (mvf->pred_flag & PF_L0) == 0 ?
++                            COL_POC_INTRA :
++                            COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]),
++                    .xy = mvf->xy[0]
++                },
++                {
++                    .poc = (mvf->pred_flag & PF_L1) == 0 ?
++                            COL_POC_INTRA :
++                            COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]),
++                    .xy = mvf->xy[1]
++                }
++            }
++        };
++
++        ColMvField * p = col_mvf + y * s->col_mvf_stride + x;
++        const unsigned int stride = s->col_mvf_stride - w; // Step from the end of one written row to the start of the next
++        unsigned int j = h;
++
++        do
++        {
++            unsigned int k = w;
++            do
++            {
++                *p++ = cmv;
++            } while (--k != 0);
++            p += stride;
++        } while (--j != 0);
++    }
++}
++// Decode one PU: get its motion by merge or explicit MVP mode, store it in the MV and col stashes, note the ref rows needed, then queue luma (and, if ctx_cfmt(s) != 0, chroma) prediction cmds
++static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                const unsigned int x0, const unsigned int y0,
++                                const unsigned int nPbW, const unsigned int nPbH,
++                                const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
++{
++    HEVCRpiJob * const jb = lc->jb0;
++
++    struct HEVCRpiMvField current_mv = {{0}};
++    const RefPicList  *const refPicList = s->refPicList;
++    const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL;
++
++    if (lc->cu.pred_mode != MODE_SKIP)
++        lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
++
++    if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) {
++        const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 0 :
++            ff_hevc_rpi_merge_idx_decode(s, lc);
++
++        ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
++                                   partIdx, merge_idx, &current_mv);
++    } else {
++        hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv);
++    }
++
++    {
++        HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
++        unsigned int i, j;
++
++        for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++)
++        {
++            for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++)
++                p[i] = current_mv;
++            p += MVF_STASH_WIDTH_PU;
++        }
++    }
++
++    col_stash(s, x0, y0, nPbW, nPbH, &current_mv);
++
++    if (current_mv.pred_flag & PF_L0) {
++        ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
++        if (!ref0)
++            return; // Missing reference: the MV is already stashed but no prediction is queued
++        hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH);
++    }
++    if (current_mv.pred_flag & PF_L1) {
++        ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
++        if (!ref1)
++            return; // Missing reference: the MV is already stashed but no prediction is queued
++        hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH);
++    }
++
++    if (current_mv.pred_flag == PF_L0) {
++        const int x0_c = x0 >> ctx_hshift(s, 1);
++        const int y0_c = y0 >> ctx_vshift(s, 1);
++        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0],
++          s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
++          ref0->frame);
++
++        if (ctx_cfmt(s) != 0) {
++            rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0],
++              s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++              ref0->frame);
++            return;
++        }
++    } else if (current_mv.pred_flag == PF_L1) {
++        const int x0_c = x0 >> ctx_hshift(s, 1);
++        const int y0_c = y0 >> ctx_vshift(s, 1);
++        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++        rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1],
++          s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
++          ref1->frame);
++
++        if (ctx_cfmt(s) != 0) {
++            rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1],
++              s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++              ref1->frame);
++            return;
++        }
++    } else if (current_mv.pred_flag == PF_BI) {
++        const int x0_c = x0 >> ctx_hshift(s, 1);
++        const int y0_c = y0 >> ctx_vshift(s, 1);
++        const int nPbW_c = nPbW >> ctx_hshift(s, 1);
++        const int nPbH_c = nPbH >> ctx_vshift(s, 1);
++
++        rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
++
++        if (ctx_cfmt(s) != 0) {
++          rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
++                       &current_mv,
++                       s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
++                       s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
++                       s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
++                       s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
++                       ref0->frame,
++                       ref1->frame);
++            return;
++        }
++    }
++}
++
++static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                    const unsigned int x0, const unsigned int y0,
++                    const unsigned int log2_cb_size,
++                    const unsigned int ipm)
++{   // Stash ipm for the block at (x0, y0); for intra CUs outside IRAP pictures also update is_intra, the MV stash and col_mvf
++    const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
++    const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
++
++    {
++        const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE));  // keeps the PU index within the CTB
++        set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm);
++    }
++
++    // If IRAP then everything is Intra & we avoid ever looking at these
++    // stashes so don't bother setting them
++    if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA)
++    {
++        if (s->is_intra != NULL)  // table is optional
++        {
++            set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
++        }
++
++        {
++            HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
++            const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
++            unsigned int n = size_in_pus;
++
++            do  // zero a size_in_pus x size_in_pus square of MV entries, one stash row per pass
++            {
++                memset(p, 0, size_in_pus * sizeof(*p));
++                p += MVF_STASH_WIDTH_PU;
++            } while (--n != 0);
++        }
++
++
++        if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)  // only when both x0 and y0 are multiples of 16
++        {
++            // Only record top left stuff
++            // Blocks should always be aligned on size boundaries
++            // so cannot have overflow from a small block
++
++            ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
++            const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));  // block edge in 16-pel units, min 1
++            const unsigned int stride = s->col_mvf_stride - size_in_col;  // step from end of one row to start of the next
++            unsigned int j = size_in_col;
++
++            do
++            {
++                unsigned int k = size_in_col;
++                do
++                {
++                    p->L[0].poc = COL_POC_INTRA;
++                    p->L[0].xy = 0;
++                    p->L[1].poc = COL_POC_INTRA;
++                    p->L[1].xy = 0;
++                    ++p;
++                } while (--k != 0);
++                p += stride;
++            } while (--j != 0);
++        }
++    }
++}
++
++static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                                const unsigned int x0, const unsigned int y0,
++                                                const unsigned int log2_cb_size)
++{   // Thin wrapper: records INTRA_DC as the mode for the whole block via set_ipm
++    set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC);
++}
++
++
++/**
++ * 8.4.1 - derive the luma intra prediction mode (number is presumably an HEVC spec clause - confirm)
++ */
++static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                int x0, int y0, int log2_pu_size,
++                                int prev_intra_luma_pred_flag,
++                                const unsigned int idx)
++{   // Builds three candidate modes from the left/up stashes, picks the mode using idx, stashes it and returns it
++    const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
++    const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++    const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
++
++    // Up does not cross boundaries so as we always scan 1 slice-tile-line in an
++    // lc we can just keep 1 CTB lR stashes
++    // Left is reset to DC @ Start of Line/Tile/Slice in fill_job
++    const unsigned int cand_up   = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];  // top row of a CTB always uses DC
++    const unsigned int cand_left = lc->ipm_left[yb_pu];
++
++    unsigned int intra_pred_mode;
++    unsigned int a, b, c;  // the three candidate modes
++
++    if (cand_left == cand_up) {
++        if (cand_left < 2) {  // both candidates below mode 2: use a fixed candidate set
++            a = INTRA_PLANAR;
++            b = INTRA_DC;
++            c = INTRA_ANGULAR_26;
++        } else {
++            a = cand_left;
++            b = 2 + ((cand_left - 2 - 1 + 32) & 31);  // previous mode, wrapping within 2..33
++            c = 2 + ((cand_left - 2 + 1) & 31);       // next mode, wrapping within 2..33
++        }
++    } else {
++        a = cand_left;
++        b = cand_up;
++        c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ?  // first of PLANAR, DC, ANGULAR_26 not already used
++                INTRA_PLANAR :
++            (cand_left != INTRA_DC && cand_up != INTRA_DC) ?
++                INTRA_DC :
++                INTRA_ANGULAR_26;
++    }
++
++    if (prev_intra_luma_pred_flag) {
++        intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c;  // idx selects one of the candidates directly
++    } else {
++        // Sort lowest 1st
++        if (a > b)
++            FFSWAP(int, a, b);
++        if (a > c)
++            FFSWAP(int, a, c);
++        if (b > c)
++            FFSWAP(int, b, c);
++
++        intra_pred_mode = idx;  // idx counts the non-candidate modes; step past each candidate at or below it
++        if (intra_pred_mode >= a)
++            intra_pred_mode++;
++        if (intra_pred_mode >= b)
++            intra_pred_mode++;
++        if (intra_pred_mode >= c)
++            intra_pred_mode++;
++    }
++
++    /* record the chosen mode for this PU via set_ipm */
++    set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode);
++    return intra_pred_mode;
++}
++
++static const uint8_t tab_mode_idx[] = {  // 35-entry chroma mode remap, indexed by mode 0..34; used when ctx_cfmt(s) == 2
++     0,  1,  2,  2,  2,  2,  3,  5,  7,  8, 10, 12, 13, 15, 17, 18, 19, 20,
++    21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
++
++static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                  const unsigned int x0, const unsigned int y0,
++                                  const unsigned int log2_cb_size)
++{   // Decode the luma (1 or 4 partitions) and chroma intra modes for one CU into lc->pu
++    static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };  // mode for each decoded chroma_mode 0..3
++    uint8_t prev_intra_luma_pred_flag[4];
++    int split   = lc->cu.part_mode == PART_NxN;
++    const unsigned int split_size = (1 << (log2_cb_size - 1));  // half the CU size: offset of the 2nd column/row
++    int chroma_mode;
++    const unsigned int n = split ? 4 : 1;
++    unsigned int i;
++
++    for (i = 0; i != n; i++)  // all flags are decoded before any index
++        prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
++
++    for (i = 0; i < n; i++) {
++        // depending on mode idx is mpm or luma_pred_mode
++        const unsigned int idx = prev_intra_luma_pred_flag[i] ?
++            ff_hevc_rpi_mpm_idx_decode(lc) :
++            ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
++
++        lc->pu.intra_pred_mode[i] =
++            luma_intra_pred_mode(s, lc,
++                                 x0 + ((i & 1) == 0 ? 0 : split_size),  // bit 0 of i selects the right column
++                                 y0 + ((i & 2) == 0 ? 0 : split_size),  // bit 1 of i selects the bottom row
++                                 log2_cb_size - split,
++                                 prev_intra_luma_pred_flag[i], idx);
++    }
++
++    if (ctx_cfmt(s) == 3) {  // one chroma mode per luma partition
++        for (i = 0; i < n; i++) {
++            lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++            if (chroma_mode != 4) {
++                if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode])  // table mode equals luma mode: use 34 instead
++                    lc->pu.intra_pred_mode_c[i] = 34;
++                else
++                    lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode];
++            } else {  // chroma_mode 4: chroma copies the luma mode
++                lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i];
++            }
++        }
++    } else if (ctx_cfmt(s) == 2) {  // single chroma mode, remapped through tab_mode_idx
++        int mode_idx;
++        lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++        if (chroma_mode != 4) {
++            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++                mode_idx = 34;
++            else
++                mode_idx = intra_chroma_table[chroma_mode];
++        } else {
++            mode_idx = lc->pu.intra_pred_mode[0];
++        }
++        lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
++    } else if (ctx_cfmt(s) != 0) {  // remaining non-zero format: single chroma mode, chroma_mode_c is not written here
++        chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
++        if (chroma_mode != 4) {
++            if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
++                lc->pu.intra_pred_mode_c[0] = 34;
++            else
++                lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
++        } else {
++            lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
++        }
++    }
++}
++
++static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                           const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size)
++{   // Decode one coding unit at (x0, y0); returns 0, or the negative value from hls_pcm_sample / hls_transform_tree
++    const unsigned int cb_size          = 1 << log2_cb_size;
++    const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
++    const unsigned int min_cb_width     = s->ps.sps->min_cb_width;
++    const unsigned int x_cb             = x0 >> log2_min_cb_size;
++    const unsigned int y_cb             = y0 >> log2_min_cb_size;
++    const unsigned int idx              = log2_cb_size - 2;  // passed (minus 1 or 2 for narrower partitions) as last arg of hls_prediction_unit
++    const unsigned int qp_block_mask    = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
++    int skip_flag = 0;
++
++    lc->cu.x                = x0;
++    lc->cu.y                = y0;
++    lc->cu.x_split          = x0;
++    lc->cu.y_split          = y0;
++
++    lc->cu.pred_mode        = MODE_INTRA;  // defaults; overridden below as syntax elements are decoded
++    lc->cu.part_mode        = PART_2Nx2N;
++    lc->cu.intra_split_flag = 0;
++    lc->cu.cu_transquant_bypass_flag = 0;
++    lc->pu.intra_pred_mode[0] = 1;
++    lc->pu.intra_pred_mode[1] = 1;
++    lc->pu.intra_pred_mode[2] = 1;
++    lc->pu.intra_pred_mode[3] = 1;
++
++    if (s->ps.pps->transquant_bypass_enable_flag) {
++        lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
++        if (lc->cu.cu_transquant_bypass_flag)
++            set_deblocking_bypass(s, x0, y0, log2_cb_size);
++    }
++
++    if (s->sh.slice_type != HEVC_SLICE_I) {  // the skip flag is only decoded outside I slices
++        lc->cu.pred_mode = MODE_INTER;
++        skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
++    }
++
++    if (skip_flag) {
++        lc->cu.pred_mode = MODE_SKIP;
++
++        hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++        intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++
++        if (!s->sh.disable_deblocking_filter_flag)
++            ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++    } else {
++        int pcm_flag = 0;
++
++        if (s->sh.slice_type != HEVC_SLICE_I)
++            lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
++        if (lc->cu.pred_mode != MODE_INTRA ||
++            log2_cb_size == s->ps.sps->log2_min_cb_size) {  // part_mode is decoded for non-intra CUs and for minimum-size CUs
++            lc->cu.part_mode        = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
++            lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
++                                      lc->cu.pred_mode == MODE_INTRA;
++        }
++
++        if (lc->cu.pred_mode == MODE_INTRA) {
++            if (lc->cu.part_mode == PART_2Nx2N &&
++                log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size &&  // 0 if not enabled
++                log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
++                ff_hevc_rpi_pcm_flag_decode(lc) != 0)  // flag is only read if the size checks pass
++            {
++                int ret;
++                pcm_flag = 1;
++                intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++                if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0)
++                    return ret;
++
++                if (s->ps.sps->pcm.loop_filter_disable_flag)
++                    set_deblocking_bypass(s, x0, y0, log2_cb_size);
++            } else {
++                intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
++            }
++        } else {
++            intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
++            switch (lc->cu.part_mode) {  // one hls_prediction_unit call per partition; x_split/y_split updated between calls
++            case PART_2Nx2N:
++                hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
++                break;
++            case PART_2NxN:
++                hls_prediction_unit(s, lc, x0, y0,               cb_size, cb_size / 2, log2_cb_size, 0, idx);
++                lc->cu.y_split = y0 + cb_size / 2;
++                hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
++                break;
++            case PART_Nx2N:
++                hls_prediction_unit(s, lc, x0,               y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
++                lc->cu.x_split = x0 + cb_size / 2;
++                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
++                break;
++            case PART_2NxnU:
++                hls_prediction_unit(s, lc, x0, y0,               cb_size, cb_size     / 4, log2_cb_size, 0, idx);
++                lc->cu.y_split = y0 + cb_size / 4;
++                hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx);
++                break;
++            case PART_2NxnD:
++                hls_prediction_unit(s, lc, x0, y0,                   cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx);
++                lc->cu.y_split = y0 + cb_size / 4 * 3;
++                hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size     / 4, log2_cb_size, 1, idx);
++                break;
++            case PART_nLx2N:
++                hls_prediction_unit(s, lc, x0,               y0, cb_size     / 4, cb_size, log2_cb_size, 0, idx - 2);
++                lc->cu.x_split = x0 + cb_size / 4;
++                hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
++                break;
++            case PART_nRx2N:
++                hls_prediction_unit(s, lc, x0,                   y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
++                lc->cu.x_split = x0 + cb_size / 4 * 3;
++                hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size     / 4, cb_size, log2_cb_size, 1, idx - 2);
++                break;
++            case PART_NxN:
++                hls_prediction_unit(s, lc, x0,               y0,               cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
++                lc->cu.x_split = x0 + cb_size / 2;
++                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0,               cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
++                lc->cu.y_split = y0 + cb_size / 2;
++                hls_prediction_unit(s, lc, x0,               y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
++                hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
++                break;
++            }
++        }
++
++        if (!pcm_flag) {
++            int rqt_root_cbf = 1;  // stays 1 for intra CUs and for 2Nx2N merge CUs; otherwise decoded
++
++            if (lc->cu.pred_mode != MODE_INTRA &&
++                !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
++                rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
++            }
++            if (rqt_root_cbf) {
++                const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);  // no chroma cbf bits when ctx_cfmt(s) is 0
++                int ret;
++
++                lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
++                                         s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
++                                         s->ps.sps->max_transform_hierarchy_depth_inter;
++                // transform_tree does deblock_boundary_strengths
++                ret = hls_transform_tree(s, lc, x0, y0,
++                                         log2_cb_size, 0, 0, cbf_c);
++                if (ret < 0)
++                    return ret;
++            } else {
++                if (!s->sh.disable_deblocking_filter_flag)
++                    ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
++            }
++        }
++    }
++
++    // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
++    if (lc->tu.is_cu_qp_delta_wanted)
++        ff_hevc_rpi_set_qPy(s, lc, x0, y0);
++
++    if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++       ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {  // CU's right and bottom edges both land on a qp-delta block boundary
++        lc->qPy_pred = lc->qp_y;
++    }
++
++    set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);  // qp_y for every min-CB in this CU
++
++    set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);  // value packs ct_depth (bits 1+) and skip_flag (bit 0)
++
++    return 0;
++}
++
++// Returns:
++//  < 0  Error
++//  0    No more data (end of slice flag read as set, or a sub-call returned 0)
++//  1    More data wanted (the recursion below only continues while this is non-zero)
++static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
++                               const int log2_cb_size, const unsigned int cb_depth)
++{
++    const int cb_size    = 1 << log2_cb_size;
++    int ret;
++    int split_cu;
++
++    lc->ct_depth = cb_depth;
++    split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);  // default: split whenever we are above the minimum size
++    if (x0 + cb_size <= s->ps.sps->width  &&
++        y0 + cb_size <= s->ps.sps->height &&
++        split_cu)
++    {   // block lies fully inside the picture, so the split flag is read; otherwise the default above stands
++        split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
++    }
++
++    // Qp delta (and offset) need to remain wanted if cb_size < min until
++    // a coded block is found so we still set the initial state at depth 0 (outside
++    // this fn) and only reset here
++    if (s->ps.pps->cu_qp_delta_enabled_flag &&
++        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++    {
++        lc->tu.is_cu_qp_delta_wanted = 1;
++        lc->tu.cu_qp_delta          = 0;
++    }
++    if (s->sh.cu_chroma_qp_offset_enabled_flag &&
++        log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
++    {
++        lc->tu.cu_chroma_qp_offset_wanted = 1;
++    }
++
++    lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
++    lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
++    lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
++
++    if (split_cu) {
++        int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
++        const int cb_size_split = cb_size >> 1;
++        const int x1 = x0 + cb_size_split;
++        const int y1 = y0 + cb_size_split;
++
++        int more_data = 0;
++
++        more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);  // top-left quadrant is always visited
++        if (more_data < 0)
++            return more_data;
++
++        if (more_data && x1 < s->ps.sps->width) {  // top-right, only if it starts inside the picture
++            more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++        if (more_data && y1 < s->ps.sps->height) {  // bottom-left
++            more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++        if (more_data && x1 < s->ps.sps->width &&
++            y1 < s->ps.sps->height) {  // bottom-right
++            more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
++            if (more_data < 0)
++                return more_data;
++        }
++
++        if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
++            ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
++            lc->qPy_pred = lc->qp_y;
++
++        if (more_data)
++            return ((x1 + cb_size_split) < s->ps.sps->width ||
++                    (y1 + cb_size_split) < s->ps.sps->height);
++        else
++            return 0;
++    } else {
++        ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
++        if (ret < 0)
++            return ret;
++        if ((!((x0 + cb_size) %
++               (1 << (s->ps.sps->log2_ctb_size))) ||
++             (x0 + cb_size >= s->ps.sps->width)) &&
++            (!((y0 + cb_size) %
++               (1 << (s->ps.sps->log2_ctb_size))) ||
++             (y0 + cb_size >= s->ps.sps->height))) {  // CU ends at a CTB edge or the picture edge in both x and y
++            int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
++            return !end_of_slice_flag;
++        } else {
++            return 1;
++        }
++    }
++
++    return 0;  // NEVER
++}
++
++static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++                                 const int x_ctb, const int y_ctb, const int ctb_addr_ts)
++{   // Per-CTB setup: records the slice address, CTB end coordinates, boundary flags and neighbour availability in lc
++    const unsigned int ctb_size          = 1 << s->ps.sps->log2_ctb_size;
++    const unsigned int ctb_addr_rs       = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++    const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr;  // slice_addr = RS addr of start of slice
++    const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++    const unsigned int line_w = s->ps.sps->ctb_width;
++
++    s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;  // written first: the comparisons below read this entry
++
++    lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);   // clipped to the picture
++    lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
++
++    lc->boundary_flags = 0;
++
++    if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0)
++        lc->boundary_flags |= BOUNDARY_LEFT_TILE;
++    if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])  // left CTB has a different slice address
++        lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
++    if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0)
++        lc->boundary_flags |= BOUNDARY_UPPER_TILE;
++    if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])  // CTB above has a different slice address
++        lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
++
++    // Use line width rather than tile width for addr_in_slice test as
++    // addr_in_slice is in raster units
++
++    lc->ctb_avail =
++        ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
++        ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
++        ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
++            (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) |   // more than one full line into the slice
++        ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
++            (ctb_addr_rs_in_slice + 1 >= line_w) ? AVAIL_UR : 0);  // at least line_w - 1 CTBs into the slice
++    // Down-left never avail at CTB level
++}
++
++
++static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{   // Runs the filter call over jb->bounds, signals recon progress if it returned a positive y, then frees the job
++    int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds,
++        (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0);  // 3rd arg: does the job's last CTU carry the EOT flag
++
++    // Signal
++    if (y > 0) {
++        // Cast away const as progress is held in s, but this really shouldn't confuse anything
++        ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1);
++    }
++
++    // Job done now
++    // ? Move outside this fn
++    job_free(s->jbc, jb);
++}
++
++// I-pred, transform_and_add for all blocks types done here
++// All ARM
++static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{   // Executes the jb->intra command list in order, then marks it empty
++    unsigned int i;
++    HEVCRpiIntraPredEnv * const iap = &jb->intra;
++    const HEVCPredCmd *cmd = iap->cmds;
++
++#if !RPI_WORKER_WAIT_PASS_0
++    rpi_sem_wait(&jb->sem);
++    rpi_cache_flush_execute(jb->rfe);  // Invalidate data set up in pass1
++#endif
++
++    for (i = iap->n; i > 0; i--, cmd++)  // iap->n commands, dispatched on cmd->type
++    {
++        switch (cmd->type)
++        {
++            case RPI_PRED_INTRA:
++                s->hpc.intra_pred(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++                break;
++            case RPI_PRED_INTRA_C:
++                s->hpc.intra_pred_c(s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail, cmd->size);
++                break;
++            case RPI_PRED_ADD_RESIDUAL:
++                s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);  // table index is size - 2 (size presumably a log2 - confirm)
++                break;
++            case RPI_PRED_ADD_DC:
++                s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++                break;
++            case RPI_PRED_ADD_RESIDUAL_U:
++                s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++                break;
++            case RPI_PRED_ADD_RESIDUAL_V:
++                s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
++                break;
++            case RPI_PRED_ADD_RESIDUAL_C:
++                s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
++                break;
++            case RPI_PRED_ADD_DC_U:
++            case RPI_PRED_ADD_DC_V:  // U and V DC adds share one handler
++                s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
++                break;
++
++            case RPI_PRED_I_PCM:
++                pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);  // here size is a log2
++                break;
++
++            default:  // unknown command type: log and abort the process
++                av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
++                abort();
++        }
++    }
++
++    // Mark done
++    iap->n = 0;
++}
++
++
++// Set initial uniform job values & zero ctu_count
++static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
++{
++    unsigned int i;
++    HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
++    HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
++    const HEVCRpiSPS * const sps = s->ps.sps;
++
++    const uint16_t pic_width_y   = sps->width;
++    const uint16_t pic_height_y  = sps->height;
++
++    const uint16_t pic_width_c   = sps->width >> ctx_hshift(s, 1);   // chroma dimensions: luma shifted by ctx_hshift/ctx_vshift(s, 1)
++    const uint16_t pic_height_c  = sps->height >> ctx_vshift(s, 1);
++
++    // We expect the pointer to change if we use another sps
++    if (sps != jb->sps)
++    {   // the job was last set up for a different sps: redo the per-sps setup
++        worker_pic_free_one(jb);
++
++        set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);  // init info table is indexed by bit_depth - 8
++        set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
++
++        {
++            const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
++            const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));  // two chroma planes, each reduced by the chroma shifts
++            worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
++        }
++
++        jb->sps = sps;
++    }
++
++    jb->waited = 0;
++    jb->ctu_ts_first = ctu_ts_first;
++    jb->ctu_ts_last = -1;  // placeholder; presumably set as CTUs are added to the job (not visible here)
++
++    rpi_inter_pred_reset(cipe);
++    for (i = 0; i < cipe->n; i++) {  // reset the setup struct at the head of every chroma Q
++        HEVCRpiInterPredQ * const cp = cipe->q + i;
++        qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
++
++        u->next_src1.x = 0;
++        u->next_src1.y = 0;
++        u->next_src1.base = 0;
++        u->pic_cw = pic_width_c;
++        u->pic_ch = pic_height_c;
++        u->stride2 = av_rpi_sand_frame_stride2(s->frame);
++        u->stride1 = av_rpi_sand_frame_stride1(s->frame);
++        cp->last_l0 = &u->next_src1;
++
++        u->next_fn = 0;
++        u->next_src2.x = 0;
++        u->next_src2.y = 0;
++        u->next_src2.base = 0;
++        cp->last_l1 = &u->next_src2;
++
++        cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);  // commands start directly after the setup struct
++    }
++
++    rpi_inter_pred_reset(yipe);
++    for (i = 0; i < yipe->n; i++) {  // same reset for every luma Q
++        HEVCRpiInterPredQ * const yp = yipe->q + i;
++        qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
++
++        y->next_src1.x = 0;
++        y->next_src1.y = 0;
++        y->next_src1.base = 0;
++        y->next_src2.x = 0;
++        y->next_src2.y = 0;
++        y->next_src2.base = 0;
++        y->pic_h = pic_height_y;
++        y->pic_w = pic_width_y;
++        y->stride2 = av_rpi_sand_frame_stride2(s->frame);
++        y->stride1 = av_rpi_sand_frame_stride1(s->frame);
++        y->next_fn = 0;
++        yp->last_l0 = &y->next_src1;
++        yp->last_l1 = &y->next_src2;
++
++        yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
++    }
++
++    jb->last_y8_p = NULL;
++    jb->last_y8_l1 = NULL;
++
++    for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++        jb->progress_req[i] = -1;  // -1 in every slot (presumably "nothing requested" - confirm)
++    }
++
++    worker_pic_reset(&jb->coeffs);
++}
++
++
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
++                                     const vpu_qpu_job_h vqj,
++                                     rpi_cache_flush_env_t * const rfe,
++                                     HEVCRpiInterPredEnv * const ipe)
++{   // Terminates every Q in ipe, adds a writeback to rfe and queues the Qs on vqj; returns 0 if ipe was unused, else 1
++    unsigned int i;
++    uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
++    unsigned int max_block = 0;  // largest used Q size in bytes
++
++    if (!ipe->used) {
++        return 0;
++    }
++
++    if (ipe->curr != 0) {
++        rpi_inter_pred_sync(ipe);
++    }
++
++    // Add final commands to Q
++    for(i = 0; i != ipe->n; ++i) {
++        HEVCRpiInterPredQ * const yp = ipe->q + i;
++        qpu_mc_src_t *const p0 = yp->last_l0;
++        qpu_mc_src_t *const p1 = yp->last_l1;
++        const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;  // bytes written to this Q
++
++        if (block_size > max_block)
++            max_block = block_size;
++
++        qpu_mc_link_set(yp->qpu_mc_curr, yp->code_exit);
++
++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++        p0->x = MC_DUMMY_X;
++        p0->y = MC_DUMMY_Y;
++        p0->base = s->qpu_dummy_frame_qpu;
++        p1->x = MC_DUMMY_X;
++        p1->y = MC_DUMMY_Y;
++        p1->base = s->qpu_dummy_frame_qpu;
++
++        yp->last_l0 = NULL;
++        yp->last_l1 = NULL;
++
++        // Add to mailbox list
++        mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);  // gptr.vc plus the Q's offset from gptr.arm
++        mail[i][1] = yp->code_setup;
++    }
++
++    // We don't need invalidate here as the uniforms aren't changed by the QPU
++    // and leaving them in ARM cache avoids (pointless) pre-reads when writing
++    // new values which seems to give us a small performance advantage
++    //
++    // In most cases we will not have a completely packed set of uniforms and as
++    // we have a 2d invalidate we writeback all uniform Qs to the depth of the
++    // fullest
++    rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
++                                  (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
++                                  ipe->n, ipe->max_fill + ipe->min_gap);
++    vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
++
++    return 1;
++}
++#endif
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
++                                     const vpu_qpu_job_h vqj,
++                                     rpi_cache_flush_env_t * const rfe,
++                                     HEVCRpiInterPredEnv * const ipe)
++{   // Emulation variant: terminates the Qs only; vqj and rfe are not used in this body. Returns 0 if ipe was unused, else 1
++    unsigned int i;
++    if (!ipe->used) {
++        return 0;
++    }
++
++    if (ipe->curr != 0) {
++        rpi_inter_pred_sync(ipe);
++    }
++
++    // Add final commands to Q
++    for(i = 0; i != ipe->n; ++i) {
++        HEVCRpiInterPredQ * const yp = ipe->q + i;
++        qpu_mc_src_t *const p0 = yp->last_l0;
++        qpu_mc_src_t *const p1 = yp->last_l1;
++
++        yp->qpu_mc_curr->data[-1] = yp->code_exit;  // stores the exit code in the word just before the current command
++
++        // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
++        p0->x = MC_DUMMY_X;
++        p0->y = MC_DUMMY_Y;
++        p0->base = s->qpu_dummy_frame_emu;
++        p1->x = MC_DUMMY_X;
++        p1->y = MC_DUMMY_Y;
++        p1->base = s->qpu_dummy_frame_emu;
++
++        yp->last_l0 = NULL;
++        yp->last_l1 = NULL;
++    }
++
++    return 1;
++}
++#endif
++
++// Pick the emulated or QPU terminate routine separately for luma (_y) and chroma (_c)
++#if RPI_QPU_EMU_Y
++#define mc_terminate_add_y mc_terminate_add_emu
++#else
++#define mc_terminate_add_y mc_terminate_add_qpu
++#endif
++#if RPI_QPU_EMU_C
++#define mc_terminate_add_c mc_terminate_add_emu
++#else
++#define mc_terminate_add_c mc_terminate_add_qpu
++#endif
++
++// One-off cache flush of a whole frame using mode RPI_CACHE_FLUSH_MODE_WB_INVALIDATE (s is unused)
++static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
++{
++    rpi_cache_buf_t flush_store;  // Backing storage for the flush env below
++    rpi_cache_flush_env_t * const flush_env = rpi_cache_flush_init(&flush_store);
++    rpi_cache_flush_add_frame(flush_env, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
++    rpi_cache_flush_finish(flush_env);
++}
++// Set jb->bounds to the pel rectangle spanned by CTUs ctu_ts_first..ctu_ts_last, clipped to sps width/height
++static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    const unsigned int log2_ctb = s->ps.sps->log2_ctb_size;
++    const unsigned int row_ctbs = s->ps.sps->ctb_width;
++    const unsigned int first_rs = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
++    const unsigned int span_rs = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last] - first_rs;
++    RpiBlk *const blk = &jb->bounds;
++    av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
++    blk->x = (first_rs % row_ctbs) << log2_ctb;
++    blk->y = (first_rs / row_ctbs) << log2_ctb;
++    blk->w = (span_rs % row_ctbs + 1) << log2_ctb;  // CTB-aligned extent first ...
++    blk->h = (span_rs / row_ctbs + 1) << log2_ctb;
++    blk->w = FFMIN(blk->w, s->ps.sps->width - blk->x);  // ... then clip to the picture
++    blk->h = FFMIN(blk->h, s->ps.sps->height - blk->y);
++}
++// Pass 1 worker when RPI_PASSES == 2 (see hevc_init_worker): runs the pred cmds then the deblock cmds of a job
++#if RPI_PASSES == 2
++static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    // Perform intra prediction and residual reconstruction
++    rpi_execute_pred_cmds(s, jb);
++
++    // Perform deblocking for CTBs in this row
++    rpi_execute_dblk_cmds(s, jb);
++}
++#endif
++
++// Core execution tasks: pass 0 worker that builds & submits the VPU/QPU job for jb and runs the cache flushes around it
++static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
++{
++    int pred_y, pred_c;
++    vpu_qpu_job_env_t qvbuf;
++    const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
++#if RPI_WORKER_WAIT_PASS_0
++    int do_wait;
++#endif
++
++    {
++        const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
++        if (cf->s[3].n + cf->s[2].n != 0)  // Only add a VPU op if coeff groups 2 or 3 hold anything
++        {
++            const unsigned int csize = sizeof(cf->s[3].buf[0]);
++            const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;  // presumably s[3] fills downwards from buf - TODO confirm
++            unsigned int n16 = (cf->s[2].n >> 8);   // s[2].n in units of 256 coeffs
++            unsigned int n32 = (cf->s[3].n >> 10);  // s[3].n in units of 1024 coeffs
++#if RPI_COMPRESS_COEFFS
++            if (cf->s[2].packed) {  // Top 16 bits carry a second count: n again if packed is set, else the scaled packed_n
++                n16 = n16 | (n16<<16);
++            } else {
++                const unsigned int npack16 = (cf->s[2].packed_n>>8);
++                n16 = n16 | (npack16<<16);
++            }
++            if (cf->s[3].packed) {
++                n32 = n32 | (n32<<16);
++            } else {
++                const unsigned int npack32 = (cf->s[3].packed_n>>10);
++                n32 = n32 | (npack32<<16);
++            }
++#endif
++            vpu_qpu_job_add_vpu(vqj,
++                vpu_get_fn(s->ps.sps->bit_depth),
++                vpu_get_constants(),
++                cf->gptr.vc,
++                n16,
++                cf->gptr.vc + offset32,
++                n32,
++                0);
++            // Queue a WB_INVALIDATE flush of the two coeff ranges just passed to the VPU op
++            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
++            rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
++        }
++    }
++
++    pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
++
++// We could take a sync here and try to locally overlap QPU processing with ARM
++// but testing showed a slightly negative benefit with noticeable extra complexity
++
++    pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
++
++    // Returns 0 if nothing to do, 1 if sync added
++#if RPI_WORKER_WAIT_PASS_0
++    do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
++#else
++    if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
++        sem_post(&jb->sem);  // No sync was added so post the sem ourselves
++#endif
++
++    rpi_cache_flush_execute(jb->rfe);
++
++    // Await progress as required
++    // jb->waited will only be clear if we have already tested the progress values
++    // (in worker_submit_job) and found we don't have to wait
++    if (jb->waited)
++    {
++        unsigned int i;
++        for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
++            if (jb->progress_req[i] >= 0) {  // Negative => nothing required from DPB entry i
++                ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
++            }
++        }
++    }
++
++    vpu_qpu_job_finish(vqj);
++
++    // We always work on a rectangular block
++    if (pred_y || pred_c)
++    {
++        rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
++                                        jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
++                                        ctx_vshift(s, 1), pred_y, pred_c);
++    }
++
++    // If we have emulated QPU ops (RPI_QPU_EMU_*) - do them here
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++    if (av_rpi_is_sand8_frame(s->frame))
++    {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++        ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
++#else
++        ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
++#endif
++    }
++    else
++    {
++#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
++        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
++#elif RPI_QPU_EMU_Y
++        ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
++#else
++        ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
++#endif
++    }
++#endif
++    // With RPI_WORKER_WAIT_PASS_0: wait on the sync added above (if any) and run the flush env again before returning
++#if RPI_WORKER_WAIT_PASS_0
++    if (do_wait)
++        rpi_sem_wait(&jb->sem);
++    rpi_cache_flush_execute(jb->rfe);
++#endif
++}
++
++// Release what an inter-pred env holds: its Q array (ipe->q) and its gptr allocation
++static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
++{
++    av_freep(&ipe->q);
++    gpu_free(&ipe->gptr);
++}
++// Allocate & initialise a job with its intra cmd list and chroma/luma inter-pred Qs; returns NULL on any failure
++static HEVCRpiJob * job_new(void)
++{
++    HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
++
++    if (jb == NULL)
++        return NULL;
++
++    sem_init(&jb->sem, 0, 0);
++    jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
++    ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
++
++    jb->intra.n = 0;
++    if ((jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS)) == NULL)
++        goto fail1;
++
++    // * Sizeof the union structure might be overkill but at the moment it
++    //   is correct (it certainly isn't going to be too small)
++    // * Set max fill to slack/2 from the end of the Q.  If we exceed this in any
++    //   Q then we schedule by size (so we should never use that Q again apart from syncs)
++    // * Given how aggressive the overflow response is we could maybe put the
++    //   threshold even nearer the end, but I don't expect us to ever hit
++    //   it on any real stream anyway.
++
++    if (rpi_inter_pred_alloc(&jb->chroma_ip,
++                         QPU_N_MAX, QPU_N_GRP,
++                         QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
++                         QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2) != 0)
++        goto fail2;
++    if (rpi_inter_pred_alloc(&jb->luma_ip,
++                         QPU_N_MAX,  QPU_N_GRP,
++                         QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
++                         QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2) != 0)
++        goto fail3;
++
++    return jb;
++
++fail3:  // luma alloc failed: chroma_ip is the env that was built (a failed alloc is assumed to leave nothing, as fail2 already relies on)
++    rpi_free_inter_pred(&jb->chroma_ip);
++fail2:
++    av_freep(&jb->intra.cmds);
++fail1:
++    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++    rpi_cache_flush_finish(jb->rfe);
++    sem_destroy(&jb->sem);
++    av_free(jb);  // The job struct itself must go too - the caller only ever sees NULL
++    return NULL;
++}
++// Destroy a fully built job: releases everything job_new() set up, then the job struct itself
++static void job_delete(HEVCRpiJob * const jb)
++{
++    worker_pic_free_one(jb);
++    ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
++    rpi_free_inter_pred(&jb->chroma_ip);
++    rpi_free_inter_pred(&jb->luma_ip);
++    av_freep(&jb->intra.cmds);
++    rpi_cache_flush_finish(jb->rfe);  // Not really needed - should do nothing
++    sem_destroy(&jb->sem);
++    av_free(jb);
++}
++// Free a job global and every job on its free1 list; NULL is accepted and ignored
++static void jbg_delete(HEVCRpiJobGlobal * const jbg)
++{
++    HEVCRpiJob * pending;
++
++    if (jbg == NULL)
++        return;
++
++    // Step to the successor before each job is destroyed so the walk never reads freed memory
++    for (pending = jbg->free1; pending != NULL;)
++    {
++        HEVCRpiJob * const doomed = pending;
++        pending = doomed->next;
++        job_delete(doomed);
++    }
++
++    pthread_mutex_destroy(&jbg->lock);
++    av_free(jbg);
++}
++// Create a job global whose free1 list holds job_count new jobs; returns NULL if any allocation fails
++static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
++{
++    HEVCRpiJobGlobal * const pool = av_mallocz(sizeof(*pool));
++
++    if (pool == NULL)
++        return NULL;
++
++    pthread_mutex_init(&pool->lock, NULL);
++
++    // Each new job is pushed onto the head of the free list
++    for (; job_count != 0; --job_count)
++    {
++        HEVCRpiJob * const fresh = job_new();
++        if (fresh == NULL)
++        {
++            // jbg_delete also frees the jobs already on the list
++            jbg_delete(pool);
++            return NULL;
++        }
++        fresh->next = pool->free1;
++        pool->free1 = fresh;
++    }
++    return pool;
++}
++// Destroy a job control (NULL is a no-op) and drop its reference on the shared job global
++static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
++{
++    HEVCRpiJobGlobal * jbg;
++
++    if (jbc == NULL)
++        return;
++
++    jbg = jbc->jbg;  // Fetched now as jbc is freed before the deref below
++
++    if (jbc->jb1 != NULL)
++        job_delete(jbc->jb1);
++
++    pthread_mutex_destroy(&jbc->in_lock);
++    sem_destroy(&jbc->sem_out);
++    av_free(jbc);
++
++    // Deref the global job context
++    if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)  // Old count of 1 => that was the last ref
++        jbg_delete(jbg);
++}
++// Create a job control that takes a reference on jbg and owns one local job (jb1); returns NULL on failure
++static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
++{
++    HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
++
++    if (jbc == NULL)
++        return NULL;
++
++    jbc->jbg = jbg;
++    atomic_fetch_add(&jbg->ref_count, 1);  // NOTE(review): jbg is dereferenced unchecked - presumably callers never pass NULL
++
++    sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
++    pthread_mutex_init(&jbc->in_lock, NULL);
++
++    if ((jbc->jb1 = job_new()) == NULL)
++        goto fail;
++    jbc->jb1->jbc_local = jbc;
++
++    return jbc;
++
++fail:
++    rpi_job_ctl_delete(jbc);  // Also drops the jbg reference taken above
++    return NULL;
++}
++
++
++// Init the pass queues (each is given the next pass's sem_in, the last gets jbc->sem_out) and start them all
++static av_cold void hevc_init_worker(HEVCRpiContext * const s)
++{
++#if RPI_PASSES == 2
++    pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
++#elif RPI_PASSES == 3
++    pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
++    pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
++#else
++#error Passes confused
++#endif
++    pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);  // Pass 0 is worker_core in both configs
++
++    pass_queues_start_all(s);
++}
++// Shut down the worker side: term then kill all pass queues, then delete the job control
++static av_cold void hevc_exit_worker(HEVCRpiContext *s)
++{
++    pass_queues_term_all(s);
++
++    pass_queues_kill_all(s);
++
++    rpi_job_ctl_delete(s->jbc);
++    s->jbc = NULL;  // Guard against reuse of the freed control
++}
++
++// Sanity check the slice segment start against the tile / entry point layout then reset lc for it; returns 0 or AVERROR_INVALIDDATA
++static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
++{
++    const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
++    const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
++
++    // Check for obvious disasters
++    if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
++        av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    // If dependent then ctb_addr_ts != 0 from previous check
++    if (s->sh.dependent_slice_segment_flag) {
++        int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];  // CTB immediately before us in ts order
++        if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
++            av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
++            return AVERROR_INVALIDDATA;
++        }
++    }
++    // Not WPP: reject if the entry points counted on from our tile reach or pass the tile count
++    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++        tile_id + s->sh.num_entry_point_offsets >= tiles)
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    // Tiled stuff must start at start of tile if it has multiple entry points
++    if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
++        s->sh.num_entry_point_offsets != 0 &&
++        ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
++    {
++        av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
++        return AVERROR_INVALIDDATA;
++    }
++
++    ff_hevc_rpi_cabac_init_decoder(lc);
++
++    // Setup any required decode vars
++    lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
++
++//    printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
++    lc->qp_y = s->sh.slice_qp;
++
++    // General setup
++    lc->bt_line_no = 0;
++    lc->ts = ctb_addr_ts;
++    return 0;
++}
++// Turn sh->entry_point_offset[] into sh->offset[] / sh->size[] byte ranges within nal->data; returns 0 or AVERROR_INVALIDDATA
++static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++    const GetBitContext * const gb = &s->HEVClc->gb;
++    RpiSliceHeader * const sh = &s->sh;
++    int i, j;
++
++    const unsigned int length = nal->size;
++    unsigned int offset = ((gb->index) >> 3) + 1;  // We have a bit & align still to come = +1 byte
++    unsigned int cmpt;         // Skipped (emulation prevention) bytes found inside the current section
++    unsigned int startheader;  // End of the current section, pulled back by one for each skipped byte found
++
++    if (sh->num_entry_point_offsets == 0) {
++        s->data = NULL;  // No entry points => nothing to remember
++        return 0;
++    }
++
++    // offset in slice header includes emulation prevention bytes.
++    // Unfortunately those have been removed by the time we get here so we
++    // have to compensate.  The nal layer keeps a track of where they were.
++    for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) {
++        if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++            startheader--;
++            cmpt++;
++        }
++    }
++
++    for (i = 1; i < sh->num_entry_point_offsets; i++) {
++        offset += (sh->entry_point_offset[i - 1] - cmpt);  // Step over the previous section using its compensated length
++        for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) {
++            if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
++                startheader--;
++                cmpt++;
++            }
++        }
++        if (sh->entry_point_offset[i] <= cmpt) {
++            av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
++            return AVERROR_INVALIDDATA;
++        }
++        sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
++        sh->offset[i - 1] = offset;
++    }
++
++    offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
++    if (length < offset) {
++        av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
++        return AVERROR_INVALIDDATA;
++    }
++    sh->size[sh->num_entry_point_offsets - 1] = length - offset;  // Final section runs to the end of the NAL
++    sh->offset[sh->num_entry_point_offsets - 1] = offset;
++
++    // Remember data start pointer as we won't have nal later
++    s->data = nal->data;
++    return 0;
++}
++
++
++// Decode CTUs starting at lc->ts into job lc->jb0, stopping after max_blocks CTUs (0 = no limit), at Q full or at end of data
++// Return
++// < 0   Error
++// 0     OK
++//
++// jb->ctu_ts_last < 0       Job still filling
++// jb->ctu_ts_last >= 0      Job ready
++static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
++{
++    const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
++    const unsigned int ctb_size = (1 << log2_ctb_size);
++    HEVCRpiJob * const jb = lc->jb0;
++    int more_data = 1;
++    unsigned int ctb_addr_ts = lc->ts;
++    unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++    unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size;
++    const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size;
++
++    lc->unit_done = 0;
++
++    while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)  // sps->ctb_size bounds the ts index - presumably the CTB count of the picture
++    {
++        int q_full;
++        const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
++
++        hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
++
++        ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
++
++        hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size);
++
++        s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
++        s->deblock[ctb_addr_rs].tc_offset   = s->sh.tc_offset;
++        s->filter_slice_edges[ctb_addr_rs]  = s->sh.slice_loop_filter_across_slices_enabled_flag;
++
++        // Zap stashes if up / left CTB not available
++        if ((lc->ctb_avail & AVAIL_U) == 0)
++            zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3);
++        if ((lc->ctb_avail & AVAIL_L) == 0)
++        {
++            memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE);
++            zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3);
++        }
++#if MVF_STASH_WIDTH > 64
++        // Restore left mvf stash at start of tile if not at start of line
++        if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
++        {
++            unsigned int i;
++            HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
++            const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++            for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++            {
++                *dst = *src++;
++                dst += MVF_STASH_WIDTH_PU;
++            }
++        }
++#endif
++
++        // Set initial tu states
++        lc->tu.cu_qp_delta = 0;
++        lc->tu.is_cu_qp_delta_wanted = 0;
++        lc->tu.cu_chroma_qp_offset_wanted = 0;
++
++        // Decode
++        more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0);
++
++        if (ff_hevc_rpi_cabac_overflow(lc))
++        {
++            av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
++            more_data = AVERROR_INVALIDDATA;
++        }
++
++        if (more_data < 0) {
++            s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN;  // Mark slice as broken
++            return more_data;
++        }
++
++        if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
++             (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
++        {
++            if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
++                ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
++            {
++                av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
++                return -1;  // NOTE(review): bare -1 rather than an AVERROR code, and the slice is not marked broken here
++            }
++        }
++
++        // --- Post CTB processing
++
++        // Stash rpl top/left for deblock that needs to remember such things cross-slice
++        s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
++        s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
++
++        if (!s->is_irap)
++        {
++            // Copy MVF up to up-left & stash to up
++            {
++                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
++                HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
++
++    //            printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
++
++                lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
++                memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
++            }
++            // Stash sideways if end of tile line but not end of line (no point)
++            // ** Could/should do this @ end of fn
++#if MVF_STASH_WIDTH > 64
++            if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
++#endif
++            {
++                unsigned int i;
++                const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
++                HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
++                for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
++                {
++                    *dst++ = *src;
++                    src += MVF_STASH_WIDTH_PU;
++                }
++            }
++        }
++
++        if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
++            ff_hevc_rpi_save_states(s, lc);
++
++        // Report progress so we can use our MVs in other frames
++        if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
++            ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
++
++        // End of line || End of tile line || End of tile
++        // (EoL covers end of frame for our purposes here)
++        q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
++
++        // Allocate QPU chunks on fixed size 64 pel boundaries rather than
++        // whatever ctb_size is today.
++        // * We might quite like to continue to 64 pel vertical too but that
++        //   currently confuses WPP
++        if (((x_ctb + ctb_size) & 63) == 0 || q_full)
++        {
++            int overflow = 0;
++            if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
++                overflow = 1;
++            if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
++                overflow = 1;
++            if (overflow)
++            {
++                // * This is very annoying (and slow) to cope with in WPP so
++                //   we treat it as an error there (no known stream triggers this
++                //   with the current buffer sizes).  Non-wpp should cope fine.
++                av_log(s->avctx, AV_LOG_WARNING,  "%s: Q full before EoL\n", __func__);
++                q_full = 1;
++            }
++        }
++
++        // Inc TS to next.
++        ctb_addr_ts++;
++        ctb_addr_rs++;
++        x_ctb += ctb_size;
++
++        if (q_full)
++        {
++            // Do job
++            // Prep for submission
++            jb->ctu_ts_last = ctb_addr_ts - 1;  // Was pre-inced
++            job_gen_bounds(s, jb);
++            break;
++        }
++
++        // If max_blocks started as 0 then this will never be true
++        if (--max_blocks == 0)
++            break;
++    }
++
++    lc->unit_done = (more_data <= 0);  // more_data < 0 has already returned above, so this means "no more data"
++    lc->ts = ctb_addr_ts;
++    return 0;
++}
++// Reset the bit-thread fields of local context n and create its input semaphore with a count of 0
++static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
++{
++    lc->context = s;
++    lc->jb0 = NULL;
++    lc->lc_n = n;
++    lc->bt_terminate = 0;
++    lc->bt_psem_out = NULL;  // Linked to the next context's bt_sem_in later
++    sem_init(&lc->bt_sem_in, 0, 0);
++}
++// Set to 1 to printf the WPP / tile line scheduling traces guarded by #if TRACE_WPP
++#define TRACE_WPP 0
++#if RPI_EXTRA_BIT_THREADS > 0
++static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
++{   // Returns the pps column_width[] entry for the tile column that CTB ts lies in
++    unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
++    return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];  // rs % ctb_width = CTB x position
++}
++
++// Move local context parameters from an aux bit thread back to the main
++// thread at the end of a slice as processing is going to continue there.
++static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
++{
++    if (src_lc == dst_lc) {
++        return;
++    }
++
++    // Move the job
++    // We will still have an active job if the final line terminates early
++    // Dest should always be null by now
++    av_assert1(dst_lc->jb0 == NULL);
++    dst_lc->jb0 = src_lc->jb0;
++    src_lc->jb0 = NULL;  // The job now belongs to dst_lc only
++
++    // Always need to store where we are in the bitstream
++    dst_lc->ts = src_lc->ts;
++    dst_lc->gb = src_lc->gb;
++    // Cabac init request will be built at start of next slice
++    // is_dep != 0: also copy the qp prediction, left intra modes, cabac state & stat_coeff
++    // Need to store context if we might have a dependent seg
++    if (is_dep)
++    {
++        dst_lc->qPy_pred = src_lc->qPy_pred;
++        memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
++        memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
++        memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
++    }
++}
++// Wait on this context's input semaphore; returns lc->bt_terminate as read after waking (callers stop on non-zero)
++static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
++{
++    rpi_sem_wait(&lc->bt_sem_in);
++    return lc->bt_terminate;
++}
++
++// Do one WPP line (or one tile if lc->bt_is_tile); returns 0, a decode error, or AVERROR_EXIT if terminated while waiting
++// Will not work correctly over horizontal tile boundaries - vertical should be OK
++static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
++{
++    const int is_tile = lc->bt_is_tile;
++    const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
++    const unsigned int line = lc->bt_line_no;
++    const unsigned int line_inc = lc->bt_line_inc;
++    const int is_last = (line >= lc->bt_last_line);
++    // ts_eol: ts at which this line/tile ends.  ts_next: where this context resumes (INT_MAX if past the last entry point)
++    const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
++    const unsigned int ts_next =
++        line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
++            INT_MAX :
++        is_tile ?
++            s->ps.pps->tile_pos_ts[tile_id + line_inc] :
++            lc->ts + lc->bt_line_width * line_inc;
++    // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
++    const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
++    unsigned int ts_prev;
++    int loop_n = 0;
++    int err = 0;
++
++    av_assert1(line <= s->sh.num_entry_point_offsets);
++
++#if TRACE_WPP
++    printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
++           lc->lc_n,  is_tile ? "Tile" : "WPP", tile_id,
++           line, lc->bt_last_line, s->sh.num_entry_point_offsets,
++           lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
++#endif
++    if (line != 0)  // Lines after the first restart the bit reader & cabac at their own entry point
++    {
++        const uint8_t * const data = s->data + s->sh.offset[line - 1];
++        const unsigned int len = s->sh.size[line - 1];
++        if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
++            return err;
++
++        ff_init_cabac_decoder(&lc->cc, data, len);
++    }
++
++    // We should never be processing a dependent slice here so reset is good
++    // ?? These probably shouldn't be needed (as they should be set by later
++    //    logic) but do seem to be required
++    lc->qp_y = s->sh.slice_qp;
++
++    do
++    {
++        if (!is_last && loop_n > 1) {  // From the 3rd chunk on, release the next context once per chunk
++#if TRACE_WPP
++            printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
++#endif
++            sem_post(lc->bt_psem_out);
++        }
++        // The wait for loop_n == 0 has been done in bit_thread
++        if (!is_first && loop_n != 0)
++        {
++#if TRACE_WPP
++            printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
++#endif
++            if (wait_bt_sem_in(lc) != 0)
++                return AVERROR_EXIT;  // bt_terminate was set
++        }
++
++#if TRACE_WPP
++        {
++            int n;
++            sem_getvalue(&lc->bt_sem_in, &n);
++            printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
++        }
++#endif
++
++        ts_prev = lc->ts;
++
++        // If we have had an error - do no further decode but do continue
++        // moving signals around so the other threads continue to operate
++        // correctly (or at least as correctly as they can with this line missing)
++        //
++        // Errors in WPP/Tile are less fatal than normal as we have a good idea
++        // of how to restart on the next line so there is no need to give up totally
++        if (err != 0)
++        {
++            lc->unit_done = 0;
++            lc->ts += partial_size;
++        }
++        else
++        {
++            worker_pass0_ready(s, lc);
++
++            if ((err = fill_job(s, lc, partial_size)) < 0 ||
++                (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
++            {
++                if (err == 0) {  // fill_job succeeded but stopped short of a full chunk before end of line
++                    av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
++                    err = AVERROR_INVALIDDATA;
++                }
++                worker_free(s, lc);
++                lc->ts = ts_prev + partial_size;  // Pretend we did all that
++                lc->unit_done = 0;
++            }
++            else if (is_tile)
++            {
++                worker_submit_job(s, lc);
++            }
++        }
++
++        ++loop_n;
++    } while (lc->ts < ts_eol && !lc->unit_done);
++
++    // If we are on the last line & we didn't get a whole line we must wait for
++    // and sink the sem_posts from the line above / tile to the left.
++    while ((ts_prev += partial_size) < ts_eol)
++    {
++#if TRACE_WPP
++        printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
++#endif
++        if (wait_bt_sem_in(lc) != 0)
++            return AVERROR_EXIT;
++    }
++
++    lc->bt_line_no += line_inc;
++    // WPP submits once per line here; tiles were submitted per chunk inside the loop
++    if (!is_tile && err == 0)
++        worker_submit_job(s, lc);
++
++    if (!is_last) {
++        lc->ts = ts_next;
++
++#if TRACE_WPP
++        printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++        sem_post(lc->bt_psem_out);
++        if (loop_n > 1) {
++#if TRACE_WPP
++            printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
++#endif
++            sem_post(lc->bt_psem_out);
++        }
++    }
++    else
++    {
++        movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag);  // * & not EoT
++#if MVF_STASH_WIDTH > 64
++        // Horrid calculations to work out what we want but luckily this should almost never execute
++        // **** Move to movlc
++        if (!s->is_irap)
++        {
++            const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
++            if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
++            {
++                const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
++                unsigned int i;
++                const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++                HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
++
++                for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
++                {
++                    *d_mvf = *s_mvf;
++                    d_mvf += MVF_STASH_WIDTH_PU;
++                    s_mvf += MVF_STASH_WIDTH_PU;
++                }
++
++            }
++        }
++#endif
++        // When all done poke the thread 0 sem_in one final time
++#if TRACE_WPP
++        printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
++#endif
++        sem_post(&s->HEVClcList[0]->bt_sem_in);
++    }
++
++#if TRACE_WPP
++    printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
++#endif
++    return err;
++}
++// WPP setup: give each of up to RPI_BIT_THREADS local contexts one successive line, starting at the slice segment address
++static void wpp_setup_lcs(HEVCRpiContext * const s)
++{
++    unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const unsigned int line_width = line_ts_width(s, ts);
++
++    for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)  // Lines = entry points + 1
++    {
++        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++        lc->ts = ts;
++        lc->bt_is_tile = 0;
++        lc->bt_line_no = i;
++        lc->bt_line_width = line_width;
++        lc->bt_last_line = s->sh.num_entry_point_offsets;
++        lc->bt_line_inc = RPI_BIT_THREADS;  // After its line each context skips on by this many lines
++        ts += line_width;
++    }
++}
++
++// Tile setup: point up to RPI_BIT_THREADS local contexts at successive tiles of row slice_row (counted from the slice's first tile row)
++// Can only process a single row of tiles at once
++static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
++{
++    const HEVCRpiPPS * const pps = s->ps.pps;
++    const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
++    const unsigned int tile0 = pps->tile_id[ts0];
++    const unsigned int col0 = tile0 % pps->num_tile_columns;  // Tile column the slice starts in
++
++    const unsigned int col = (slice_row == 0) ? col0 : 0;  // Only the first row can start part way along
++    unsigned int line = slice_row * pps->num_tile_columns - col0 + col;  // Index relative to the first tile of the slice
++    const unsigned int last_line = FFMIN(
++        line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
++
++    const unsigned int par =
++        FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
++#if TRACE_WPP
++    printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
++           pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
++#endif
++    for (unsigned int i = 0; i != par; ++i, ++line)
++    {
++        HEVCRpiLocalContext * const lc = s->HEVClcList[i];
++        const unsigned int tile = tile0 + line;
++
++        lc->ts = pps->tile_pos_ts[tile];
++        lc->bt_line_no = line;
++        lc->bt_is_tile = 1;
++        lc->bt_line_width = line_ts_width(s, lc->ts);
++        lc->bt_last_line = last_line;
++        lc->bt_line_inc = par;  // Each context then steps on by the number of contexts in use
++    }
++}
++
++
++static void * bit_thread(void * v)
++{
++    HEVCRpiLocalContext * const lc = v;
++    HEVCRpiContext *const s = lc->context;
++    int rv;
++
++    // Keep decoding lines for as long as wait_bt_sem_in() reports success (0)
++    while (wait_bt_sem_in(lc) == 0)
++    {
++        if ((rv = rpi_run_one_line(s, lc, 0)) >= 0)  // Never first tile/wpp
++            continue;
++        if (!lc->bt_terminate) {
++            av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, rv);
++            continue;
++        }
++        av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
++        break;
++    }
++    return NULL;
++}
++
++static int bit_threads_start(HEVCRpiContext * const s)
++{   // Create the extra bit threads once; returns 0 on success (or if already started), -1 on failure
++    if (s->bt_started)
++        return 0;
++
++    for (int i = 1; i < RPI_BIT_THREADS; ++i)
++    {
++        // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS-1]
++        if (s->HEVClcList[i] == NULL) {
++            if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
++                return -1;
++        }
++
++        bt_lc_init(s, s->HEVClcList[i], i);
++        job_lc_init(s->HEVClcList[i]);
++    }
++
++    // Link the sems in a circle: each lc posts to the next one, the last posts back to lc[0]
++    for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
++        s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
++    s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
++
++    // All lc are initialised by now, so it is safe to start the threads
++    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++    {   // pthread_create() returns 0 on success and a positive error number on failure (never -1)
++        if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) != 0)
++            return -1;
++    }
++
++    s->bt_started = 1;
++    return 0;
++}
++
++static int bit_threads_kill(HEVCRpiContext * const s)
++{   // Stop and join every extra bit thread; does nothing if they were never started. Always returns 0
++    if (!s->bt_started)
++        return 0;
++    s->bt_started = 0;
++    // Thread i uses local context i + 1 (lc[0] is not handled here)
++    for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
++    {
++        HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
++        if (lc == NULL)
++            break;
++        // Set the terminate flag first, then post the sem so the thread can observe it, then join
++        lc->bt_terminate = 1;
++        sem_post(&lc->bt_sem_in);
++        pthread_join(s->bit_threads[i], NULL);
++        // Thread has exited: its sem and job state can now be torn down
++        sem_destroy(&lc->bt_sem_in);
++        job_lc_kill(lc);
++    }
++    return 0;
++}
++#endif
++
++
++// If we are at end-of-tile (EoT) and the row is shorter than the number of
++// jobs we can queue, we have to wait for it to finish; otherwise we risk
++// cache/QPU disasters
++static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
++{
++    const HEVCRpiPPS * const pps = s->ps.pps;
++    if (pps->tile_wpp_inter_disable < 2 || s->sh.slice_type == HEVC_SLICE_I || n < 0)
++        return 0;
++    // True only when EOT is set and EOL is clear for ts address n
++    return (pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
++}
++
++static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
++{   // Decode the current slice via tile offload, WPP offload, or a single bit thread
++    HEVCRpiContext * const s  = avctxt->priv_data;
++    HEVCRpiLocalContext * const lc = s->HEVClc;
++    int err;
++    // isFilterThread is not referenced below. Returns lc->ts on success, else the error from slice_start()/fill_job()
++    // Start of slice
++    if ((err = slice_start(s, lc)) != 0)
++        return err;
++
++#if RPI_EXTRA_BIT_THREADS > 0
++
++    if (s->sh.offload_tiles)
++    {
++        unsigned int slice_row = 0;
++
++#if TRACE_WPP
++        printf("%s: Do Tiles\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        bit_threads_start(s);  // NOTE(review): return value is not checked here
++
++        do
++        {
++            // Reset lc lines etc.
++            tile_one_row_setup_lcs(s, slice_row);
++
++#if TRACE_WPP
++            printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++
++            rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++            printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
++                   __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
++#endif
++            // Main thread takes its share of the remaining lines; each is gated on our input sem
++            while (lc->bt_line_no <= lc->bt_last_line) {
++                rpi_sem_wait(&lc->bt_sem_in);
++                rpi_run_one_line(s, lc, 0);  // result ignored in this path
++            }
++#if TRACE_WPP
++            printf("%s: Done body\n", __func__);
++#endif
++
++            // Wait for everything else to finish
++            rpi_sem_wait(&lc->bt_sem_in);
++
++            ++slice_row;
++        } while (lc->bt_last_line < s->sh.num_entry_point_offsets);  // more tile rows remain in this slice
++
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else if (s->sh.offload_wpp)
++    {
++#if TRACE_WPP
++        printf("%s: Do WPP\n", __func__);
++#endif
++        // Generate & start extra bit threads if they aren't already running
++        bit_threads_start(s);  // NOTE(review): return value is not checked here
++
++        // Reset lc lines etc.
++        wpp_setup_lcs(s);
++
++        rpi_run_one_line(s, lc, 1);  // Kicks off the other threads
++#if TRACE_WPP
++        printf("%s: Done 1st\n", __func__);
++#endif
++        // Main thread takes its share of the remaining lines; each is gated on our input sem
++        while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
++            rpi_sem_wait(&lc->bt_sem_in);
++            rpi_run_one_line(s, lc, 0);  // result ignored in this path
++        }
++#if TRACE_WPP
++        printf("%s: Done body\n", __func__);
++#endif
++
++        // Wait for everything else to finish
++        rpi_sem_wait(&lc->bt_sem_in);
++
++#if TRACE_WPP
++        printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++    else
++#endif
++    {
++#if TRACE_WPP
++        printf("%s: Single start: ts=%d\n", __func__, lc->ts);
++#endif
++        // Single bit thread
++        do {
++            // Make sure we have space to prepare the next job
++            worker_pass0_ready(s, lc);
++
++            if ((err = fill_job(s, lc, 0)) < 0)
++                goto fail;
++
++            worker_submit_job(s, lc);
++            // lc->ts - 1 is passed as the position to test (tile_needs_wait rejects negative values)
++            if (tile_needs_wait(s, lc->ts - 1))
++                worker_wait(s, lc);
++
++        } while (!lc->unit_done);
++
++#if TRACE_WPP
++        printf("%s: Single end: ts=%d\n", __func__, lc->ts);
++#endif
++    }
++
++    // If we have reached the end of the frame
++    // then wait for the worker to finish all its jobs
++    if (lc->ts >= s->ps.sps->ctb_size)
++        worker_wait(s, lc);
++
++#if RPI_TSTATS
++    {
++        HEVCRpiStats *const ts = &s->tstats;
++
++        printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n    B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
++               ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
++               ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
++               ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
++               ts->y_pred2_hgt16, ts->y_pred2_hle16);
++        memset(ts, 0, sizeof(*ts));  // stats are reset after every print
++    }
++#endif
++
++    return lc->ts;
++
++fail:
++    // Cleanup
++    av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
++    // Free our job & wait for termination
++    worker_free(s, lc);
++    worker_wait(s, lc);
++    return err;
++}
++
++
++static void set_no_backward_pred(HEVCRpiContext * const s)
++{
++    const RefPicList *const lists = s->refPicList;
++    int flag = 0;
++
++    // Only B slices with temporal MVP enabled can end up with the flag set
++    if (s->sh.slice_type == HEVC_SLICE_B && s->sh.slice_temporal_mvp_enabled_flag) {
++        for (int l = 0; l < 2 && !flag; l++) {
++            for (int r = 0; r < lists[l].nb_refs; r++) {
++                if (lists[l].list[r] > s->poc) {  // first list entry greater than s->poc decides it
++                    flag = 1;
++                    break;
++                }
++            }
++        }
++    }
++    s->no_backward_pred_flag = flag;
++}
++
++static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
++{
++    // Entry points are generated first; a negative result is passed straight back
++    const int rv = gen_entry_points(s, nal);
++    if (rv < 0)
++        return rv;
++
++    set_no_backward_pred(s);
++    return rpi_decode_entry(s->avctx, NULL);
++}
++
++static int set_side_data(HEVCRpiContext *s)
++{   // Attach SEI-derived side data to s->ref->frame; returns 0, or AVERROR(ENOMEM) if an allocation fails
++    AVFrame *out = s->ref->frame;
++    // Stereo 3D: only arrangement types 3..5 with content interpretation 1 or 2 are exported
++    if (s->sei.frame_packing.present &&
++        s->sei.frame_packing.arrangement_type >= 3 &&
++        s->sei.frame_packing.arrangement_type <= 5 &&
++        s->sei.frame_packing.content_interpretation_type > 0 &&
++        s->sei.frame_packing.content_interpretation_type < 3) {
++        AVStereo3D *stereo = av_stereo3d_create_side_data(out);
++        if (!stereo)
++            return AVERROR(ENOMEM);
++
++        switch (s->sei.frame_packing.arrangement_type) {
++        case 3:
++            if (s->sei.frame_packing.quincunx_subsampling)
++                stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
++            else
++                stereo->type = AV_STEREO3D_SIDEBYSIDE;
++            break;
++        case 4:
++            stereo->type = AV_STEREO3D_TOPBOTTOM;
++            break;
++        case 5:
++            stereo->type = AV_STEREO3D_FRAMESEQUENCE;
++            break;
++        }
++
++        if (s->sei.frame_packing.content_interpretation_type == 2)
++            stereo->flags = AV_STEREO3D_FLAG_INVERT;
++
++        if (s->sei.frame_packing.arrangement_type == 5) {
++            if (s->sei.frame_packing.current_frame_is_frame0_flag)
++                stereo->view = AV_STEREO3D_VIEW_LEFT;
++            else
++                stereo->view = AV_STEREO3D_VIEW_RIGHT;
++        }
++    }
++    // Display orientation: exported as a 3x3 int32 display matrix when any rotation or flip is signalled
++    if (s->sei.display_orientation.present &&
++        (s->sei.display_orientation.anticlockwise_rotation ||
++         s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
++        double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
++        AVFrameSideData *rotation = av_frame_new_side_data(out,
++                                                           AV_FRAME_DATA_DISPLAYMATRIX,
++                                                           sizeof(int32_t) * 9);
++        if (!rotation)
++            return AVERROR(ENOMEM);
++
++        av_display_rotation_set((int32_t *)rotation->data, angle);
++        av_display_matrix_flip((int32_t *)rotation->data,
++                               s->sei.display_orientation.hflip,
++                               s->sei.display_orientation.vflip);
++    }
++
++    // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
++    // so the side data persists for the entire coded video sequence.
++    if (s->sei.mastering_display.present > 0 &&
++        IS_IRAP(s) && s->no_rasl_output_flag) {
++        s->sei.mastering_display.present--;
++    }
++    if (s->sei.mastering_display.present) {
++        // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
++        const int mapping[3] = {2, 0, 1};
++        const int chroma_den = 50000;  // denominator applied to primaries and white point
++        const int luma_den = 10000;    // denominator applied to min/max luminance
++        int i;
++        AVMasteringDisplayMetadata *metadata =
++            av_mastering_display_metadata_create_side_data(out);
++        if (!metadata)
++            return AVERROR(ENOMEM);
++
++        for (i = 0; i < 3; i++) {
++            const int j = mapping[i];
++            metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
++            metadata->display_primaries[i][0].den = chroma_den;
++            metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
++            metadata->display_primaries[i][1].den = chroma_den;
++        }
++        metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
++        metadata->white_point[0].den = chroma_den;
++        metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
++        metadata->white_point[1].den = chroma_den;
++
++        metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
++        metadata->max_luminance.den = luma_den;
++        metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
++        metadata->min_luminance.den = luma_den;
++        metadata->has_luminance = 1;
++        metadata->has_primaries = 1;
++
++        av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
++        av_log(s->avctx, AV_LOG_DEBUG,
++               "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
++               av_q2d(metadata->display_primaries[0][0]),
++               av_q2d(metadata->display_primaries[0][1]),
++               av_q2d(metadata->display_primaries[1][0]),
++               av_q2d(metadata->display_primaries[1][1]),
++               av_q2d(metadata->display_primaries[2][0]),
++               av_q2d(metadata->display_primaries[2][1]),
++               av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
++        av_log(s->avctx, AV_LOG_DEBUG,
++               "min_luminance=%f, max_luminance=%f\n",
++               av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
++    }
++    // Decrement the content light flag when IRAP frame has no_rasl_output_flag=1
++    // so the side data persists for the entire coded video sequence.
++    if (s->sei.content_light.present > 0 &&
++        IS_IRAP(s) && s->no_rasl_output_flag) {
++        s->sei.content_light.present--;
++    }
++    if (s->sei.content_light.present) {
++        AVContentLightMetadata *metadata =
++            av_content_light_metadata_create_side_data(out);
++        if (!metadata)
++            return AVERROR(ENOMEM);
++        metadata->MaxCLL  = s->sei.content_light.max_content_light_level;
++        metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
++
++        av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
++        av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
++               metadata->MaxCLL, metadata->MaxFALL);
++    }
++    // A53 captions: the SEI buffer is always freed here, even if the side data allocation failed
++    if (s->sei.a53_caption.a53_caption) {
++        AVFrameSideData* sd = av_frame_new_side_data(out,
++                                                     AV_FRAME_DATA_A53_CC,
++                                                     s->sei.a53_caption.a53_caption_size);
++        if (sd)
++            memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
++        av_freep(&s->sei.a53_caption.a53_caption);
++        s->sei.a53_caption.a53_caption_size = 0;
++        s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
++    }
++    // Alternative transfer: applied only if the value has a known name and is not UNSPECIFIED
++    if (s->sei.alternative_transfer.present &&
++        av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
++        s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
++        s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
++    }
++
++    return 0;
++}
++
++static int hevc_frame_start(HEVCRpiContext * const s)
++{   // Per-frame setup before the first slice is decoded; returns 0 or a negative error (s->ref is then dropped)
++    int ret;
++
++    memset(s->bs_horizontal, 0, s->bs_size * 2);  // Does V too
++    memset(s->is_pcm,        0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
++    memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
++
++    // Only need to remember intra for CIP
++    if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
++        s->is_intra = NULL;
++    else
++    {
++        s->is_intra = s->is_intra_store;
++        memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
++    }
++
++    s->is_decoded        = 0;
++    s->first_nal_type    = s->nal_unit_type;
++
++    s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
++    // Grow rpl_tab so that it has one entry per NAL unit in the packet
++    if (s->pkt.nb_nals > s->rpl_tab_size)
++    {
++        // Free & malloc rather than realloc: avoids an (unwanted) copy
++        av_freep(&s->rpl_tab);
++        s->rpl_tab_size = 0;
++        ret = AVERROR(ENOMEM);  // the fail path returns ret, so it must be set before the jump
++        if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL)
++            goto fail;
++        s->rpl_tab_size = s->pkt.nb_nals;
++    }
++    memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
++
++    ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
++    if (ret < 0)
++        goto fail;
++
++    // Construct the frame RPS
++    ret = ff_hevc_rpi_frame_rps(s);
++    if (ret < 0) {
++        av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
++        goto fail;
++    }
++
++    s->ref->frame->key_frame = IS_IRAP(s);
++
++    ret = set_side_data(s);
++    if (ret < 0)
++        goto fail;
++
++    s->frame->pict_type = 3 - s->sh.slice_type;
++
++    if (!IS_IRAP(s))
++        ff_hevc_rpi_bump_frame(s);
++    // Clear any stale output frame before asking for the next one
++    av_frame_unref(s->output_frame);
++    ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
++    if (ret < 0)
++        goto fail;
++
++    ff_thread_finish_setup(s->avctx);
++
++    return 0;
++
++fail:
++    if (s->ref)
++        ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++    s->ref = NULL;
++    return ret;
++}
++
++static inline int is_non_ref_unit_type(const unsigned int nal_unit_type)
++{
++    // Table 7-1: true for 0, 2, 4, ... 14, i.e. no bits set outside bits 1..3
++    return !(nal_unit_type & ~0xe);
++}
++
++static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
++{   // Decode one NAL unit: parameter sets and SEI are parsed, VCL NAL types are decoded as slices
++    GetBitContext * const gb    = &s->HEVClc->gb;
++    int ctb_addr_ts, ret;
++    // Errors routed via `fail` are only returned when AV_EF_EXPLODE is set; direct returns are unconditional
++    *gb              = nal->gb;
++    s->nal_unit_type = nal->type;
++    s->temporal_id   = nal->temporal_id;
++
++    switch (s->nal_unit_type) {
++    case HEVC_NAL_VPS:
++        ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_SPS:
++        ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
++                                     s->apply_defdispwin);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_PPS:
++        ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_SEI_PREFIX:
++    case HEVC_NAL_SEI_SUFFIX:
++        ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
++        if (ret < 0)
++            goto fail;
++        break;
++    case HEVC_NAL_TRAIL_R:
++    case HEVC_NAL_TRAIL_N:
++    case HEVC_NAL_TSA_N:
++    case HEVC_NAL_TSA_R:
++    case HEVC_NAL_STSA_N:
++    case HEVC_NAL_STSA_R:
++    case HEVC_NAL_BLA_W_LP:
++    case HEVC_NAL_BLA_W_RADL:
++    case HEVC_NAL_BLA_N_LP:
++    case HEVC_NAL_IDR_W_RADL:
++    case HEVC_NAL_IDR_N_LP:
++    case HEVC_NAL_CRA_NUT:
++    case HEVC_NAL_RADL_N:
++    case HEVC_NAL_RADL_R:
++    case HEVC_NAL_RASL_N:
++    case HEVC_NAL_RASL_R:
++        ret = hls_slice_header(s);
++        if (ret < 0)
++            return ret;  // returned regardless of AV_EF_EXPLODE
++
++        // The definition of _N unit types is "non-reference for other frames
++        // with the same temporal_id" so they may/will be ref frames for pics
++        // with a higher temporal_id.
++        s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
++            !is_non_ref_unit_type(s->nal_unit_type);
++        s->offload_recon = s->threads_type != 0 && s->used_for_ref;
++        s->is_irap = IS_IRAP(s);
++
++#if DEBUG_DECODE_N
++        {
++            static int z = 0;
++            if (IS_IDR(s)) {
++                z = 1;
++            }
++            if (z != 0 && z++ > DEBUG_DECODE_N) {
++                s->is_decoded = 0;
++                break;
++            }
++        }
++#endif
++        if (  // honour the caller's skip_frame setting: such slices are dropped without being decoded
++            (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
++            (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
++            (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
++            (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IRAP(s)))
++        {
++            s->is_decoded = 0;
++            break;
++        }
++
++        if (s->sh.first_slice_in_pic_flag) {
++            if (s->max_ra == INT_MAX) {  // INT_MAX is also the value set at init and after EOS/EOB
++                if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
++                    s->max_ra = s->poc;
++                } else {
++                    if (IS_IDR(s))
++                        s->max_ra = INT_MIN;
++                }
++            }
++            // RASL pictures at or before max_ra are dropped without being decoded
++            if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
++                s->poc <= s->max_ra) {
++                s->is_decoded = 0;
++                break;
++            } else {
++                if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
++                    s->max_ra = INT_MIN;
++            }
++
++            ret = hevc_frame_start(s);
++            if (ret < 0)
++                return ret;  // returned regardless of AV_EF_EXPLODE
++        } else if (!s->ref) {
++            av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
++            goto fail;  // NOTE(review): ret still holds the non-negative hls_slice_header() result here
++        }
++
++        if (s->nal_unit_type != s->first_nal_type) {
++            av_log(s->avctx, AV_LOG_ERROR,
++                   "Non-matching NAL types of the VCL NALUs: %d %d\n",
++                   s->first_nal_type, s->nal_unit_type);
++            return AVERROR_INVALIDDATA;
++        }
++        // Reference picture lists are built only for non-dependent, non-I slice segments
++        if (!s->sh.dependent_slice_segment_flag &&
++            s->sh.slice_type != HEVC_SLICE_I) {
++            ret = ff_hevc_rpi_slice_rpl(s);
++            if (ret < 0) {
++                av_log(s->avctx, AV_LOG_WARNING,
++                       "Error constructing the reference lists for the current slice.\n");
++                goto fail;
++            }
++        }
++
++        ctb_addr_ts = hls_slice_data(s, nal);
++        if (ctb_addr_ts >= s->ps.sps->ctb_size) {  // reached the end of the picture
++            s->is_decoded = 1;
++        }
++
++        if (ctb_addr_ts < 0) {
++            ret = ctb_addr_ts;
++            goto fail;
++        }
++        break;
++    case HEVC_NAL_EOS_NUT:
++    case HEVC_NAL_EOB_NUT:
++        s->seq_decode = (s->seq_decode + 1) & 0xff;  // sequence counter wraps at 256
++        s->max_ra     = INT_MAX;
++        break;
++    case HEVC_NAL_AUD:
++    case HEVC_NAL_FD_NUT:
++        break;
++    default:
++        av_log(s->avctx, AV_LOG_INFO,
++               "Skipping NAL unit %d\n", s->nal_unit_type);
++    }
++
++    return 0;
++fail:  // errors are swallowed (0 returned) unless the caller asked for AV_EF_EXPLODE
++    if (s->avctx->err_recognition & AV_EF_EXPLODE)
++        return ret;
++    return 0;
++}
++
++static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
++{   // Split one packet into NAL units and decode them in order; returns the last ret (negative on error)
++    int i, ret = 0;
++    int eos_at_start = 1;
++
++    s->ref = NULL;
++    s->last_eos = s->eos;  // carry an EOS seen in the previous packet over to this one
++    s->eos = 0;
++
++    /* split the input packet into NAL units, so we know the upper bound on the
++     * number of slices in the frame */
++    ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
++                                s->nal_length_size, s->avctx->codec_id, 0, 0);
++    if (ret < 0) {
++        av_log(s->avctx, AV_LOG_ERROR,
++               "Error splitting the input into NAL units.\n");
++        return ret;
++    }
++    /* EOS/EOB before any other NAL sets last_eos; EOS/EOB after one sets eos */
++    for (i = 0; i < s->pkt.nb_nals; i++) {
++        if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
++            s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
++            if (eos_at_start) {
++                s->last_eos = 1;
++            } else {
++                s->eos = 1;
++            }
++        } else {
++            eos_at_start = 0;
++        }
++    }
++
++    /* decode the NAL units */
++    for (i = 0; i < s->pkt.nb_nals; i++) {
++        ret = decode_nal_unit(s, &s->pkt.nals[i]);
++        if (ret < 0) {
++            av_log(s->avctx, AV_LOG_WARNING,
++                   "Error parsing NAL unit #%d.\n", i);
++            goto fail;
++        }
++    }
++
++fail:  // Also success path
++    if (s->ref != NULL) {
++        if (s->used_for_ref && s->threads_type != 0) {
++            ff_hevc_rpi_progress_signal_all_done(s);
++        }
++        else {
++            // Flush frame to real memory as we expect to be able to pass
++            // it straight on to mmal
++            flush_frame(s, s->frame);
++        }
++    }
++    return ret;
++}
++
++static void print_md5(void *log_ctx, int level, uint8_t md5[16])
++{
++    // Log the 16 digest bytes as consecutive two-digit hex values
++    for (const uint8_t *p = md5; p != md5 + 16; ++p)
++        av_log(log_ctx, level, "%02"PRIx8, *p);
++}
++
++static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
++{   // Compare each plane's MD5 with the SEI picture hash; 0 if all match, AVERROR_INVALIDDATA on a mismatch
++    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++    int pixel_shift;
++    int i, j;
++
++    if (!desc)
++        return AVERROR(EINVAL);
++
++    pixel_shift = desc->comp[0].depth > 8;  // 1 when component 0 is deeper than 8 bits, else 0
++
++    av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
++           s->poc);
++
++    /* the checksums are LE, so we have to byteswap for >8bpp formats
++     * on BE arches */
++#if HAVE_BIGENDIAN
++    if (pixel_shift && !s->checksum_buf) {
++        av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
++                       FFMAX3(frame->linesize[0], frame->linesize[1],
++                              frame->linesize[2]));
++        if (!s->checksum_buf)
++            return AVERROR(ENOMEM);
++    }
++#endif
++    /* one digest per plane; the loop ends at the first NULL data pointer */
++    for (i = 0; frame->data[i]; i++) {
++        int width  = s->avctx->coded_width;
++        int height = s->avctx->coded_height;
++        int w = (i == 1 || i == 2) ? (width  >> desc->log2_chroma_w) : width;   /* planes 1 and 2 use chroma dimensions */
++        int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
++        uint8_t md5[16];
++
++        av_md5_init(s->md5_ctx);
++        for (j = 0; j < h; j++) {
++            const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);  /* same stride call for every plane */
++#if HAVE_BIGENDIAN
++            if (pixel_shift) {
++                s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
++                                    (const uint16_t *) src, w);
++                src = s->checksum_buf;
++            }
++#endif
++            av_md5_update(s->md5_ctx, src, w << pixel_shift);  /* w samples of 1 or 2 bytes each */
++        }
++        av_md5_final(s->md5_ctx, md5);
++
++        if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
++            av_log   (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
++            print_md5(s->avctx, AV_LOG_DEBUG, md5);
++            av_log   (s->avctx, AV_LOG_DEBUG, "; ");
++        } else {
++            av_log   (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
++            print_md5(s->avctx, AV_LOG_ERROR, md5);
++            av_log   (s->avctx, AV_LOG_ERROR, " != ");
++            print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
++            av_log   (s->avctx, AV_LOG_ERROR, "\n");
++            return AVERROR_INVALIDDATA;  /* stops at the first mismatching plane */
++        }
++    }
++
++    av_log(s->avctx, AV_LOG_DEBUG, "\n");
++
++    return 0;
++}
++
++static int all_sps_supported(const HEVCRpiContext * const s)
++{
++    unsigned int idx;
++    // Empty list slots are skipped; any unsupported SPS makes the answer 0
++    for (idx = 0; idx != FF_ARRAY_ELEMS(s->ps.sps_list); ++idx) {
++        if (s->ps.sps_list[idx] == NULL)
++            continue;
++        if (!is_sps_supported((const HEVCRpiSPS*)s->ps.sps_list[idx]->data))
++            return 0;
++    }
++    return 1;
++}
++
++static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
++{
++    int idx;
++    const int rv = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
++                                   &s->nal_length_size, s->avctx->err_recognition,
++                                   s->apply_defdispwin, s->avctx);
++    if (rv < 0)
++        return rv;
++    if (!first)
++        return 0;
++
++    /* export stream parameters from the first SPS present in the list */
++    for (idx = 0; idx < FF_ARRAY_ELEMS(s->ps.sps_list); idx++) {
++        if (s->ps.sps_list[idx] == NULL)
++            continue;
++        export_stream_params(s->avctx, &s->ps, (const HEVCRpiSPS*)s->ps.sps_list[idx]->data);
++        break;
++    }
++
++    return 0;
++}
++
++static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
++                             AVPacket *avpkt)
++{   // Decode one packet; returns avpkt->size (0 for an empty flush packet) or a negative error
++    int ret;
++    int new_extradata_size;
++    uint8_t *new_extradata;
++    HEVCRpiContext *s = avctx->priv_data;
++    // An empty packet only asks for a pending frame; the output call's result becomes *got_output
++    if (!avpkt->size) {
++        ret = ff_hevc_rpi_output_frame(s, data, 1);
++        if (ret < 0)
++            return ret;
++
++        *got_output = ret;
++        return 0;
++    }
++    // Apply any new extradata carried as packet side data (first == 0: stream params are not re-exported)
++    new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
++                                            &new_extradata_size);
++    if (new_extradata && new_extradata_size > 0) {
++        ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
++        if (ret < 0)
++            return ret;
++    }
++
++    s->ref = NULL;
++    ret    = decode_nal_units(s, avpkt->data, avpkt->size);
++    if (ret < 0)
++        return ret;
++
++    /* verify the SEI checksum */
++    if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
++        s->sei.picture_hash.is_md5) {
++        ret = verify_md5(s, s->ref->frame);  // NOTE(review): assumes s->ref is non-NULL whenever is_decoded is set
++        if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
++            ff_hevc_rpi_unref_frame(s, s->ref, ~0);
++            return ret;
++        }
++    }
++    s->sei.picture_hash.is_md5 = 0;  // the hash is consumed whether or not it was checked
++
++    if (s->is_decoded) {
++        av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
++        s->is_decoded = 0;
++    }
++    // Hand over the output frame, if one has a buffer attached
++    if (s->output_frame->buf[0]) {
++        av_frame_move_ref(data, s->output_frame);
++        *got_output = 1;
++    }
++
++    return avpkt->size;
++}
++
++static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
++{
++    // Take a new reference on the underlying thread frame first
++    const int rv = ff_thread_ref_frame(&dst->tf, &src->tf);
++
++    if (rv < 0)
++        return rv;
++
++    // col_mvf_buf is optional - only take a reference if src has one
++    if (src->col_mvf_buf != NULL &&
++        (dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf)) == NULL)
++    {
++        // Drop what has already been taken before reporting the failure
++        ff_hevc_rpi_unref_frame(s, dst, ~0);
++        return AVERROR(ENOMEM);
++    }
++
++    // Plain copies of the remaining per-frame fields
++    dst->sequence   = src->sequence;
++    dst->flags      = src->flags;
++    dst->poc        = src->poc;
++    dst->col_mvf    = src->col_mvf;
++
++    return 0;
++}
++
++
++static av_cold int hevc_decode_free(AVCodecContext *avctx)
++{   // Release everything owned by the decoder context; also used as the failure path of hevc_init_context(). Returns 0
++    HEVCRpiContext * const s = avctx->priv_data;
++    int i;
++
++    pic_arrays_free(s);
++
++    av_freep(&s->md5_ctx);
++
++    av_freep(&s->cabac_save);
++
++#if RPI_EXTRA_BIT_THREADS
++    bit_threads_kill(s);  // joins the extra bit threads before their contexts are freed below
++#endif
++
++    hevc_exit_worker(s);
++    for (i = 0; i != 2; ++i) {
++        ff_hevc_rpi_progress_kill_state(s->progress_states + i);
++    }
++    job_lc_kill(s->HEVClc);  // lc[0] only; bit_threads_kill() does this for the other contexts
++
++    av_freep(&s->sao_pixel_buffer_h[0]);  // [1] & [2] allocated with [0]
++    av_freep(&s->sao_pixel_buffer_v[0]);
++    av_frame_free(&s->output_frame);
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++        av_frame_free(&s->DPB[i].frame);
++    }
++    // Drop every stored parameter set, then clear the active pointers
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
++        av_buffer_unref(&s->ps.vps_list[i]);
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
++        av_buffer_unref(&s->ps.sps_list[i]);
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
++        av_buffer_unref(&s->ps.pps_list[i]);
++    s->ps.sps = NULL;
++    s->ps.pps = NULL;
++    s->ps.vps = NULL;
++
++    // Free separately from sLists as used that way by RPI WPP
++    for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
++        av_freep(s->HEVClcList + i);  // the loop stops at the first NULL entry
++    }
++    s->HEVClc = NULL;  // Allocated as part of HEVClcList
++
++    ff_h2645_packet_uninit(&s->pkt);
++    // Only terminate the VPU/QPU if its init succeeded
++    if (s->qpu_init_ok)
++        vpu_qpu_term();
++    s->qpu_init_ok = 0;
++
++    return 0;
++}
++
++
++static av_cold int hevc_init_context(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    int n;
++    // Set up the per-context state; every failure unwinds via hevc_decode_free()
++    s->avctx = avctx;
++
++    // The first local context is also entry 0 of the local-context list
++    if ((s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext))) == NULL)
++        goto fail;
++    s->HEVClcList[0] = s->HEVClc;
++
++    if (vpu_qpu_init() != 0)
++        goto fail;
++    s->qpu_init_ok = 1;  // lets hevc_decode_free() know vpu_qpu_term() is owed
++
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++    {
++        static const uint32_t emu_dummy[1] = { 0x80808080 };
++        s->qpu_dummy_frame_emu = (const uint8_t *)emu_dummy;
++    }
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++    s->qpu_dummy_frame_qpu = qpu_dummy();
++#endif
++
++    bt_lc_init(s, s->HEVClc, 0);
++    job_lc_init(s->HEVClc);
++
++    for (n = 0; n < 2; n++) {
++        ff_hevc_rpi_progress_init_state(&s->progress_states[n]);
++    }
++
++    if (!(s->cabac_save = av_malloc(sizeof(*s->cabac_save))))
++        goto fail;
++
++    if (!(s->output_frame = av_frame_alloc()))
++        goto fail;
++
++    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
++        HEVCRpiFrame * const f = s->DPB + n;
++        if ((f->frame = av_frame_alloc()) == NULL)
++            goto fail;
++        f->tf.f = f->frame;
++        f->dpb_no = n;
++    }
++
++    s->max_ra = INT_MAX;
++
++    if (!(s->md5_ctx = av_md5_alloc()))
++        goto fail;
++
++    s->context_initialized = 1;
++    s->eos = 0;
++
++    ff_hevc_rpi_reset_sei(&s->sei);
++
++    return 0;
++
++fail:
++    av_log(avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
++    hevc_decode_free(avctx);
++    return AVERROR(ENOMEM);
++}
++
++#if HAVE_THREADS
++static int hevc_update_thread_context(AVCodecContext *dst,
++                                      const AVCodecContext *src)
++{
++    HEVCRpiContext *s  = dst->priv_data;
++    HEVCRpiContext *s0 = src->priv_data;
++    int i, ret;
++    // Frame-thread sync: copy decode state from src (s0) into dst (s); returns 0 or a negative AVERROR
++    av_assert0(s->context_initialized);
++
++    // dst == src can happen according to the comments and in that case
++    // there is nothing to do here
++    if (dst == src)
++        return 0;
++    // Replace every DPB entry with a new reference to the source's entry (if it holds a buffer)
++    for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
++        ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
++        if (s0->DPB[i].frame->buf[0]) {
++            ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
++            if (ret < 0)
++                return ret;
++        }
++    }
++    // Forget an active SPS that differs from the source's; set_sps() below re-activates it
++    if (s->ps.sps != s0->ps.sps)
++        s->ps.sps = NULL;
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
++        av_buffer_unref(&s->ps.vps_list[i]);
++        if (s0->ps.vps_list[i]) {
++            s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
++            if (!s->ps.vps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
++        av_buffer_unref(&s->ps.sps_list[i]);
++        if (s0->ps.sps_list[i]) {
++            s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
++            if (!s->ps.sps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
++        av_buffer_unref(&s->ps.pps_list[i]);
++        if (s0->ps.pps_list[i]) {
++            s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
++            if (!s->ps.pps_list[i])
++                return AVERROR(ENOMEM);
++        }
++    }
++
++    if (s->ps.sps != s0->ps.sps)
++        if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
++            return ret;
++
++    s->seq_decode = s0->seq_decode;
++    s->seq_output = s0->seq_output;
++    s->pocTid0    = s0->pocTid0;
++    s->max_ra     = s0->max_ra;
++    s->eos        = s0->eos;
++    s->no_rasl_output_flag = s0->no_rasl_output_flag;
++
++    s->is_nalff        = s0->is_nalff;
++    s->nal_length_size = s0->nal_length_size;
++
++    s->threads_type        = s0->threads_type;
++
++    if (s0->eos) {
++        s->seq_decode = (s->seq_decode + 1) & 0xff;
++        s->max_ra = INT_MAX;
++    }
++
++    s->sei.frame_packing        = s0->sei.frame_packing;
++    s->sei.display_orientation  = s0->sei.display_orientation;
++    s->sei.mastering_display    = s0->sei.mastering_display;
++    s->sei.content_light        = s0->sei.content_light;
++    s->sei.alternative_transfer = s0->sei.alternative_transfer;
++    // Create this context's job control, attached to the source's global job pool, on first sync
++    // * We do this here as it allows us to easily locate our parents
++    //   global job pool, but there really should be a less nasty way
++    if (s->jbc == NULL) {
++        if ((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) == NULL)
++            return AVERROR(ENOMEM);  // report OOM to the caller instead of aborting
++        hevc_init_worker(s);
++    }
++
++    return 0;
++}
++#endif
++
++#include <sys/stat.h>
++static int qpu_ok(void)
++{
++    static int no_rpivid = -1;  // -1: not probed yet, otherwise the cached probe result
++    if (no_rpivid < 0)
++    {
++        struct stat st;
++        no_rpivid = (stat("/dev/rpivid-intcmem", &st) != 0) ? 1 : 0;
++    }
++    return no_rpivid;  // 1 when the device node cannot be stat()ed, 0 when it can
++}
++
++static av_cold int hevc_decode_init(AVCodecContext *avctx)
++{
++    HEVCRpiContext *s = avctx->priv_data;
++    int ret;
++    // Codec init callback: returns 0 on success or a negative AVERROR
++    if (qpu_ok() == 0)
++        return AVERROR_DECODER_NOT_FOUND;
++    ret = hevc_init_context(avctx);
++    if (ret < 0)
++        return ret;
++
++    // Child (frame-thread copy) contexts are finished at this point.
++    // What follows is first-decode setup and global allocation that must
++    // happen exactly once. Children receive the global info later, in
++    // update_thread_context, because from here there is no way to find
++    // the parent context.
++    if (avctx->internal->is_copy)
++        return 0;
++
++    // VCSM alloc must already be working by now because job allocation
++    // depends on it
++    {
++        HEVCRpiJobGlobal * const job_global = jbg_new(FFMAX(avctx->thread_count * 3, 5));
++
++        ret = AVERROR(ENOMEM);  // both failures in this block report out-of-memory
++        if (job_global == NULL) {
++            av_log(avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
++            goto fail;
++        }
++        s->jbc = rpi_job_ctl_new(job_global);
++        if (s->jbc == NULL) {
++            av_log(avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
++            goto fail;
++        }
++    }
++
++    hevc_init_worker(s);
++
++    s->eos = 1;
++
++    if (avctx->extradata != NULL && avctx->extradata_size > 0) {
++        ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
++        if (ret < 0)
++            goto fail;
++        if (!all_sps_supported(s)) {
++            ret = AVERROR_DECODER_NOT_FOUND;
++            goto fail;
++        }
++    }
++
++    // Frame threading is only recorded when it is active with more than one thread
++    s->threads_type = 0;
++    if ((avctx->active_thread_type & FF_THREAD_FRAME) != 0 && avctx->thread_count > 1)
++        s->threads_type = FF_THREAD_FRAME;
++
++    return 0;
++
++fail:
++    hevc_decode_free(avctx);
++    return ret;
++}
++
++static void hevc_decode_flush(AVCodecContext *avctx)
++{
++    HEVCRpiContext * const ctx = avctx->priv_data;
++    ff_hevc_rpi_flush_dpb(ctx);  // flush the DPB, then reset the same two fields decode init sets
++    ctx->eos = 1;
++    ctx->max_ra = INT_MAX;
++}
++
++typedef struct  hwaccel_rpi3_qpu_env_s {  // hwaccel private data (see .priv_data_size of hwaccel_rpi3_qpu)
++    const AVClass *av_class;
++    AVZcEnvPtr zc;  // allocated in hwaccel_rpi3_qpu_init(), released in hwaccel_rpi3_qpu_free()
++} hwaccel_rpi3_qpu_env_t;
++
++static int hwaccel_alloc_frame(AVCodecContext *s, AVFrame *frame)
++{
++    hwaccel_rpi3_qpu_env_t * const env = s->internal->hwaccel_priv_data;
++    int err;
++
++    // Get a buffer: via the get_buffer2 callback when av_rpi_zc_in_use() says so,
++    // otherwise from our own zc env followed by an explicit resolve
++    if (av_rpi_zc_in_use(s)) {
++        err = s->get_buffer2(s, frame, 0);
++    } else {
++        err = av_rpi_zc_get_buffer(env->zc, frame);
++        if (err == 0)
++            err = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);  // actually do the alloc
++    }
++    if (err != 0)
++        return err;
++
++    // Attach decode data; if that fails drop the buffer obtained above
++    err = ff_attach_decode_data(frame);
++    if (err < 0)
++        av_frame_unref(frame);
++
++    return err;
++}
++
++static int hwaccel_rpi3_qpu_free(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const env = avctx->internal->hwaccel_priv_data;
++    av_rpi_zc_int_env_freep(&env->zc);  // hwaccel uninit hook; also run by the init failure path
++    return 0;
++}
++
++static int hwaccel_rpi3_qpu_init(AVCodecContext *avctx)
++{
++    hwaccel_rpi3_qpu_env_t * const env = avctx->internal->hwaccel_priv_data;
++
++    // Hwaccel init hook: all it has to do is create the zc env
++    env->zc = av_rpi_zc_int_env_alloc(avctx);
++    if (env->zc != NULL)
++        return 0;
++
++    // Allocation failed: log it, run the normal uninit and report ENOMEM
++    av_log(avctx, AV_LOG_ERROR, "Rpi3 QPU init failed\n");
++    hwaccel_rpi3_qpu_free(avctx);
++    return AVERROR(ENOMEM);
++}
++
++
++#define OFFSET(x) offsetof(HEVCRpiContext, x)
++#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
++
++
++static const AVOption options[] = {  // private decoder options (attached via hevc_rpi_decoder_class)
++    { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
++        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),  // same field as above
++        AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
++    { NULL },
++};
++
++static const AVClass hevc_rpi_decoder_class = {
++    .class_name = "HEVC RPI decoder",
++    .item_name  = av_default_item_name,
++    .option     = options,
++    .version    = LIBAVUTIL_VERSION_INT,
++};
++
++static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
++    AV_PIX_FMT_SAND128,
++    AV_PIX_FMT_SAND64_10,
++    AV_PIX_FMT_NONE
++};
++
++
++static const AVHWAccel hwaccel_rpi3_qpu = {
++    .name           = "Pi3 QPU Hwaccel",
++    .alloc_frame    = hwaccel_alloc_frame,
++    .init           = hwaccel_rpi3_qpu_init,
++    .uninit         = hwaccel_rpi3_qpu_free,
++    .priv_data_size = sizeof(hwaccel_rpi3_qpu_env_t),
++    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand128 =
++{
++    .public = {
++        .pix_fmt = AV_PIX_FMT_SAND128,
++        .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
++        .device_type = AV_HWDEVICE_TYPE_NONE,
++    },
++    .hwaccel = &hwaccel_rpi3_qpu
++};
++static const AVCodecHWConfigInternal hevc_rpi_hw_config_sand64_10 =
++{
++    .public = {
++        .pix_fmt = AV_PIX_FMT_SAND64_10,
++        .methods = AV_CODEC_HW_CONFIG_METHOD_AD_HOC,
++        .device_type = AV_HWDEVICE_TYPE_NONE,
++    },
++    .hwaccel = &hwaccel_rpi3_qpu
++};
++
++
++static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = {
++    &hevc_rpi_hw_config_sand128,
++    &hevc_rpi_hw_config_sand64_10,
++    NULL
++};
++
++
++AVCodec ff_hevc_rpi_decoder = {
++    .name                  = "hevc_rpi",
++    .long_name             = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
++    .type                  = AVMEDIA_TYPE_VIDEO,
++    .id                    = AV_CODEC_ID_HEVC,
++    .priv_data_size        = sizeof(HEVCRpiContext),
++    .priv_class            = &hevc_rpi_decoder_class,
++    .init                  = hevc_decode_init,
++    .close                 = hevc_decode_free,
++    .decode                = hevc_rpi_decode_frame,
++    .flush                 = hevc_decode_flush,
++    .update_thread_context = ONLY_IF_THREADS_ENABLED(hevc_update_thread_context),
++    .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
++                             AV_CODEC_CAP_HARDWARE |
++                             AV_CODEC_CAP_AVOID_PROBING |
++#if 0
++    // Debugging is often easier without threads getting in the way
++                            0,
++#warning H265 threading turned off
++#else
++    // We only have decent optimisation for frame - so only admit to that
++                             AV_CODEC_CAP_FRAME_THREADS,
++#endif
++    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE |
++                             FF_CODEC_CAP_EXPORTS_CROPPING |
++                             FF_CODEC_CAP_ALLOCATE_PROGRESS,
++    .pix_fmts              = hevc_rpi_pix_fmts,
++    .profiles              = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
++    .hw_configs            = hevc_rpi_hw_configs,
++//    .wrapper_name          = "hevc_rpi",
++};
++
+diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
+new file mode 100644
+index 0000000000..1f94d18673
+--- /dev/null
++++ b/libavcodec/rpi_hevcdec.h
+@@ -0,0 +1,1091 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDEC_H
++#define AVCODEC_RPI_HEVCDEC_H
++
++#include "config.h"
++
++#include <stdatomic.h>
++
++#include "libavutil/buffer.h"
++
++#include "avcodec.h"
++#include "bswapdsp.h"
++#include "cabac.h"
++#include "get_bits.h"
++#include "rpi_hevcpred.h"
++#include "h2645_parse.h"
++#include "hevc.h"
++#include "rpi_hevc_mv.h"
++#include "rpi_hevc_ps.h"
++#include "rpi_hevc_sei.h"
++#include "rpi_hevcdsp.h"
++#include "internal.h"
++#include "thread.h"
++#include "videodsp.h"
++
++#if ARCH_ARM
++#include "arm/rpi_hevc_misc_neon.h"
++#endif
++
++#define MAX_NB_THREADS 16
++#define SHIFT_CTB_WPP 2
++
++//TODO: check if this is really the maximum
++#define MAX_TRANSFORM_DEPTH 5
++
++#define MAX_TB_SIZE 32
++#define MAX_QP 51
++#define DEFAULT_INTRA_TC_OFFSET 2
++
++#define HEVC_CONTEXTS 199
++
++#define MRG_MAX_NUM_CANDS     5
++
++#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE)  // 64
++
++// Size of DPB array
++#define HEVC_DPB_ELS            32
++
++#define L0 0
++#define L1 1
++
++#define EPEL_EXTRA_BEFORE 1
++#define EPEL_EXTRA_AFTER  2
++#define EPEL_EXTRA        3
++#define QPEL_EXTRA_BEFORE 3
++#define QPEL_EXTRA_AFTER  4
++#define QPEL_EXTRA        7
++
++#define EDGE_EMU_BUFFER_STRIDE 80
++
++#include <semaphore.h>
++#include "rpi_qpu.h"
++
++// Max jobs per frame thread. Actual usage will be limited by the size
++// of the global job pool
++// ?? Limits
++#define RPI_MAX_JOBS            8
++
++// This is the number of _extra_ bit threads - we will have
++// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
++//
++// 0 is legitimate and will disable our WPP processing
++//#define RPI_EXTRA_BIT_THREADS 0
++#define RPI_EXTRA_BIT_THREADS   2
++
++// Number of separate threads/passes in worker
++// 2 and 3 are the currently valid numbers
++// At the moment 3 seems fractionally faster
++//#define RPI_PASSES              2
++#define RPI_PASSES              3
++
++// Print out various usage stats
++#define RPI_TSTATS              0
++
++// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
++#define RPI_COMPRESS_COEFFS     1
++
++// Wait for VPU/QPU to finish in worker pass 0
++// If 0 then the wait is in pass 1
++//
++// One might expect the better place to wait would be in pass 1 however
++// testing shows that pass 0 produces overall faster decode.
++// Interestingly it is QPU/VPU limited streams that seem to suffer
++// from pass 1 waits, CPU limited ones tend to show a very mild gain.
++// This define exists so it is easy to test this.
++#define RPI_WORKER_WAIT_PASS_0  1
++
++// Use ARM emulation of QPU pred
++// These are for debug only as the emulation makes only limited
++// effort to be fast
++#define RPI_QPU_EMU_Y           0
++#define RPI_QPU_EMU_C           0
++
++// Max width & height we are prepared to consider
++// Sand frame shape calc becomes confused with large frames
++// Some buffer alloc also depends on this
++#define HEVC_RPI_MAX_WIDTH      2048
++#define HEVC_RPI_MAX_HEIGHT     1088
++
++
++// Min CTB size is 16, so this is the CTB count at max width/height (outer parens keep it safe in any expression)
++#define HEVC_RPI_MAX_CTBS (((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16))
++
++/**
++ * Value of the luma sample at position (x, y) in the 2D array tab.
++ * Both macros use names from the expansion site: SAMPLE needs s, SAMPLE_CTB needs min_cb_width. */
++#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
++#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
++
++#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
++#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
++                   (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
++#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
++
++enum RPSType {
++    ST_CURR_BEF = 0,
++    ST_CURR_AFT,
++    ST_FOLL,
++    LT_CURR,
++    LT_FOLL,
++    NB_RPS_TYPE,
++};
++
++enum SyntaxElement {
++    SAO_MERGE_FLAG = 0,
++    SAO_TYPE_IDX,
++    SAO_EO_CLASS,
++    SAO_BAND_POSITION,
++    SAO_OFFSET_ABS,
++    SAO_OFFSET_SIGN,
++    END_OF_SLICE_FLAG,
++    SPLIT_CODING_UNIT_FLAG,
++    CU_TRANSQUANT_BYPASS_FLAG,
++    SKIP_FLAG,
++    CU_QP_DELTA,
++    PRED_MODE_FLAG,
++    PART_MODE,
++    PCM_FLAG,
++    PREV_INTRA_LUMA_PRED_FLAG,
++    MPM_IDX,
++    REM_INTRA_LUMA_PRED_MODE,
++    INTRA_CHROMA_PRED_MODE,
++    MERGE_FLAG,
++    MERGE_IDX,
++    INTER_PRED_IDC,
++    REF_IDX_L0,
++    REF_IDX_L1,
++    ABS_MVD_GREATER0_FLAG,
++    ABS_MVD_GREATER1_FLAG,
++    ABS_MVD_MINUS2,
++    MVD_SIGN_FLAG,
++    MVP_LX_FLAG,
++    NO_RESIDUAL_DATA_FLAG,
++    SPLIT_TRANSFORM_FLAG,
++    CBF_LUMA,
++    CBF_CB_CR,
++    TRANSFORM_SKIP_FLAG,
++    EXPLICIT_RDPCM_FLAG,
++    EXPLICIT_RDPCM_DIR_FLAG,
++    LAST_SIGNIFICANT_COEFF_X_PREFIX,
++    LAST_SIGNIFICANT_COEFF_Y_PREFIX,
++    LAST_SIGNIFICANT_COEFF_X_SUFFIX,
++    LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
++    SIGNIFICANT_COEFF_GROUP_FLAG,
++    SIGNIFICANT_COEFF_FLAG,
++    COEFF_ABS_LEVEL_GREATER1_FLAG,
++    COEFF_ABS_LEVEL_GREATER2_FLAG,
++    COEFF_ABS_LEVEL_REMAINING,
++    COEFF_SIGN_FLAG,
++    LOG2_RES_SCALE_ABS,
++    RES_SCALE_SIGN_FLAG,
++    CU_CHROMA_QP_OFFSET_FLAG,
++    CU_CHROMA_QP_OFFSET_IDX,
++};
++
++enum PartMode {
++    PART_2Nx2N = 0,
++    PART_2NxN  = 1,
++    PART_Nx2N  = 2,
++    PART_NxN   = 3,
++    PART_2NxnU = 4,
++    PART_2NxnD = 5,
++    PART_nLx2N = 6,
++    PART_nRx2N = 7,
++};
++
++enum PredMode {
++    MODE_INTER = 0,
++    MODE_INTRA,
++    MODE_SKIP,
++};
++
++enum InterPredIdc {
++    PRED_L0 = 0,
++    PRED_L1,
++    PRED_BI,
++};
++
++enum PredFlag {
++    PF_INTRA = 0,
++    PF_L0,
++    PF_L1,
++    PF_BI,
++};
++
++enum SAOType {
++    SAO_NOT_APPLIED = 0,
++    SAO_BAND,
++    SAO_EDGE,
++    SAO_APPLIED
++};
++
++enum SAOEOClass {
++    SAO_EO_HORIZ = 0,
++    SAO_EO_VERT,
++    SAO_EO_135D,
++    SAO_EO_45D,
++};
++
++enum ScanType {
++    SCAN_DIAG = 0,
++    SCAN_HORIZ,
++    SCAN_VERT,
++};
++
++typedef struct RefPicList {
++    struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
++    int list[HEVC_MAX_REFS];
++    uint8_t isLongTerm[HEVC_MAX_REFS];
++    int nb_refs;
++} RefPicList;
++
++typedef struct RefPicListTab {
++    RefPicList refPicList[2];
++} RefPicListTab;
++
++typedef struct RpiCodingUnit {
++    unsigned int x;             // Passed to deblock
++    unsigned int y;
++    unsigned int x_split;
++    unsigned int y_split;
++
++    enum PredMode pred_mode;    ///< PredMode
++    enum PartMode part_mode;    ///< PartMode
++
++    // Inferred parameters
++    uint8_t intra_split_flag;   ///< IntraSplitFlag
++    uint8_t max_trafo_depth;    ///< MaxTrafoDepth
++    uint8_t cu_transquant_bypass_flag;
++} RpiCodingUnit;
++
++typedef struct RpiPredictionUnit {
++    uint8_t intra_pred_mode[4];
++    uint8_t intra_pred_mode_c[4];
++    uint8_t chroma_mode_c[4];
++    uint8_t merge_flag;
++} RpiPredictionUnit;
++
++typedef struct HEVCRpiTransformUnit {
++    int8_t cu_qp_delta;
++
++    // Inferred parameters;
++    uint8_t intra_pred_mode;
++    uint8_t intra_pred_mode_c;
++    uint8_t chroma_mode_c;
++    uint8_t is_cu_qp_delta_wanted;
++    uint8_t cu_chroma_qp_offset_wanted;
++    const int8_t * qp_divmod6[3];
++} HEVCRpiTransformUnit;
++
++typedef struct DBParams {
++    int8_t beta_offset; // -12 to +12
++    int8_t tc_offset;   // -12 to +12
++} DBParams;
++
++#define HEVC_FRAME_FLAG_OUTPUT    (1 << 0)
++#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
++#define HEVC_FRAME_FLAG_LONG_REF  (1 << 2)
++#define HEVC_FRAME_FLAG_BUMPING   (1 << 3)
++
++struct HEVCRpiJob;
++
++typedef struct HEVCRpiFrame {
++    AVFrame *frame;  // one AVFrame per DPB slot, allocated in hevc_init_context()
++    ThreadFrame tf;  // tf.f is set to point at frame in hevc_init_context()
++    ColMvField *col_mvf;  // copied by plain assignment when a frame is re-referenced
++    int poc;
++    struct HEVCRpiFrame *collocated_ref;
++
++    AVBufferRef *col_mvf_buf;  // ref-counted; duplicated with av_buffer_ref() when a frame is re-referenced
++
++    /**
++     * A sequence counter, so that old frames are output first
++     * after a POC reset
++     */
++    uint16_t sequence;
++
++    /**
++     * A combination of HEVC_FRAME_FLAG_*
++     */
++    uint8_t flags;
++
++    // Entry no in DPB - can be used as a small unique
++    // frame identifier (within the current thread)
++    uint8_t dpb_no;
++} HEVCRpiFrame;
++
++typedef struct HEVCRpiLocalContext {
++    HEVCRpiTransformUnit tu;
++
++    CABACContext cc;
++
++    // Vars that allow us to locate everything from just an lc
++    struct HEVCRpiContext * context;  // ??? make const ???
++    unsigned int lc_n; // lc list el no
++
++    // Job wait links
++    struct HEVCRpiLocalContext * jw_next;
++    struct HEVCRpiLocalContext * jw_prev;
++    struct HEVCRpiLocalContext * ljw_next;
++    struct HEVCRpiLocalContext * ljw_prev;
++    struct HEVCRpiJob * volatile jw_job;
++    sem_t jw_sem;
++
++    // ?? Wrap in structure ??
++    sem_t bt_sem_in;
++    sem_t * bt_psem_out;
++    volatile int bt_terminate;
++    unsigned int ts;
++    unsigned int bt_last_line;  // Last line in this bit_thread chunk
++    unsigned int bt_line_no;
++    unsigned int bt_line_width;
++    unsigned int bt_line_inc;
++
++    struct HEVCRpiJob * jb0;
++    char unit_done;  // Set once we have dealt with this slice
++    char bt_is_tile;
++    char last_progress_good;
++    char cabac_init_req;
++
++    uint8_t cabac_state[HEVC_CONTEXTS];
++    uint8_t stat_coeff[4];
++    GetBitContext gb;
++
++    uint8_t ct_depth;
++    int8_t qp_y;
++    int8_t curr_qp_y;
++    int8_t qPy_pred;
++
++// N.B. Used by asm (neon) - do not change
++#define AVAIL_S_UR  0
++#define AVAIL_S_U   1
++#define AVAIL_S_UL  2
++#define AVAIL_S_L   3
++#define AVAIL_S_DL  4
++
++#define AVAIL_U     (1 << AVAIL_S_U)
++#define AVAIL_L     (1 << AVAIL_S_L)
++#define AVAIL_UL    (1 << AVAIL_S_UL)
++#define AVAIL_UR    (1 << AVAIL_S_UR)
++#define AVAIL_DL    (1 << AVAIL_S_DL)
++
++// Intra filters - same number space as avail
++#define FILTER_LIGHT    0x40
++#define FILTER_STRONG   0x80
++#define FILTER_EITHER   (FILTER_LIGHT | FILTER_STRONG)
++
++    uint8_t ctb_avail;
++    int     end_of_ctb_x;
++    int     end_of_ctb_y;
++
++    RpiCodingUnit cu;
++    RpiPredictionUnit pu;
++
++#define BOUNDARY_LEFT_SLICE     (1 << 0)
++#define BOUNDARY_LEFT_TILE      (1 << 1)
++#define BOUNDARY_UPPER_SLICE    (1 << 2)
++#define BOUNDARY_UPPER_TILE     (1 << 3)
++    /* properties of the boundary of the current CTB for the purposes
++     * of the deblocking filter */
++    unsigned int boundary_flags;
++
++#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
++    uint8_t ipm_left[IPM_TAB_SIZE];
++    uint8_t ipm_up[IPM_TAB_SIZE];
++
++//#define MVF_STASH_WIDTH       128
++#define MVF_STASH_WIDTH       64
++#define MVF_STASH_HEIGHT      64
++#define MVF_STASH_WIDTH_PU    (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
++#define MVF_STASH_HEIGHT_PU   (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
++    HEVCRpiMvField mvf_ul[1];
++    HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU];
++
++    /* +7 is for subpixel interpolation, *2 for high bit depths */
++//    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++    /* The extended size between the new edge emu buffer is abused by SAO */
++//    DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
++//    DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
++
++} HEVCRpiLocalContext;
++
++// Each block can have an intra prediction and an add_residual command
++// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
++
++// Sand only has 2 planes (Y/C)
++#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
++
++// Command for intra prediction and transform_add of predictions to coefficients
++enum rpi_pred_cmd_e
++{
++    RPI_PRED_ADD_RESIDUAL,
++    RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_ADD_RESIDUAL + c_idx
++    RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_ADD_RESIDUAL + c_idx
++    RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
++    RPI_PRED_ADD_DC,
++    RPI_PRED_ADD_DC_U,       // Both U & V are effectively C
++    RPI_PRED_ADD_DC_V,
++    RPI_PRED_INTRA,
++    RPI_PRED_INTRA_C,
++    RPI_PRED_I_PCM,
++    RPI_PRED_CMD_MAX         // Number of commands - not a command itself
++};
++
++typedef struct HEVCPredCmd {
++    uint8_t type;
++    uint8_t size;  // log2 "size" used by all variants
++    uint8_t avail; // i_pred - but left here as they pack well
++    uint8_t dummy;
++    union {
++        struct {  // TRANSFORM_ADD
++            uint8_t * dst;
++            const int16_t * buf;
++            uint16_t stride;  // Should be good enough for all pic fmts we use
++            int16_t dc;
++        } ta;
++        struct {
++            uint8_t * dst;
++            uint32_t stride;
++            int dc;
++        } dc;
++        struct {  // INTRA
++            uint16_t x;
++            uint16_t y;
++            enum IntraPredMode mode;
++        } i_pred;
++        struct {  // I_PCM
++            uint16_t x;
++            uint16_t y;
++            const void * src;
++            uint32_t src_len;
++        } i_pcm;
++    };
++} HEVCPredCmd;
++
++union qpu_mc_pred_cmd_u;  // forward declarations; this tag must match the pointers in HEVCRpiInterPredQ
++struct qpu_mc_pred_y_p_s;
++struct qpu_mc_src_s;
++
++typedef struct HEVCRpiInterPredQ
++{
++    union qpu_mc_pred_cmd_u *qpu_mc_base;
++    union qpu_mc_pred_cmd_u *qpu_mc_curr;
++    struct qpu_mc_src_s *last_l0;
++    struct qpu_mc_src_s *last_l1;
++    unsigned int load;
++    uint32_t code_setup;
++    uint32_t code_sync;
++    uint32_t code_exit;
++} HEVCRpiInterPredQ;
++
++typedef struct HEVCRpiInterPredEnv
++{
++    HEVCRpiInterPredQ * q;
++    uint8_t n;                  // Number of Qs
++    uint8_t n_grp;              // Number of Q in a group
++    uint8_t curr;               // Current Q number (0..n-1)
++    uint8_t used;               // 0 if nothing in any Q, 1 otherwise
++    uint8_t used_grp;           // 0 if nothing in any Q in the current group
++    unsigned int max_fill;
++    unsigned int min_gap;
++    GPU_MEM_PTR_T gptr;
++} HEVCRpiInterPredEnv;
++
++typedef struct HEVCRpiIntraPredEnv {
++    unsigned int n;        // Number of commands
++    HEVCPredCmd * cmds;
++} HEVCRpiIntraPredEnv;
++
++typedef struct HEVCRpiCoeffEnv {
++    unsigned int n;
++#if RPI_COMPRESS_COEFFS
++    unsigned int packed; // Equal to 1 if coefficients should be being packed
++    unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed).  Only valid if packed==0
++#endif
++    int16_t * buf;
++} HEVCRpiCoeffEnv;
++
++typedef struct HEVCRpiCoeffsEnv {
++    HEVCRpiCoeffEnv s[4];
++    GPU_MEM_PTR_T gptr;
++    void * mptr;
++} HEVCRpiCoeffsEnv;
++
++typedef struct HEVCRpiFrameProgressWait {
++    int req;
++    struct HEVCRpiFrameProgressWait * next;
++    sem_t sem;
++} HEVCRpiFrameProgressWait;
++
++typedef struct HEVCRpiFrameProgressState {
++    struct HEVCRpiFrameProgressWait * first;
++    struct HEVCRpiFrameProgressWait * last;
++    pthread_mutex_t lock;
++} HEVCRpiFrameProgressState;
++
++typedef struct RpiBlk
++{
++    unsigned int x;
++    unsigned int y;
++    unsigned int w;
++    unsigned int h;
++} RpiBlk;
++
++typedef struct HEVCRpiJob {
++    struct HEVCRpiJob * next;  // Free chain
++    struct HEVCRpiJobCtl * jbc_local;
++    const HEVCRpiSPS * sps;       // sps used to set up this job
++
++    int waited;
++    int ctu_ts_first;
++    int ctu_ts_last;
++    RpiBlk bounds;  // Bounding box of job
++
++    struct qpu_mc_pred_y_p_s * last_y8_p;
++    struct qpu_mc_src_s * last_y8_l1;
++    rpi_cache_flush_env_t * rfe;
++
++    HEVCRpiInterPredEnv chroma_ip;
++    HEVCRpiInterPredEnv luma_ip;
++    int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
++    HEVCRpiIntraPredEnv intra;
++    HEVCRpiCoeffsEnv coeffs;
++    HEVCRpiFrameProgressWait progress_wait;
++    sem_t sem;
++    rpi_cache_buf_t flush_buf;
++} HEVCRpiJob;
++
++struct HEVCRpiContext;
++
++typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
++
++typedef struct HEVCRpiPassQueue
++{
++//    int pending;
++    volatile int terminate;
++    sem_t sem_in;
++    sem_t * psem_out;
++    unsigned int job_n;
++    struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
++    HEVCRpiWorkerFn * worker;
++    pthread_t thread;
++    uint8_t pass_n;  // Pass number - debug
++    uint8_t started;
++} HEVCRpiPassQueue;
++
++
++struct HEVCRpiJobGlobal;
++
++typedef struct HEVCRpiJobCtl
++{
++    sem_t sem_out;
++
++    HEVCRpiJob * volatile jb1;  // The job associated with this frame if unallocated - NULL if allocated
++    struct HEVCRpiJobGlobal * jbg;
++
++    HEVCRpiLocalContext * lcw_head;
++    HEVCRpiLocalContext * lcw_tail;
++
++    pthread_mutex_t in_lock;
++    int offload_in;
++
++    HEVCRpiJob *offloadq[RPI_MAX_JOBS];
++} HEVCRpiJobCtl;
++
++
++typedef struct HEVCRpiJobGlobal  // Job pool state; each HEVCRpiJobCtl points at one via jbg
++{
++    intptr_t ref_count;
++    pthread_mutex_t lock;
++    HEVCRpiJob * free1;                 // Singly linked list of free jobs
++    HEVCRpiLocalContext * wait_head;       // Doubly linked list of lcs waiting for a job
++    HEVCRpiLocalContext * wait_good;  // Last good tail
++    HEVCRpiLocalContext * wait_tail;
++
++} HEVCRpiJobGlobal;
++
++#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
++
++#if RPI_TSTATS
++typedef struct HEVCRpiStats {
++    int y_pred1_y8_merge;
++    int y_pred1_xy;
++    int y_pred1_x0;
++    int y_pred1_y0;
++    int y_pred1_x0y0;
++    int y_pred1_wle8;
++    int y_pred1_wgt8;
++    int y_pred1_hle16;
++    int y_pred1_hgt16;
++    int y_pred2_xy;
++    int y_pred2_x0;
++    int y_pred2_y0;
++    int y_pred2_x0y0;
++    int y_pred2_hle16;
++    int y_pred2_hgt16;
++} HEVCRpiStats;
++#endif
++
++typedef struct HEVCRpiCabacState
++{
++    uint8_t rice[4];
++    uint8_t state[HEVC_CONTEXTS];
++} HEVCRpiCabacState;
++
++#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT   6   // 64 pels
++#define HEVC_RPI_BS_STRIDE1_PELS        (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_PEL_MASK    (HEVC_RPI_BS_STRIDE1_PELS - 1)
++#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT  2   // 4 els per byte
++#define HEVC_RPI_BS_PELS_PER_EL_SHIFT   2   // 4 pels per el
++#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT  (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
++#define HEVC_RPI_BS_STRIDE1_BYTES       (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++#define HEVC_RPI_BS_Y_SHR               3   // 8 vertical pels per row
++#define HEVC_RPI_BS_COL_BYTES_SHR       (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
++
++typedef struct HEVCRpiContext {
++    const AVClass *c;  // needed by private avoptions
++    AVCodecContext *avctx;
++
++    uint8_t             threads_type;
++    char qpu_init_ok;
++
++    /** 1 if the independent slice segment header was successfully parsed */
++    uint8_t slice_initialized;
++    char used_for_ref;  // rpi
++    char is_irap;
++    char offload_recon;
++    uint8_t eos;       ///< current packet contains an EOS/EOB NAL
++    uint8_t last_eos;  ///< last packet contains an EOS/EOB NAL
++    uint8_t no_backward_pred_flag;
++    uint8_t is_decoded;
++    uint8_t no_rasl_output_flag;
++
++
++    /**
++     * Sequence counters for decoded and output frames, so that old
++     * frames are output first after a POC reset
++     */
++    uint16_t seq_decode;
++    uint16_t seq_output;
++
++    int                 width;
++    int                 height;
++
++    HEVCRpiJobCtl * jbc;
++    // cabac stash
++    // b0       skip flag
++    // b1+      ct_depth
++    uint8_t * cabac_stash_left;
++    uint8_t * cabac_stash_up;
++
++    // Function pointers
++#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
++    const uint8_t * qpu_dummy_frame_emu;
++#endif
++#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
++    uint32_t qpu_dummy_frame_qpu;  // Not a frame - just a bit of memory
++#endif
++    HEVCRpiQpu qpu;
++
++    HEVCRpiFrameProgressState progress_states[2];
++
++    HEVCRpiCabacState *cabac_save;
++
++    AVFrame *frame;
++    AVFrame *output_frame;
++    uint8_t *sao_pixel_buffer_h[3];
++    uint8_t *sao_pixel_buffer_v[3];
++
++    unsigned int col_mvf_stride;
++    AVBufferPool *col_mvf_pool;
++
++    RpiSAOParams *sao;
++    DBParams *deblock;
++    enum HEVCNALUnitType nal_unit_type;
++    int temporal_id;  ///< temporal_id_plus1 - 1
++    HEVCRpiFrame *ref;
++    int poc;
++    int pocTid0;
++    int slice_idx; ///< number of the slice being currently decoded
++    int max_ra;
++
++    int8_t *qp_y_tab;
++
++    // Deblocking block strength bitmaps
++    unsigned int bs_stride2;
++    unsigned int bs_size;
++    uint8_t *bs_horizontal;
++    uint8_t *bs_vertical;
++    uint8_t *bsf_stash_up;
++    uint8_t *bsf_stash_left;
++
++#if HEVC_RPI_MAX_CTBS >= 0xffff
++#define TAB_SLICE_ADDR_BROKEN (~(uint32_t)0)
++    uint32_t *tab_slice_address;
++#else
++#define TAB_SLICE_ADDR_BROKEN ((uint16_t)~0U)  // ~(uint16_t)0 would promote to int -1 and never equal a uint16_t entry
++    uint16_t *tab_slice_address;
++#endif
++
++    // Bitfield 1 bit per 8 pels (min pcm size)
++    uint8_t *is_pcm;
++    // Bitfield 1 bit per 8 pels (min cb size)
++    // Only needed for CIP as CIP processing is async to the main thread
++    uint8_t *is_intra;
++
++    // PU
++    HEVCRpiMvField *mvf_up;
++    HEVCRpiMvField *mvf_left;
++
++    const RefPicList **rpl_up;
++    const RefPicList **rpl_left;
++    RefPicList * refPicList;
++
++    // CTB-level flags affecting loop filter operation
++    uint8_t *filter_slice_edges;
++
++    /** used on BE to byteswap the lines for checksumming */
++    uint8_t *checksum_buf;
++    int      checksum_buf_size;
++
++    const uint8_t *data;
++
++    H2645Packet pkt;
++    // type of the first VCL NAL of the current frame
++    enum HEVCNALUnitType first_nal_type;
++
++    uint8_t context_initialized;
++    int is_nalff;           ///< this flag is != 0 if bitstream is encapsulated
++                            ///< as a format defined in 14496-15
++    int apply_defdispwin;
++
++    int nal_length_size;    ///< Number of bytes used for nal length (1, 2 or 4)
++    int nuh_layer_id;
++
++    struct AVMD5 *md5_ctx;
++
++    RefPicListTab * rpl_tab;
++    unsigned int rpl_tab_size;
++
++    uint8_t *is_intra_store;
++
++    RpiSliceHeader sh;
++
++    HEVCRpiParamSets ps;
++
++    HEVCRpiLocalContext    *HEVClc;
++    HEVCRpiLocalContext    *HEVClcList[MAX_NB_THREADS];
++
++    HEVCRpiFrame DPB[HEVC_DPB_ELS];
++
++    ///< candidate references for the current frame
++    RefPicList rps[5];
++
++    HEVCRpiPredContext hpc;
++    HEVCDSPContext hevcdsp;
++
++    HEVCSEIContext sei;
++
++    // Put structures that allocate non-trivial storage at the end
++    // These are mostly used indirectly so position in the structure doesn't matter
++    HEVCRpiPassQueue passq[RPI_PASSES];
++#if RPI_EXTRA_BIT_THREADS > 0
++    int bt_started;
++    // This simply contains thread descriptors - task setup is held elsewhere
++    pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
++#endif
++#if RPI_TSTATS
++    HEVCRpiStats tstats;
++#endif
++} HEVCRpiContext;
++
++/**
++ * Mark all frames in DPB as unused for reference.
++ */
++void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
++
++/**
++ * Drop all frames currently in DPB.
++ */
++void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture sets for the current frame.
++ */
++int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
++
++/**
++ * Construct the reference picture list(s) for the current slice.
++ */
++int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
++
++
++/**
++ * Get the number of candidate references for the current frame.
++ */
++int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
++
++int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
++
++/**
++ * Find next frame in output order and put a reference to it in frame.
++ * @return 1 if a frame was output, 0 otherwise
++ */
++int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
++
++void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
++
++void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags);
++
++unsigned int ff_hevc_rpi_tb_avail_flags(
++    const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++    const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h);
++
++void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
++                                int nPbH, int log2_cb_size, int part_idx,
++                                int merge_idx, HEVCRpiMvField * const mv);
++void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
++    const unsigned int x0, const unsigned int y0,
++    const unsigned int nPbW, const unsigned int nPbH,
++    const unsigned int avail,
++    HEVCRpiMvField * const mv,
++    const unsigned int mvp_lx_flag, const unsigned int LX);
++void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
++void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
++                                               const unsigned int x0, const unsigned int y0,
++                                               const unsigned int log2_trafo_size, const int is_coded_block);
++int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot);
++
++extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
++extern const uint8_t ff_hevc_rpi_qpel_extra[4];
++
++int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
++
++// arm/hevc_misc_neon.S
++// Neon coeff zap fn
++#if HAVE_NEON
++extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
++#endif
++
++void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++                                     const HEVCRpiFrame * const ref, const int val, const int field);
++
++void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
++
++// All of these expect that s->threads_type == FF_THREAD_FRAME
++
++static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++                                     const HEVCRpiFrame * const ref, const int y)
++{
++    if (s->threads_type != 0)
++        ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
++{
++    if (s->used_for_ref && s->threads_type != 0)
++        ff_hevc_rpi_progress_signal_field(s, y, 1);
++}
++
++static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
++                                     const HEVCRpiFrame * const ref, const int y)
++{
++    ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
++}
++
++// Signal recon progress value y on field 0; only for reference frames when threads_type != 0
++static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
++{
++    if (!s->used_for_ref || s->threads_type == 0)
++        return;
++    ff_hevc_rpi_progress_signal_field(s, y, 0);
++}
++
++static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
++{
++    ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
++    ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
++}
++
++
++// Mark both progress fields as done without signalling (used in missing refs)
++// No-op when ref has no progress buffer. Works for both rpi & non-rpi
++static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref)
++{
++    int *fields;
++    if (ref->tf.progress == NULL)
++        return;
++    fields = (int *)ref->tf.progress->data;
++    fields[0] = INT_MAX;
++    fields[1] = INT_MAX;
++}
++
++#define HEVC_RPI_420_ONLY 1
++#define HEVC_RPI_SAND128_ONLY 1
++
++static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx)
++{
++#if HEVC_RPI_420_ONLY
++    return cidx == 0 ? 0 : 1;
++#else
++    return s->ps.sps->hshift[cidx];
++#endif
++}
++
++static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx)
++{
++#if HEVC_RPI_420_ONLY
++    return cidx == 0 ? 0 : 1;
++#else
++    return s->ps.sps->vshift[cidx];
++#endif
++}
++
++static inline int ctx_cfmt(const HEVCRpiContext * const s)
++{
++#if HEVC_RPI_420_ONLY
++    return 1;
++#else
++    return s->ps.sps->chroma_format_idc;
++#endif
++}
++
++static inline int frame_stride1(const AVFrame * const frame, const int c_idx)
++{
++#if HEVC_RPI_SAND128_ONLY
++    return 128;
++#else
++    return frame->linesize[c_idx];
++#endif
++}
++
++#if HEVC_RPI_SAND128_ONLY
++// Propagate this decision to later zc includes
++#define RPI_ZC_SAND128_ONLY 1
++#endif
++
++#ifndef ff_hevc_rpi_copy_vert
++// Copy `height` elements of 1/2/4 bytes (pixel_shift 0/1/2) down a column
++static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
++                                         int pixel_shift, int height,
++                                         ptrdiff_t stride_dst, ptrdiff_t stride_src)
++{
++    int i;
++    switch (pixel_shift) {
++        case 2:
++            for (i = 0; i < height; i++) {
++                *(uint32_t *)dst = *(const uint32_t *)src;
++                dst += stride_dst;
++                src += stride_src;
++            }
++            break;
++        case 1:
++            for (i = 0; i < height; i++) {
++                *(uint16_t *)dst = *(const uint16_t *)src;
++                dst += stride_dst;
++                src += stride_src;
++            }
++            break;
++        default:  // single bytes for pixel_shift 0 (and any other value)
++            for (i = 0; i < height; i++) {
++                *dst = *src;
++                dst += stride_dst;
++                src += stride_src;
++            }
++            break;
++    }
++}
++#endif
++
++
++#if MVF_STASH_WIDTH == 64
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x, const unsigned int y)
++{
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
++    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE));
++}
++
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int x, const unsigned int y)
++{
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
++    const unsigned int x0_ctb = x0 & mask_cs_hi;
++    const unsigned int y0_ctb = y0 & mask_cs_hi;
++
++    return (HEVCRpiMvField *)((y < y0_ctb) ?
++        (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) :
++        (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) :
++            lc->mvf_stash +
++                ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU +
++                ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)));
++}
++
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++                               const unsigned int x0,
++                               const unsigned int x)
++{
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
++    const unsigned int x0_ctb = x0 & mask_cs_hi;
++    return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU;
++}
++
++#else
++static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x, const unsigned int y)
++{
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
++    return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)));
++}
++
++static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
++                               const unsigned int x0, const unsigned int y0,
++                               const unsigned int x, const unsigned int y)
++{
++    const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
++
++    const unsigned int x0_ctb = x0 & mask_cs_hi;
++    const unsigned int y0_ctb = y0 & mask_cs_hi;
++
++    // If not in the same CTB for Y assume up
++    if (y < y0_ctb) {
++        // If not in the same CTB for X too assume up-left
++        return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
++    }
++    return mvf_stash_ptr(s, lc, x, y);
++}
++
++static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
++                               const unsigned int x0,
++                               const unsigned int x)
++{
++    return MVF_STASH_WIDTH_PU;
++}
++#endif
++
++#endif /* AVCODEC_RPI_HEVCDEC_H */
+diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
+new file mode 100644
+index 0000000000..87f3cc9d14
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.c
+@@ -0,0 +1,450 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ * Copyright (C) 2018 John Cox, Ben Avison for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdsp.h"
++#include "rpi_hevc_mv.h"
++
++static const int8_t transform[32][32] = {
++    { 64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,
++      64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
++    { 90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4,
++      -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
++    { 90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90,
++     -90, -87, -80, -70, -57, -43, -25,  -9,   9,  25,  43,  57,  70,  80,  87,  90 },
++    { 90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
++      13,  38,  61,  78,  88,  90,  85,  73,  54,  31,   4, -22, -46, -67, -82, -90 },
++    { 89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89,
++      89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89 },
++    { 88,  67,  31, -13, -54, -82, -90, -78, -46, -4,   38,  73,  90,  85,  61,  22,
++     -22, -61, -85, -90, -73, -38,   4,  46,  78,  90,  82,  54,  13, -31, -67, -88 },
++    { 87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87,
++     -87, -57,  -9,  43,  80,  90,  70,  25, -25, -70, -90, -80, -43,   9,  57,  87 },
++    { 85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31,
++      31,  78,  90,  61,   4, -54, -88, -82, -38,  22,  73,  90,  67,  13, -46, -85 },
++    { 83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83,
++      83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83 },
++    { 82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38,
++     -38, -88, -73,  -4,  67,  90,  46, -31, -85, -78, -13,  61,  90,  54, -22, -82 },
++    { 80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80,
++     -80,  -9,  70,  87,  25, -57, -90, -43,  43,  90,  57, -25, -87, -70,   9,  80 },
++    { 78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46,
++      46,  90,  38, -54, -90, -31,  61,  88,  22, -67, -85, -13,  73,  82,   4, -78 },
++    { 75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75,
++      75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75 },
++    { 73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54,
++     -54, -85,   4,  88,  46, -61, -82,  13,  90,  38, -67, -78,  22,  90,  31, -73 },
++    { 70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70,
++     -70,  43,  87,  -9, -90, -25,  80,  57, -57, -80,  25,  90,   9, -87, -43,  70 },
++    { 67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61,
++      61,  73, -46, -82,  31,  88, -13, -90,  -4,  90,  22, -85, -38,  78,  54, -67 },
++    { 64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,
++      64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64 },
++    { 61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67,
++     -67, -54,  78,  38, -85, -22,  90,   4, -90,  13,  88, -31, -82,  46,  73, -61 },
++    { 57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57,
++     -57,  80,  25, -90,   9,  87, -43, -70,  70,  43, -87,  -9,  90, -25, -80,  57 },
++    { 54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73,
++      73,  31, -90,  22,  78, -67, -38,  90, -13, -82,  61,  46, -88,   4,  85, -54 },
++    { 50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50,
++      50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50 },
++    { 46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78,
++     -78,  -4,  82, -73, -13,  85, -67, -22,  88, -61, -31,  90, -54, -38,  90, -46 },
++    { 43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43,
++     -43,  90, -57, -25,  87, -70,  -9,  80, -80,   9,  70, -87,  25,  57, -90,  43 },
++    { 38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82,
++      82, -22, -54,  90, -61, -13,  78, -85,  31,  46, -90,  67,   4, -73,  88, -38 },
++    { 36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36,
++      36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36 },
++    { 31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85,
++     -85,  46,  13, -67,  90, -73,  22,  38, -82,  88, -54,  -4,  61, -90,  78, -31 },
++    { 25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25,
++     -25,  70, -90,  80, -43,  -9,  57, -87,  87, -57,   9,  43, -80,  90, -70,  25 },
++    { 22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88,
++      88, -67,  31,  13, -54,  82, -90,  78, -46,   4,  38, -73,  90, -85,  61, -22 },
++    { 18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18,
++      18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18 },
++    { 13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90,
++     -90,  82, -67,  46, -22,  -4,  31, -54,  73, -85,  90, -88,  78, -61,  38, -13 },
++    {  9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25, -9,
++      -9,  25, -43,  57, -70,  80, -87,  90, -90,  87, -80,  70, -57,  43, -25,   9 },
++    {  4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90,
++      90, -90,  88, -85,  82, -78,  73, -67,  61, -54,  46, -38,  31, -22,  13,  -4 },
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
++    { -2, 58, 10, -2},
++    { -4, 54, 16, -2},
++    { -6, 46, 28, -4},
++    { -4, 36, 36, -4},
++    { -4, 28, 46, -6},
++    { -2, 16, 54, -4},
++    { -2, 10, 58, -2},
++};
++
++DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
++    { -1,  4,-10, 58, 17, -5,  1,  0, -1,  4,-10, 58, 17, -5,  1,  0},
++    { -1,  4,-11, 40, 40,-11,  4, -1, -1,  4,-11, 40, 40,-11,  4, -1},
++    {  0,  1, -5, 17, 58,-10,  4, -1,  0,  1, -5, 17, 58,-10,  4, -1}
++};
++
++#define BIT_DEPTH 8
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcdsp_template.c"
++#undef BIT_DEPTH
++
++// Returns 2-bit boundary strengths for `pus` curr/neigh PU pairs, each value
++// repeated `dup` times, packed LSB-first; in_inc0/in_inc1 are byte steps.
++static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
++                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++                                               int in_inc0, int in_inc1)
++{
++    int shift = 32;
++    uint32_t bs = 0;
++    for (; pus > 0; pus--) {
++        int strength, out;
++        int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
++        int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
++        int nr_idx0 = neigh->ref_idx[0];
++        int nr_idx1 = neigh->ref_idx[1];
++        int neigh_refL0, neigh_refL1;
++        // Range-check the neighbour indices before they index the ref lists
++        av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);
++        av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
++        neigh_refL0 = neigh_rpl0[nr_idx0];
++        neigh_refL1 = neigh_rpl1[nr_idx1];
++#if 1 // This more directly matches the original implementation
++        if (curr->pred_flag == PF_BI &&  neigh->pred_flag == PF_BI) {
++            // same L0 and L1
++            if (curr_refL0 == neigh_refL0 &&
++                curr_refL0 == curr_refL1 &&
++                neigh_refL0 == neigh_refL1) {
++                if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
++                    (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                     FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL0 == curr_refL0 &&
++                       neigh_refL1 == curr_refL1) {
++                if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else if (neigh_refL1 == curr_refL0 &&
++                       neigh_refL0 == curr_refL1) {
++                if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
++                    FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else {
++                strength = 1;
++            }
++        } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
++            MvXY curr_mv0, neigh_mv0;
++
++            if (curr->pred_flag & 1) {
++                curr_mv0   = curr->xy[0];
++            } else {
++                curr_mv0   = curr->xy[1];
++                curr_refL0 = curr_refL1;
++            }
++
++            if (neigh->pred_flag & 1) {
++                neigh_mv0   = neigh->xy[0];
++            } else {
++                neigh_mv0   = neigh->xy[1];
++                neigh_refL0 = neigh_refL1;
++            }
++
++            if (curr_refL0 == neigh_refL0) {
++                if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
++                    strength = 1;
++                else
++                    strength = 0;
++            } else
++                strength = 1;
++        } else
++            strength = 1;
++#else // This has exactly the same effect, but is more suitable for vectorisation
++        MvXY curr_mv[2];
++        MvXY neigh_mv[2];
++        memcpy(curr_mv, curr->xy, sizeof curr_mv);
++        memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
++
++        if (!(curr->pred_flag & 2)) {
++            curr_mv[1] = curr_mv[0];
++            curr_refL1 = curr_refL0;
++        }
++        if (!(neigh->pred_flag & 2)) {
++            neigh_mv[1] = neigh_mv[0];
++            neigh_refL1 = neigh_refL0;
++        }
++        if (!(curr->pred_flag & 1)) {
++            curr_mv[0] = curr_mv[1];
++            curr_refL0 = curr_refL1;
++        }
++        if (!(neigh->pred_flag & 1)) {
++            neigh_mv[0] = neigh_mv[1];
++            neigh_refL0 = neigh_refL1;
++        }
++
++        strength = 1;
++
++        strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
++
++        strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
++                (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
++                (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
++
++        strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
++#endif
++
++        curr += in_inc0 / sizeof (HEVCRpiMvField);
++        neigh += in_inc1 / sizeof (HEVCRpiMvField);
++        for (out = dup; out > 0; out--) {
++            bs = (bs >> 2) | (strength << 30);
++            shift -= 2;
++        }
++    }
++    return shift < 32 ? bs >> shift : 0;  // nothing emitted: avoid undefined shift by 32
++}
++
++
++// Copy a width x height byte block between buffers with independent strides.
++// Rows go in 16-byte units when both pointers and both strides are 16-byte
++// aligned, otherwise in unaligned 8-byte units; width is rounded up to the unit.
++static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
++{
++    const int unaligned = (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) != 0;
++    unsigned int row, col;
++
++    for (row = 0; row != height; row++) {
++        if (unaligned) {
++            for (col = 0; col < width; col += 8)
++                AV_COPY64U(dst + col, src + col);
++        } else {
++            for (col = 0; col < width; col += 16)
++                AV_COPY128(dst + col, src + col);
++        }
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++
++
++
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
++
++#undef PEL_FUNC
++#define PEL_FUNC(dst1, idx1, idx2, a, depth)                                   \
++    for(i = 0 ; i < 10 ; i++)                                                  \
++{                                                                              \
++    hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth;                            \
++}
++
++#undef EPEL_FUNCS
++#define EPEL_FUNCS(depth)                                                     \
++    PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth);                \
++    PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth);                    \
++    PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth);                    \
++    PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
++
++#undef EPEL_UNI_FUNCS
++#define EPEL_UNI_FUNCS(depth)                                                 \
++    PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
++    PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth);            \
++    PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth);            \
++    PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth);           \
++    PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
++    PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth);        \
++    PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth);        \
++    PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
++
++#undef EPEL_BI_FUNCS
++#define EPEL_BI_FUNCS(depth)                                                \
++    PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);        \
++    PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth);            \
++    PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth);            \
++    PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth);           \
++    PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);    \
++    PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth);        \
++    PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth);        \
++    PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
++
++#undef QPEL_FUNCS
++#define QPEL_FUNCS(depth)                                                     \
++    PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth);                \
++    PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth);                    \
++    PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth);                    \
++    PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
++
++#undef QPEL_UNI_FUNCS
++#define QPEL_UNI_FUNCS(depth)                                                 \
++    PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth);        \
++    PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth);            \
++    PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth);            \
++    PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth);           \
++    PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth);    \
++    PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth);        \
++    PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth);        \
++    PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
++
++#undef QPEL_BI_FUNCS
++#define QPEL_BI_FUNCS(depth)                                                  \
++    PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth);          \
++    PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth);              \
++    PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth);              \
++    PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth);             \
++    PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth);      \
++    PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth);          \
++    PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth);          \
++    PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
++
++#define SLICED_ADD_RESIDUAL(depth)\
++    hevcdsp->add_residual_u[0]      = FUNC(add_residual4x4_u, depth);         \
++    hevcdsp->add_residual_u[1]      = FUNC(add_residual8x8_u, depth);         \
++    hevcdsp->add_residual_u[2]      = FUNC(add_residual16x16_u, depth);       \
++    hevcdsp->add_residual_u[3]      = FUNC(add_residual32x32_u, depth);       \
++    hevcdsp->add_residual_v[0]      = FUNC(add_residual4x4_v, depth);         \
++    hevcdsp->add_residual_v[1]      = FUNC(add_residual8x8_v, depth);         \
++    hevcdsp->add_residual_v[2]      = FUNC(add_residual16x16_v, depth);       \
++    hevcdsp->add_residual_v[3]      = FUNC(add_residual32x32_v, depth);       \
++    hevcdsp->add_residual_c[0]      = FUNC(add_residual4x4_c, depth);         \
++    hevcdsp->add_residual_c[1]      = FUNC(add_residual8x8_c, depth);         \
++    hevcdsp->add_residual_c[2]      = FUNC(add_residual16x16_c, depth);       \
++    hevcdsp->add_residual_c[3]      = FUNC(add_residual32x32_c, depth);       \
++    hevcdsp->add_residual_dc_c[0]   = FUNC(add_residual4x4_dc_c, depth);         \
++    hevcdsp->add_residual_dc_c[1]   = FUNC(add_residual8x8_dc_c, depth);         \
++    hevcdsp->add_residual_dc_c[2]   = FUNC(add_residual16x16_dc_c, depth);       \
++    hevcdsp->add_residual_dc_c[3]   = FUNC(add_residual32x32_dc_c, depth);       \
++    hevcdsp->put_pcm_c              = FUNC(put_pcm_c, depth)
++#define SLICED_LOOP_FILTERS(depth)\
++    hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \
++    hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
++    hevcdsp->hevc_h_loop_filter_uv    = FUNC(hevc_h_loop_filter_uv, depth);    \
++    hevcdsp->hevc_v_loop_filter_uv2   = FUNC(hevc_v_loop_filter_uv2, depth)
++#define SLICED_SAO(depth)\
++    for (i = 0; i != SAO_FILTER_N; ++i) {                                     \
++        hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth);       \
++        hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth);       \
++    }                                                                         \
++    hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth);       \
++    hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
++
++#define HEVC_DSP(depth)                                                     \
++    hevcdsp->put_pcm                = FUNC(put_pcm, depth);                 \
++    hevcdsp->add_residual[0]        = FUNC(add_residual4x4, depth);         \
++    hevcdsp->add_residual[1]        = FUNC(add_residual8x8, depth);         \
++    hevcdsp->add_residual[2]        = FUNC(add_residual16x16, depth);       \
++    hevcdsp->add_residual[3]        = FUNC(add_residual32x32, depth);       \
++    hevcdsp->add_residual_dc[0]     = FUNC(add_residual4x4_dc, depth);         \
++    hevcdsp->add_residual_dc[1]     = FUNC(add_residual8x8_dc, depth);         \
++    hevcdsp->add_residual_dc[2]     = FUNC(add_residual16x16_dc, depth);       \
++    hevcdsp->add_residual_dc[3]     = FUNC(add_residual32x32_dc, depth);       \
++    SLICED_ADD_RESIDUAL(depth);                                             \
++    hevcdsp->dequant                = FUNC(dequant, depth);                 \
++    hevcdsp->transform_rdpcm        = FUNC(transform_rdpcm, depth);         \
++    hevcdsp->transform_4x4_luma     = FUNC(transform_4x4_luma, depth);      \
++    hevcdsp->idct[0]                = FUNC(idct_4x4, depth);                \
++    hevcdsp->idct[1]                = FUNC(idct_8x8, depth);                \
++    hevcdsp->idct[2]                = FUNC(idct_16x16, depth);              \
++    hevcdsp->idct[3]                = FUNC(idct_32x32, depth);              \
++                                                                            \
++    hevcdsp->idct_dc[0]             = FUNC(idct_4x4_dc, depth);             \
++    hevcdsp->idct_dc[1]             = FUNC(idct_8x8_dc, depth);             \
++    hevcdsp->idct_dc[2]             = FUNC(idct_16x16_dc, depth);           \
++    hevcdsp->idct_dc[3]             = FUNC(idct_32x32_dc, depth);           \
++                                                                            \
++    for (i = 0; i != SAO_FILTER_N; ++i) {                                   \
++        hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth);         \
++        hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth);         \
++    }                                                                       \
++    hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth);            \
++    hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth);            \
++    SLICED_SAO(depth);                                                         \
++                                                                               \
++    QPEL_FUNCS(depth);                                                         \
++    QPEL_UNI_FUNCS(depth);                                                     \
++    QPEL_BI_FUNCS(depth);                                                      \
++    EPEL_FUNCS(depth);                                                         \
++    EPEL_UNI_FUNCS(depth);                                                     \
++    EPEL_BI_FUNCS(depth);                                                      \
++                                                                               \
++    SLICED_LOOP_FILTERS(depth);                                                \
++    hevcdsp->hevc_h_loop_filter_luma     = FUNC(hevc_h_loop_filter_luma, depth);   \
++    hevcdsp->hevc_v_loop_filter_luma     = FUNC(hevc_v_loop_filter_luma, depth);   \
++    hevcdsp->hevc_h_loop_filter_chroma   = FUNC(hevc_h_loop_filter_chroma, depth); \
++    hevcdsp->hevc_v_loop_filter_chroma   = FUNC(hevc_v_loop_filter_chroma, depth); \
++    hevcdsp->hevc_h_loop_filter_luma_c   = FUNC(hevc_h_loop_filter_luma, depth);   \
++    hevcdsp->hevc_v_loop_filter_luma_c   = FUNC(hevc_v_loop_filter_luma, depth);   \
++    hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
++    hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
++int i = 0;
++
++    switch (bit_depth) {
++    case 9:
++        HEVC_DSP(9);
++        break;
++    case 10:
++        HEVC_DSP(10);
++        break;
++    case 12:
++        HEVC_DSP(12);
++        break;
++    default:
++        HEVC_DSP(8);
++        break;
++    }
++
++    hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
++    hevcdsp->cpy_blk = cpy_blk;
++
++    if (ARCH_PPC)
++        ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
++    if (ARCH_X86)
++        ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
++    if (ARCH_ARM)
++        ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
++    if (ARCH_MIPS)
++        ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
++}
+diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
+new file mode 100644
+index 0000000000..5a7cdeeb66
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp.h
+@@ -0,0 +1,177 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
++ *
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCDSP_H
++#define AVCODEC_RPI_HEVCDSP_H
++
++#include "hevc.h"
++#include "get_bits.h"
++
++struct HEVCRpiMvField;
++
++#define MAX_PB_SIZE 64
++
++#define RPI_HEVC_SAO_BUF_STRIDE 160
++
++/** Sample adaptive offset (SAO) parameters; every array is indexed by colour plane. */
++typedef struct RpiSAOParams {
++    uint8_t band_position[3];   ///< sao_band_position (Y,U,V)
++    uint8_t eo_class[3];        ///< sao_eo_class      (Y,U=V)
++    uint8_t type_idx[3];        ///< sao_type_idx      (Y,U=V)
++
++    int16_t offset_val[3][5];   ///< SaoOffsetVal      (Y,U,V)
++
++} RpiSAOParams;
++
++
++// SAO_FILTER_N sets how many SAO DSP functions each function table holds.
++// N=5 covers widths 8, 16, 32, 48 and 64.
++// N=6 adds a function for width 24, stored in array element 5 so that code
++// written for N=5 should still work.
++#define SAO_FILTER_N 6
++
++/// Table of HEVC DSP function pointers, filled in by ff_hevc_rpi_dsp_init() for one bit depth.
++typedef struct HEVCDSPContext {
++    void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                    struct GetBitContext *gb, int pcm_bit_depth);
++
++    void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
++    void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
++    void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
++    void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
++    // The _c residual/PCM variants below write both U and V of an interleaved chroma plane
++    void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
++    void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
++    void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
++                    struct GetBitContext *gb, int pcm_bit_depth);
++
++    void (*dequant)(int16_t *coeffs, int16_t log2_size);
++
++    void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
++
++    void (*transform_4x4_luma)(int16_t *coeffs);
++
++    void (*idct[4])(int16_t *coeffs, int col_limit);
++
++    void (*idct_dc[4])(int16_t *coeffs);
++
++    void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                               int16_t *sao_offset_val, int sao_left_class, int width, int height);
++    void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                               const int16_t *sao_offset_val_u, int sao_left_class_u,
++                               const int16_t *sao_offset_val_v, int sao_left_class_v,
++                               int width, int height);
++
++    /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
++    void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                               int16_t *sao_offset_val, int sao_eo_class, int width, int height);
++    void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
++                               const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
++
++    void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++    void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
++                                struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
++                                uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
++    // Prediction filter tables. NOTE(review): last two indices look like [vertical][horizontal] filter flags (see PEL_FUNC use in init) - confirm
++    void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
++                                        int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
++
++    void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, int denom, int wx0, int wx1,
++                                         int ox0, int ox1, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
++                                    int height, intptr_t mx, intptr_t my, int width);
++
++    void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, intptr_t mx, intptr_t my, int width);
++    void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, int denom, int wx0, int ox0, int wx1,
++                                         int ox1, intptr_t mx, intptr_t my, int width);
++    // Loop (deblocking) filters
++    void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                    int beta, int32_t *tc,
++                                    uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                    int beta, int32_t *tc,
++                                    uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                      int32_t *tc, uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
++                                      int beta, int32_t *tc,
++                                      uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
++                                      int beta, int32_t *tc,
++                                      uint8_t *no_p, uint8_t *no_q);
++    void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                        int32_t *tc, uint8_t *no_p,
++                                        uint8_t *no_q);
++    void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
++                                        int32_t *tc, uint8_t *no_p,
++                                        uint8_t *no_q);
++    void (*hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
++                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
++    void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++                                 uint8_t * _pix_l);
++    void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
++                                 unsigned int no_f);
++    void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                                 uint8_t * src_l,
++                                 unsigned int no_f);
++
++    uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
++                                               const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
++                                               int in_inc0, int inc_inc1);
++
++    void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
++} HEVCDSPContext;
++
++void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);
++
++extern const int8_t ff_hevc_rpi_epel_filters[7][4];
++extern const int8_t ff_hevc_rpi_qpel_filters[3][16];
++
++void ff_hevc_rpi_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
++void ff_hevc_rpi_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
++void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth);
++void ff_hevc_rpi_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
++#endif /* AVCODEC_RPI_HEVCDSP_H */
+diff --git a/libavcodec/rpi_hevcdsp_template.c b/libavcodec/rpi_hevcdsp_template.c
+new file mode 100644
+index 0000000000..dea4e55e4b
+--- /dev/null
++++ b/libavcodec/rpi_hevcdsp_template.c
+@@ -0,0 +1,2279 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "get_bits.h"
++#include "rpi_hevcdec.h"
++
++#include "bit_depth_template.c"
++#include "rpi_hevcdsp.h"
++
++#include "rpi_hevc_shader_template.h"
++
++static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                          GetBitContext *gb, int pcm_bit_depth)
++{
++    /* Read width x height raw PCM samples from gb, row by row, shifting
++     * each one up from pcm_bit_depth to BIT_DEPTH bits. */
++    const int scale = BIT_DEPTH - pcm_bit_depth;
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);    /* byte stride -> pixel stride */
++    for (line = 0; line < height; line++, row += stride)
++        for (col = 0; col < width; col++)
++            row[col] = get_bits(gb, pcm_bit_depth) << scale;
++}
++
++static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
++                          GetBitContext *gb, int pcm_bit_depth)
++{
++    /* Interleaved-chroma PCM: the bitstream holds two consecutive
++     * width x height planes.  The first is written to the even pixel
++     * positions of each row, the second to the odd positions. */
++    const int scale = BIT_DEPTH - pcm_bit_depth;
++    int plane, col, line;
++
++    stride /= sizeof(pixel);
++
++    for (plane = 0; plane < 2; plane++) {
++        pixel *row = (pixel *)_dst + plane;
++
++        for (line = 0; line < height; line++) {
++            for (col = 0; col < width; col++)
++                row[col * 2] = get_bits(gb, pcm_bit_depth) << scale;
++            row += stride;
++        }
++    }
++}
++
++static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
++                                                ptrdiff_t stride, int size)
++{
++    /* Add a size x size block of residuals (row-major in res) to the
++     * pixels already in _dst, passing each sum through av_clip_pixel(). */
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++) {
++        for (col = 0; col < size; col++)
++            row[col] = av_clip_pixel(row[col] + *res++);
++        row += stride;
++    }
++}
++
++static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
++{
++    /* Add the single value dc to every pixel of a size x size block,
++     * passing each result through av_clip_pixel().  stride arrives in
++     * bytes and is converted to pixels below. */
++    pixel *row = (pixel *)_dst;
++    int col, line;
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++, row += stride)
++        for (col = 0; col < size; col++)
++            row[col] = av_clip_pixel(row[col] + dc);
++}
++
++
++static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
++                                                ptrdiff_t stride, const int dc_v, int size)
++{
++    /* Interleaved chroma: even positions (U) get the per-sample residual
++     * from res, odd positions (V) get the constant dc_v. */
++    pixel *row = (pixel *)_dst;
++    int i, line;
++
++    stride /= sizeof(pixel);
++    for (line = 0; line < size; line++) {
++        for (i = 0; i < size; i++) {
++            row[2 * i]     = av_clip_pixel(row[2 * i] + *res++);
++            row[2 * i + 1] = av_clip_pixel(row[2 * i + 1] + dc_v);
++        }
++        row += stride;
++    }
++}
++
++static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
++                                                ptrdiff_t stride, const int dc_u, int size)
++{
++    /* Interleaved chroma: even positions (U) get the constant dc_u, odd
++     * positions (V) get the per-sample residual from res. */
++    pixel *row = (pixel *)_dst;
++    int i, line;
++
++    stride /= sizeof(pixel);
++    for (line = 0; line < size; line++) {
++        for (i = 0; i < size; i++) {
++            row[2 * i]     = av_clip_pixel(row[2 * i] + dc_u);
++            row[2 * i + 1] = av_clip_pixel(row[2 * i + 1] + *res++);
++        }
++        row += stride;
++    }
++}
++
++static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
++                                                ptrdiff_t stride, unsigned int size)
++{
++    /* res holds a size x size block of U residuals immediately followed by
++     * a size x size block of V residuals; the destination is interleaved U,V. */
++    pixel *row = (pixel *)_dst;
++    const int16_t *res_u = res;
++    const int16_t *res_v = res + size * size;
++    unsigned int i, line;
++
++//    rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
++//    rpi_sand_dump16("ARC In RU", res_u, size * 2, 0, 0, 0, size, size, 0);
++//    rpi_sand_dump16("ARC In RV", res_v, size * 2, 0, 0, 0, size, size, 0);
++
++    stride /= sizeof(pixel);
++
++    for (line = 0; line < size; line++, row += stride) {
++        for (i = 0; i < size; i++) {
++            row[2 * i]     = av_clip_pixel(row[2 * i]     + *res_u++);
++            row[2 * i + 1] = av_clip_pixel(row[2 * i + 1] + *res_v++);
++        }
++    }
++//    rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
++}
++
++
++static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
++{
++    int x, y;
++    pixel *dst = (pixel *)_dst;
++    // dc packs two signed 16-bit offsets: V in the high half, U in the low half.
++    const int dc_v = dc >> 16;
++    const int dc_u = (int16_t)dc;   // sign-extend the low half; left-shifting a signed value here would be UB
++    stride /= sizeof(pixel);
++
++    for (y = 0; y < size; y++) {
++        for (x = 0; x < size * 2; x += 2) {
++            dst[x] = av_clip_pixel(dst[x] + dc_u);
++            dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
++        }
++        dst += stride;
++    }
++}
++
++// -- Single plane -- (fixed-size wrappers; the _dc forms add one constant to every pixel)
++static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual)(_dst, res, stride, 32);
++}
++
++static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
++{
++    FUNC(add_residual_dc)(_dst, stride, dc, 32);
++}
++
++// -- U -- (plaited: U gets the full residual from res, V only the constant dc_v)
++
++static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
++                                  ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 4);
++}
++
++static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
++                                  ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 8);
++}
++
++static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    FUNC(add_residual_u)(_dst, res, stride, dc_v, 16);
++}
++
++static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_v)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++// -- V -- (plaited: V gets the full residual from res, U only the constant dc_u)
++
++static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
++                                  ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 4);
++}
++
++static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
++                                  ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 8);
++}
++
++static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    FUNC(add_residual_v)(_dst, res, stride, dc_u, 16);
++}
++
++static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride, int dc_u)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++// -- C -- (plaited - both U & V; res is the U block then the V block, _dc_c takes both DCs packed in one int32)
++
++static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual_c)(_dst, res, stride, 4);
++}
++
++static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
++                                  ptrdiff_t stride)
++{
++    FUNC(add_residual_c)(_dst, res, stride, 8);
++}
++
++static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride)
++{
++    FUNC(add_residual_c)(_dst, res, stride, 16);
++}
++
++static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
++                                    ptrdiff_t stride)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
++}
++
++static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
++}
++
++static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
++}
++
++static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
++{
++    // Should never occur for 420, which is all that sand supports
++    av_assert0(0);
++}
++
++
++static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
++{
++    /* In-place running sum over a (1 << log2_size)-square block: each
++     * coefficient is accumulated onto its predecessor - the one directly
++     * above when mode is non-zero, otherwise the one to its left. */
++    const int size = 1 << log2_size;
++    int16_t *row = _coeffs;
++    int i, y;
++
++    if (mode) {
++        /* Row 0 has nothing above it; rows are contiguous, so a flat
++         * walk from the start of row 1 visits them in the same order. */
++        for (i = size; i < size * size; i++)
++            row[i] += row[i - size];
++        return;
++    }
++
++    for (y = 0; y < size; y++, row += size)
++        for (i = 1; i < size; i++)
++            row[i] += row[i - 1];
++}
++
++static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
++{
++    /* Rescale a (1 << log2_size)-square coefficient block in place by
++     * 2^-shift, rounding to nearest when shift is positive. */
++    int shift  = 15 - BIT_DEPTH - log2_size;
++    int x, y;
++    int size = 1 << log2_size;
++
++    if (shift > 0) {
++        int offset = 1 << (shift - 1);
++        for (y = 0; y < size; y++)
++            for (x = 0; x < size; x++, coeffs++)
++                *coeffs = (*coeffs + offset) >> shift;
++    } else {
++        for (y = 0; y < size; y++) {
++            for (x = 0; x < size; x++, coeffs++) {
++                /* Shift the unsigned bit pattern: left-shifting a negative
++                 * signed value is undefined behaviour in C. */
++                *coeffs = *(uint16_t *)coeffs << -shift;
++            }
++        }
++    }
++}
++// Assignment policies for the TR_* macros: SET stores as-is; SCALE rounds with the caller's 'add'/'shift' and clips to int16
++#define SET(dst, x)   (dst) = (x)
++#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
++
++#define TR_4x4_LUMA(dst, src, step, assign)                             \
++    do {                                                                \
++        int c0 = src[0 * step] + src[2 * step];                         \
++        int c1 = src[2 * step] + src[3 * step];                         \
++        int c2 = src[0 * step] - src[3 * step];                         \
++        int c3 = 74 * src[1 * step];                                    \
++        /* c0..c3: partial sums shared by the outputs below */          \
++        assign(dst[2 * step], 74 * (src[0 * step] -                     \
++                                    src[2 * step] +                     \
++                                    src[3 * step]));                    \
++        assign(dst[0 * step], 29 * c0 + 55 * c1 + c3);                  \
++        assign(dst[1 * step], 55 * c2 - 29 * c1 + c3);                  \
++        assign(dst[3 * step], 55 * c0 + 29 * c2 - c3);                  \
++    } while (0)
++
++static void FUNC(transform_4x4_luma)(int16_t *coeffs)
++{
++    /* Two in-place TR_4x4_LUMA passes over the 4x4 block.  SCALE reads
++     * the locals 'shift' and 'add', so both must keep those names and
++     * are re-set before the second pass. */
++    int n;
++    int shift = 7;
++    int add   = 1 << (shift - 1);
++
++    /* Pass 1: one transform per column (element step 4). */
++    for (n = 0; n < 4; n++)
++        TR_4x4_LUMA((coeffs + n), (coeffs + n), 4, SCALE);
++
++    /* Pass 2: one transform per row (element step 1). */
++    shift = 20 - BIT_DEPTH;
++    add   = 1 << (shift - 1);
++    for (n = 0; n < 4; n++)
++        TR_4x4_LUMA((coeffs + 4 * n), (coeffs + 4 * n), 1, SCALE);
++}
++
++#undef TR_4x4_LUMA
++
++#define TR_4(dst, src, dstep, sstep, assign, end)                 \
++    do {                                                          \
++        const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
++        const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
++        const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
++        const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
++        /* e*: even inputs, o*: odd inputs; 'end' is unused */    \
++        assign(dst[0 * dstep], e0 + o0);                          \
++        assign(dst[1 * dstep], e1 + o1);                          \
++        assign(dst[2 * dstep], e1 - o1);                          \
++        assign(dst[3 * dstep], e0 - o0);                          \
++    } while (0)
++
++#define TR_8(dst, src, dstep, sstep, assign, end)                 \
++    do {                                                          \
++        int i, j;                                                 \
++        int e_8[4];                                               \
++        int o_8[4] = { 0 };                                       \
++        for (i = 0; i < 4; i++)                                   \
++            for (j = 1; j < end; j += 2)                          \
++                o_8[i] += transform[4 * j][i] * src[j * sstep];   \
++        TR_4(e_8, src, 1, 2 * sstep, SET, 4);                     \
++        /* odd terms come from transform[]; even via TR_4 */      \
++        for (i = 0; i < 4; i++) {                                 \
++            assign(dst[i * dstep], e_8[i] + o_8[i]);              \
++            assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]);        \
++        }                                                         \
++    } while (0)
++/* TR_16: 16-point stage. Odd inputs j (< end) use transform[2 * j][i]; even inputs go through TR_8 with end fixed at 8. */
++#define TR_16(dst, src, dstep, sstep, assign, end)                \
++    do {                                                          \
++        int i, j;                                                 \
++        int e_16[8];                                              \
++        int o_16[8] = { 0 };                                      \
++        for (i = 0; i < 8; i++)                                   \
++            for (j = 1; j < end; j += 2)                          \
++                o_16[i] += transform[2 * j][i] * src[j * sstep];  \
++        TR_8(e_16, src, 1, 2 * sstep, SET, 8);                    \
++        /* dst[i] = e + o and dst[15 - i] = e - o */              \
++        for (i = 0; i < 8; i++) {                                 \
++            assign(dst[i * dstep], e_16[i] + o_16[i]);            \
++            assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]);     \
++        }                                                         \
++    } while (0)
++/* TR_32: 32-point stage. Odd inputs j (< end) use transform[j][i]; even inputs go through TR_16 with end / 2. */
++#define TR_32(dst, src, dstep, sstep, assign, end)                \
++    do {                                                          \
++        int i, j;                                                 \
++        int e_32[16];                                             \
++        int o_32[16] = { 0 };                                     \
++        for (i = 0; i < 16; i++)                                  \
++            for (j = 1; j < end; j += 2)                          \
++                o_32[i] += transform[j][i] * src[j * sstep];      \
++        TR_16(e_32, src, 1, 2 * sstep, SET, end / 2);             \
++        /* dst[i] = e + o and dst[31 - i] = e - o */              \
++        for (i = 0; i < 16; i++) {                                \
++            assign(dst[i * dstep], e_32[i] + o_32[i]);            \
++            assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]);     \
++        }                                                         \
++    } while (0)
++/* IDCT_VAR<H>: limit locals for IDCT(H). The 4-point form omits `limit`; TR_4 never expands its `end` argument. */
++#define IDCT_VAR4(H)                                              \
++    int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR8(H)                                              \
++    int limit  = FFMIN(col_limit, H);                             \
++    int limit2 = FFMIN(col_limit + 4, H)
++#define IDCT_VAR16(H)   IDCT_VAR8(H)
++#define IDCT_VAR32(H)   IDCT_VAR8(H)
++/* IDCT(H): define idct_HxH(coeffs, col_limit), an in-place two-pass H x H transform built on TR_H. */
++#define IDCT(H)                                                   \
++static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs,          \
++                                        int col_limit)            \
++{                                                                 \
++    int i;                                                        \
++    int      shift = 7;                                           \
++    int      add   = 1 << (shift - 1);                            \
++    int16_t *src   = coeffs;                                      \
++    IDCT_VAR ## H(H);                                             \
++    /* Pass 1: taps H apart, start moves by 1; limit2 is TR_H's `end`. */ \
++    for (i = 0; i < H; i++) {                                     \
++        TR_ ## H(src, src, H, H, SCALE, limit2);                  \
++        if (limit2 < H && i%4 == 0 && !!i) /* i = 4, 8, ... only */ \
++            limit2 -= 4;                                          \
++        src++;                                                    \
++    }                                                             \
++    /* Pass 2: adjacent taps, start moves by H; `limit` is TR_H's `end`. */ \
++    shift = 20 - BIT_DEPTH;                                       \
++    add   = 1 << (shift - 1);                                     \
++    for (i = 0; i < H; i++) {                                     \
++        TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit);             \
++        coeffs += H;                                              \
++    }                                                             \
++}
++/* IDCT_DC(H): define idct_HxH_dc(coeffs), which fills all H * H entries with one value derived from coeffs[0]. */
++#define IDCT_DC(H)                                                \
++static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs)    \
++{                                                                 \
++    int i, j;                                                     \
++    int shift = 14 - BIT_DEPTH;                                   \
++    int add   = 1 << (shift - 1);                                 \
++    int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift;          \
++    /* coeff is computed once above, before coeffs[0] is overwritten */ \
++    for (j = 0; j < H; j++) {                                     \
++        for (i = 0; i < H; i++) {                                 \
++            coeffs[i + j * H] = coeff;                            \
++        }                                                         \
++    }                                                             \
++}
++/* Instantiate idct_HxH and idct_HxH_dc for H = 4, 8, 16 and 32. */
++IDCT( 4)
++IDCT( 8)
++IDCT(16)
++IDCT(32)
++
++IDCT_DC( 4)
++IDCT_DC( 8)
++IDCT_DC(16)
++IDCT_DC(32)
++/* The transform helper macros are dropped here; code below cannot use them. */
++#undef TR_4
++#undef TR_8
++#undef TR_16
++#undef TR_32
++
++#undef SET
++#undef SCALE
++/* SAO band offset: dst[x] = clip(src[x] + offset_table[band]), with band = src[x] >> (BIT_DEPTH - 5). */
++static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  int16_t *sao_offset_val, int sao_left_class,
++                                  int width, int height)
++{
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int offset_table[32] = { 0 };
++    int k, y, x;
++    int shift  = BIT_DEPTH - 5;
++    // Convert both strides to units of pixel.
++    stride_dst /= sizeof(pixel);
++    stride_src /= sizeof(pixel);
++    // Bands sao_left_class .. +3 (mod 32) take sao_offset_val[1..4]; all other bands stay 0.
++    for (k = 0; k < 4; k++)
++        offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)  // "& 31" keeps an out-of-range sample inside offset_table
++            dst[x] = av_clip_pixel(src[x] + offset_table[(src[x] >> shift) & 31]);
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++/* CMP(a, b): -1, 0 or 1 for a < b, a == b, a > b. Each argument is evaluated twice. */
++#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
++/* SAO edge offset: compare each sample with its two neighbours along direction eo, then add sao_offset_val[class]. */
++static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
++                                  int eo, int width, int height) {
++    // edge_idx maps 2 + diff0 + diff1 (range 0..4) to an index into sao_offset_val.
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++    static const int8_t pos[4][2][2] = {
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int a_stride, b_stride;
++    int x, y;
++    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
++    stride_dst /= sizeof(pixel);
++    // pos[eo][n] is an {x, y} step; fold each into a single offset from src[x].
++    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
++    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++) {
++            int diff0 = CMP(src[x], src[x + a_stride]);
++            int diff1 = CMP(src[x], src[x + b_stride]);
++            int offset_val        = edge_idx[2 + diff0 + diff1];
++            dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
++        }
++        src += stride_src;
++        dst += stride_dst;
++    }
++}
++
++
++#if BIT_DEPTH == 10
++// The _c restores need a 32-bit pixel build: borrow the BIT_DEPTH == 10 pass and redefine pixel/BIT_DEPTH for it
++#undef pixel
++#undef BIT_DEPTH
++#define pixel uint32_t
++#define BIT_DEPTH 32
++// Depths 10..16 are all aliased to the _9 build (presumably the same 16-bit pixel code)
++#define sao_edge_restore_0_10 sao_edge_restore_0_9
++#define sao_edge_restore_1_10 sao_edge_restore_1_9
++#define sao_edge_restore_0_11 sao_edge_restore_0_9
++#define sao_edge_restore_1_11 sao_edge_restore_1_9
++#define sao_edge_restore_0_12 sao_edge_restore_0_9
++#define sao_edge_restore_1_12 sao_edge_restore_1_9
++#define sao_edge_restore_0_13 sao_edge_restore_0_9
++#define sao_edge_restore_1_13 sao_edge_restore_1_9
++#define sao_edge_restore_0_14 sao_edge_restore_0_9
++#define sao_edge_restore_1_14 sao_edge_restore_1_9
++#define sao_edge_restore_0_15 sao_edge_restore_0_9
++#define sao_edge_restore_1_15 sao_edge_restore_1_9
++#define sao_edge_restore_0_16 sao_edge_restore_0_9
++#define sao_edge_restore_1_16 sao_edge_restore_1_9
++#endif
++#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
++static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
++                                    int *borders, int _width, int _height,
++                                    int c_idx, uint8_t *vert_edge,
++                                    uint8_t *horiz_edge, uint8_t *diag_edge)  // the three *_edge arguments are unused here
++{
++    int x, y;
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int sao_eo_class    = sao->eo_class[c_idx];
++    int init_x = 0, width = _width, height = _height;
++    // Copy src over dst on the columns/rows flagged in borders[]: 0 left, 1 top, 2 right, 3 bottom.
++    stride_dst /= sizeof(pixel);
++    stride_src /= sizeof(pixel);
++    // Columns are skipped for SAO_EO_VERT, rows for SAO_EO_HORIZ.
++    if (sao_eo_class != SAO_EO_VERT) {
++        if (borders[0]) {
++            for (y = 0; y < height; y++) {
++                dst[y * stride_dst] = src[y * stride_src];
++            }
++            init_x = 1;
++        }
++        if (borders[2]) {
++            int offset     = width - 1;
++            for (x = 0; x < height; x++) {  // x serves as the row index in this loop
++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
++            }
++            width--;
++        }
++    }
++    if (sao_eo_class != SAO_EO_HORIZ) {
++        if (borders[1]) {
++            for (x = init_x; x < width; x++)
++                dst[x] = src[x];
++        }
++        if (borders[3]) {
++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++            ptrdiff_t y_stride_src = stride_src * (height - 1);
++            for (x = init_x; x < width; x++)
++                dst[x + y_stride_dst] = src[x + y_stride_src];
++            height--;  // not read again before the function returns
++        }
++    }
++}
++/* Border restore as in sao_edge_restore_0, plus a second stage driven by vert_edge/horiz_edge/diag_edge. */
++static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
++                                    ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
++                                    int *borders, int _width, int _height,
++                                    int c_idx, uint8_t *vert_edge,
++                                    uint8_t *horiz_edge, uint8_t *diag_edge)
++{
++    int x, y;
++    pixel *dst = (pixel *)_dst;
++    pixel *src = (pixel *)_src;
++    int sao_eo_class    = sao->eo_class[c_idx];
++    int init_x = 0, init_y = 0, width = _width, height = _height;
++    // Stage 1: copy src over dst on the columns/rows flagged in borders[]: 0 left, 1 top, 2 right, 3 bottom.
++    stride_dst /= sizeof(pixel);
++    stride_src /= sizeof(pixel);
++
++    if (sao_eo_class != SAO_EO_VERT) {
++        if (borders[0]) {
++            for (y = 0; y < height; y++) {
++                dst[y * stride_dst] = src[y * stride_src];
++            }
++            init_x = 1;
++        }
++        if (borders[2]) {
++            int offset     = width - 1;
++            for (x = 0; x < height; x++) {  // x serves as the row index in this loop
++                dst[x * stride_dst + offset] = src[x * stride_src + offset];
++            }
++            width--;
++        }
++    }
++    if (sao_eo_class != SAO_EO_HORIZ) {
++        if (borders[1]) {
++            for (x = init_x; x < width; x++)
++                dst[x] = src[x];
++            init_y = 1;
++        }
++        if (borders[3]) {
++            ptrdiff_t y_stride_dst = stride_dst * (height - 1);
++            ptrdiff_t y_stride_src = stride_src * (height - 1);
++            for (x = init_x; x < width; x++)
++                dst[x + y_stride_dst] = src[x + y_stride_src];
++            height--;
++        }
++    }
++    // Stage 2: each save_* flag is 1 when its corner must be left out of the edge copies below.
++    {
++        int save_upper_left  = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
++        int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D  && !borders[1] && !borders[2];
++        int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
++        int save_lower_left  = !diag_edge[3] && sao_eo_class == SAO_EO_45D  && !borders[0] && !borders[3];
++
++        // Restore pixels that can't be modified
++        if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
++            for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
++                dst[y*stride_dst] = src[y*stride_src];
++        }
++        if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
++            for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
++                dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
++        }
++
++        if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
++            for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
++                dst[x] = src[x];
++        }
++        if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
++            for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
++                dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
++        }
++        if(diag_edge[0] && sao_eo_class == SAO_EO_135D)  // single corner samples follow
++            dst[0] = src[0];
++        if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
++            dst[width-1] = src[width-1];
++        if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
++            dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
++        if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
++            dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
++
++    }
++}
++#endif
++#if BIT_DEPTH == 32  // only true after the temporary override made in the BIT_DEPTH == 10 pass
++#undef BIT_DEPTH
++#undef pixel
++#define BIT_DEPTH 10  // put back the 10-bit settings for the rest of this pass
++#define pixel uint16_t
++#endif
++
++// --- Plaited chroma versions (U and V samples interleaved: U at even x, V at odd x)
++
++static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
++                                  ptrdiff_t stride_dst, ptrdiff_t stride_src,
++                                  const int16_t *sao_offset_val_u, int sao_left_class_u,
++                                  const int16_t *sao_offset_val_v, int sao_left_class_v,
++                                  int width, int height)
++{
++    pixel *dst = (pixel *)_dst;
++    const pixel *src = (const pixel *)_src;
++    int offset_table_u[32] = { 0 };
++    int offset_table_v[32] = { 0 };
++    int k, y, x;
++    int shift  = BIT_DEPTH - 5;
++    // SAO band offset for interleaved U/V rows; width arrives as a count of U/V pairs.
++    stride_dst /= sizeof(pixel);
++    stride_src /= sizeof(pixel);
++    width *= 2;
++    // Bands left_class .. +3 (mod 32) take sao_offset_val_*[1..4]; all other bands stay 0.
++    for (k = 0; k < 4; k++)
++    {
++        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
++        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
++    }
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x += 2)
++        {
++            // Even samples use the U table and odd samples the V table.
++            // The "& 31" keeps the band index inside the 32-entry tables even
++            // when a sample exceeds the BIT_DEPTH range; the earlier note here
++            // reported such broken input crashing the 10-bit path.
++            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
++            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
++        }
++        dst += stride_dst;
++        src += stride_src;
++    }
++}
++/* SAO edge offset for interleaved U/V rows: U and V are classified separately and use separate offset arrays. */
++static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
++                                  const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
++                                  int eo, int width, int height) {
++    // edge_idx maps 2 + diff0 + diff1 (range 0..4) to an index into the offset arrays.
++    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
++    static const int8_t pos[4][2][2] = {
++        { { -1,  0 }, {  1, 0 } }, // horizontal
++        { {  0, -1 }, {  0, 1 } }, // vertical
++        { { -1, -1 }, {  1, 1 } }, // 45 degree
++        { {  1, -1 }, { -1, 1 } }, // 135 degree
++    };
++    pixel *dst = (pixel *)_dst;
++    const pixel *src = (const pixel *)_src;
++    int a_stride, b_stride;
++    int x, y;
++    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
++    // width arrives as a count of U/V pairs; double it to count interleaved samples.
++    stride_dst /= sizeof(pixel);
++    width *= 2;
++
++    av_assert0(width <= 64);
++    // Horizontal steps are doubled so a U sample is compared with U and a V sample with V.
++    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
++    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x += 2) {
++            int diff0u = CMP(src[x], src[x + a_stride]);
++            int diff1u = CMP(src[x], src[x + b_stride]);
++            int offset_valu        = edge_idx[2 + diff0u + diff1u];
++            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
++            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
++            int offset_valv        = edge_idx[2 + diff0v + diff1v];
++            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
++            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
++        }
++        src += stride_src;
++        dst += stride_dst;
++    }
++}
++
++// Define the chroma restore aliases once only, during the BIT_DEPTH == 8 pass
++#if BIT_DEPTH == 8
++// An 8-bit U/V pair is 2 bytes, so any 2-byte 'normal' restore will work for these
++#define sao_edge_restore_c_0_8  sao_edge_restore_0_16
++#define sao_edge_restore_c_1_8  sao_edge_restore_1_16
++// From 9 bits up the 32-bit restore is needed (presumably one U/V pair per 32-bit unit)
++#define sao_edge_restore_c_0_9  sao_edge_restore_0_32
++#define sao_edge_restore_c_1_9  sao_edge_restore_1_32
++#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
++#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
++#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
++#endif
++
++#undef CMP
++
++////////////////////////////////////////////////////////////////////////////////
++// put_hevc_pel_*: prediction kernels that apply no interpolation filter
++////////////////////////////////////////////////////////////////////////////////
++static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src          = (pixel *)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    // dst (row pitch MAX_PB_SIZE) = src << (14 - BIT_DEPTH); mx and my are unused.
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = src[x] << (14 - BIT_DEPTH);
++        src += srcstride;
++        dst += MAX_PB_SIZE;
++    }
++}
++/* Copy a width x height block of pixels from src to dst row by row; mx and my are unused. */
++static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                          int height, intptr_t mx, intptr_t my, int width)
++{
++    int y;
++    pixel *src          = (pixel *)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    for (y = 0; y < height; y++) {
++        memcpy(dst, src, width * sizeof(pixel));
++        src += srcstride;
++        dst += dststride;
++    }
++}
++/* Combine scaled src pixels with the int16 buffer src2 (row pitch MAX_PB_SIZE), round, shift and clip into dst. */
++static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int16_t *src2,
++                                         int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src          = (pixel *)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    // offset is half of 1 << shift (the rounding term); it is forced to 0 when BIT_DEPTH >= 14.
++    int shift = 14  + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((src[x] << (14 - BIT_DEPTH)) + src2[x] + offset) >> shift);
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++/* Weighted copy: dst = clip((((src << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox), shift = denom + 14 - BIT_DEPTH. */
++static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src          = (pixel *)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++    // ox is multiplied by 1 << (BIT_DEPTH - 8) before use.
++    ox     = ox * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel((((src[x] << (14 - BIT_DEPTH)) * wx + offset) >> shift) + ox);
++        src += srcstride;
++        dst += dststride;
++    }
++}
++/* Weighted bi-combination: scaled src is weighted by wx1, src2 (row pitch MAX_PB_SIZE) by wx0; ox0/ox1 are added offsets. */
++static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                           int16_t *src2,
++                                           int height, int denom, int wx0, int wx1,
++                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src          = (pixel *)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    // The final right shift is log2Wd + 1 = denom + 15 - BIT_DEPTH.
++    int shift = 14  + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++    // Both offsets are multiplied by 1 << (BIT_DEPTH - 8) before use.
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++) {
++            dst[x] = av_clip_pixel(( (src[x] << (14 - BIT_DEPTH)) * wx1 + src2[x] * wx0 + (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
++        }
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++////////////////////////////////////////////////////////////////////////////////
++// put_hevc_qpel_*: prediction kernels built on the 8-tap QPEL_FILTER
++////////////////////////////////////////////////////////////////////////////////
++#define QPEL_FILTER(src, stride) /* 8-tap sum over src[x - 3 * stride .. x + 4 * stride]; reads `filter` and `x` from the expansion site */ \
++    (filter[0] * src[x - 3 * stride] +                                         \
++     filter[1] * src[x - 2 * stride] +                                         \
++     filter[2] * src[x -     stride] +                                         \
++     filter[3] * src[x             ] +                                         \
++     filter[4] * src[x +     stride] +                                         \
++     filter[5] * src[x + 2 * stride] +                                         \
++     filter[6] * src[x + 3 * stride] +                                         \
++     filter[7] * src[x + 4 * stride])
++/* Horizontal 8-tap filter chosen by mx; result >> (BIT_DEPTH - 8) goes to int16 dst (row pitch MAX_PB_SIZE). my is unused. */
++static void FUNC(put_hevc_qpel_h)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++        src += srcstride;
++        dst += MAX_PB_SIZE;
++    }
++}
++/* Vertical 8-tap filter chosen by my; result >> (BIT_DEPTH - 8) goes to int16 dst (row pitch MAX_PB_SIZE). mx is unused. */
++static void FUNC(put_hevc_qpel_v)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
++    for (y = 0; y < height; y++)  {
++        for (x = 0; x < width; x++)
++            dst[x] = QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8);
++        src += srcstride;
++        dst += MAX_PB_SIZE;
++    }
++}
++/* Separable filter: horizontal pass (mx) into a temporary buffer, then vertical pass (my) into int16 dst. */
++static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
++                                   uint8_t *_src,
++                                   ptrdiff_t _srcstride,
++                                   int height, intptr_t mx,
++                                   intptr_t my, int width)
++{
++    int x, y;
++    const int8_t *filter;
++    pixel *src = (pixel*)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *tmp = tmp_array;
++    // Pass 1: start QPEL_EXTRA_BEFORE rows above src and filter height + QPEL_EXTRA rows horizontally.
++    src   -= QPEL_EXTRA_BEFORE * srcstride;
++    filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    for (y = 0; y < height + QPEL_EXTRA; y++) {
++        for (x = 0; x < width; x++)
++            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++        src += srcstride;
++        tmp += MAX_PB_SIZE;
++    }
++    // Pass 2: filter tmp vertically (row pitch MAX_PB_SIZE), then shift right by 6.
++    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_qpel_filters[my - 1];
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6;
++        tmp += MAX_PB_SIZE;
++        dst += MAX_PB_SIZE;
++    }
++}
++/* Horizontal 8-tap filter chosen by mx, rounded, shifted by 14 - BIT_DEPTH and clipped into pixel dst. my is unused. */
++static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst,  ptrdiff_t _dststride,
++                                      uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
++    int shift = 14 - BIT_DEPTH;
++    // offset is half of 1 << shift (the rounding term); it is forced to 0 when BIT_DEPTH >= 14.
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++        src += srcstride;
++        dst += dststride;
++    }
++}
++/* Horizontal 8-tap filter chosen by mx, summed with src2 (row pitch MAX_PB_SIZE), rounded, shifted and clipped into dst. */
++static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
++    // shift is one more than in the uni variant because two terms are summed.
++    int shift = 14  + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++/* Vertical 8-tap filter chosen by my, rounded, shifted by 14 - BIT_DEPTH and clipped into pixel dst. mx is unused. */
++static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst,  ptrdiff_t _dststride,
++                                     uint8_t *_src, ptrdiff_t _srcstride,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
++    int shift = 14 - BIT_DEPTH;
++    // offset is half of 1 << shift (the rounding term); it is forced to 0 when BIT_DEPTH >= 14.
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + offset) >> shift);
++        src += srcstride;
++        dst += dststride;
++    }
++}
++
++/* Vertical 8-tap filter chosen by my, summed with src2 (row pitch MAX_PB_SIZE), rounded, shifted and clipped into dst. */
++static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
++    // shift is one more than in the uni variant because two terms are summed.
++    int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++/* Separable filter: horizontal pass (mx) into a temporary buffer, then vertical pass (my), rounded and clipped into dst. */
++static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
++                                       uint8_t *_src, ptrdiff_t _srcstride,
++                                       int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    const int8_t *filter;
++    pixel *src = (pixel*)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *tmp = tmp_array;
++    int shift =  14 - BIT_DEPTH;
++    // offset is half of 1 << shift (the rounding term); it is forced to 0 when BIT_DEPTH >= 14.
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++    // Pass 1: start QPEL_EXTRA_BEFORE rows above src and filter height + QPEL_EXTRA rows horizontally.
++    src   -= QPEL_EXTRA_BEFORE * srcstride;
++    filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    for (y = 0; y < height + QPEL_EXTRA; y++) {
++        for (x = 0; x < width; x++)
++            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++        src += srcstride;
++        tmp += MAX_PB_SIZE;
++    }
++    // Pass 2: filter tmp vertically (row pitch MAX_PB_SIZE), then >> 6, round, shift and clip.
++    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + offset) >> shift);
++        tmp += MAX_PB_SIZE;
++        dst += dststride;
++    }
++}
++/* Separable filter (mx then my) summed with src2 (row pitch MAX_PB_SIZE), rounded, shifted and clipped into dst. */
++static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                      int16_t *src2,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    const int8_t *filter;
++    pixel *src = (pixel*)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *tmp = tmp_array;
++    int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    int offset = 1 << (shift - 1);
++#else
++    int offset = 0;
++#endif
++    // Pass 1: start QPEL_EXTRA_BEFORE rows above src and filter height + QPEL_EXTRA rows horizontally.
++    src   -= QPEL_EXTRA_BEFORE * srcstride;
++    filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    for (y = 0; y < height + QPEL_EXTRA; y++) {
++        for (x = 0; x < width; x++)
++            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++        src += srcstride;
++        tmp += MAX_PB_SIZE;
++    }
++    // Pass 2: filter tmp vertically, >> 6, add src2 and the rounding offset, shift and clip.
++    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++        tmp  += MAX_PB_SIZE;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Weighted uni-prediction using only the horizontal qpel filter (my is unused).
++// Each output is (((filtered >> (BIT_DEPTH - 8)) * wx + offset) >> shift) plus
++// the scaled ox, clipped by av_clip_pixel().
++static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst,  ptrdiff_t _dststride,
++                                        uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox,
++                                        intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_qpel_filters[mx - 1];
++    const int shift          = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    // ox is scaled by 2^(BIT_DEPTH - 8) before use.
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += out_step)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox_scaled);
++}
++
++// Weighted bi-prediction using only the horizontal qpel filter (my is unused).
++// Each output is (filtered * wx1 + src2[x] * wx0 + rounding) >> (log2Wd + 1),
++// clipped by av_clip_pixel(); src2 advances by MAX_PB_SIZE per row.
++static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[mx - 1];
++
++    int shift = 14  + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    // ox0 / ox1 are signed ints, so every scaling below uses multiplication:
++    // left-shifting a negative int is undefined behaviour in C, and
++    // ox0 + ox1 + 1 in the rounding term can be negative.
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Weighted uni-prediction using only the vertical qpel filter (mx is unused).
++// Each output is (((filtered >> (BIT_DEPTH - 8)) * wx + offset) >> shift) plus
++// the scaled ox, clipped by av_clip_pixel().
++static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst,  ptrdiff_t _dststride,
++                                        uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox,
++                                        intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_qpel_filters[my - 1];
++    const int shift          = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    // ox is scaled by 2^(BIT_DEPTH - 8) before use.
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    // The filter taps step through rows, so the source row pitch is the tap stride.
++    for (row = 0; row < height; row++, in += in_step, out += out_step)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((QPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox_scaled);
++}
++
++// Weighted bi-prediction using only the vertical qpel filter (mx is unused).
++// Each output is (filtered * wx1 + src2[x] * wx0 + rounding) >> (log2Wd + 1),
++// clipped by av_clip_pixel(); src2 advances by MAX_PB_SIZE per row.
++static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel        *src       = (pixel*)_src;
++    ptrdiff_t     srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++
++    const int8_t *filter    = ff_hevc_rpi_qpel_filters[my - 1];
++
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    // ox0 / ox1 are signed ints, so every scaling below uses multiplication:
++    // left-shifting a negative int is undefined behaviour in C, and
++    // ox0 + ox1 + 1 in the rounding term can be negative.
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Weighted uni-prediction with horizontal then vertical qpel filtering.
++// Pass 1 writes horizontally filtered rows to a stack buffer; pass 2 filters that
++// buffer vertically, and each result is weighted by wx, rounded, shifted right by
++// denom + 14 - BIT_DEPTH, offset by the scaled ox and clipped with av_clip_pixel().
++static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst,  ptrdiff_t _dststride,
++                                         uint8_t *_src, ptrdiff_t _srcstride,
++                                         int height, int denom, int wx, int ox,
++                                         intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    // ox is scaled by 2^(BIT_DEPTH - 8) before use.
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++
++    int16_t hbuf[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *mid = hbuf;
++    // Pass 1 begins QPEL_EXTRA_BEFORE rows above the block.
++    pixel *in  = (pixel *)_src - QPEL_EXTRA_BEFORE * in_step;
++    pixel *out = (pixel *)_dst;
++    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    int x, row;
++
++    // Pass 1: horizontally filter height + QPEL_EXTRA rows into hbuf.
++    for (row = 0; row < height + QPEL_EXTRA; row++, in += in_step, mid += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            mid[x] = QPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++
++    // Pass 2: start QPEL_EXTRA_BEFORE rows into hbuf and filter vertically
++    // with the my-selected coefficients.
++    mid    = hbuf + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_qpel_filters[my - 1];
++    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_step) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((QPEL_FILTER(mid, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox_scaled);
++    }
++}
++
++// Weighted bi-prediction with horizontal then vertical qpel filtering.
++// Pass 1 stores horizontally filtered rows in tmp_array; pass 2 filters them
++// vertically. Each output is ((vertical result >> 6) * wx1 + src2[x] * wx0 +
++// rounding) >> (log2Wd + 1), clipped by av_clip_pixel().
++static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int16_t *src2,
++                                        int height, int denom, int wx0, int wx1,
++                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    const int8_t *filter;
++    pixel *src = (pixel*)_src;
++    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *tmp = tmp_array;
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    // Pass 1: horizontal filter, starting QPEL_EXTRA_BEFORE rows above the block.
++    src   -= QPEL_EXTRA_BEFORE * srcstride;
++    filter = ff_hevc_rpi_qpel_filters[mx - 1];
++    for (y = 0; y < height + QPEL_EXTRA; y++) {
++        for (x = 0; x < width; x++)
++            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
++        src += srcstride;
++        tmp += MAX_PB_SIZE;
++    }
++
++    // Pass 2: vertical filter over tmp_array, starting QPEL_EXTRA_BEFORE rows in.
++    tmp    = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_qpel_filters[my - 1];
++
++    // ox0 / ox1 are signed ints, so every scaling below uses multiplication:
++    // left-shifting a negative int is undefined behaviour in C, and
++    // ox0 + ox1 + 1 in the rounding term can be negative.
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        tmp  += MAX_PB_SIZE;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++////////////////////////////////////////////////////////////////////////////////
++// EPEL: 4-tap filter over src[x - stride] .. src[x + 2 * stride].
++// The macro relies on locals named `filter` (4 coefficients) and `x` being in
++// scope at the point of use.
++////////////////////////////////////////////////////////////////////////////////
++#define EPEL_FILTER(src, stride)                                               \
++    (filter[0] * src[x - stride] +                                             \
++     filter[1] * src[x]          +                                             \
++     filter[2] * src[x + stride] +                                             \
++     filter[3] * src[x + 2 * stride])
++
++// Horizontal epel filter producing 16-bit intermediates: dst[x] receives
++// EPEL_FILTER >> (BIT_DEPTH - 8) and dst advances by MAX_PB_SIZE per row.
++// my is unused.
++static void FUNC(put_hevc_epel_h)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step = _srcstride / sizeof(pixel);
++    const int8_t *filter    = ff_hevc_rpi_epel_filters[mx - 1];
++    pixel *in = (pixel *)_src;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, dst += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            dst[x] = EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++}
++
++// Vertical epel filter producing 16-bit intermediates: dst[x] receives
++// EPEL_FILTER >> (BIT_DEPTH - 8) and dst advances by MAX_PB_SIZE per row.
++// mx is unused.
++static void FUNC(put_hevc_epel_v)(int16_t *dst,
++                                  uint8_t *_src, ptrdiff_t _srcstride,
++                                  int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step = _srcstride / sizeof(pixel);
++    const int8_t *filter    = ff_hevc_rpi_epel_filters[my - 1];
++    pixel *in = (pixel *)_src;
++    int x, row;
++
++    // The taps step through rows, so the source row pitch is the tap stride.
++    for (row = 0; row < height; row++, in += in_step, dst += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            dst[x] = EPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8);
++}
++
++// Horizontal-then-vertical epel filter producing 16-bit intermediates.
++// Pass 1 stores horizontally filtered rows in a stack buffer; pass 2 filters
++// that buffer vertically and writes (result >> 6) to dst, which advances by
++// MAX_PB_SIZE per row.
++static void FUNC(put_hevc_epel_hv)(int16_t *dst,
++                                   uint8_t *_src, ptrdiff_t _srcstride,
++                                   int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step = _srcstride / sizeof(pixel);
++    int16_t hbuf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *mid = hbuf;
++    // Pass 1 begins EPEL_EXTRA_BEFORE rows above the block.
++    pixel *in = (pixel *)_src - EPEL_EXTRA_BEFORE * in_step;
++    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++    int x, row;
++
++    // Pass 1: horizontally filter height + EPEL_EXTRA rows into hbuf.
++    for (row = 0; row < height + EPEL_EXTRA; row++, in += in_step, mid += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            mid[x] = EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++
++    // Pass 2: start EPEL_EXTRA_BEFORE rows into hbuf and filter vertically
++    // with the my-selected coefficients.
++    mid    = hbuf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_epel_filters[my - 1];
++    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, dst += MAX_PB_SIZE) {
++        for (x = 0; x < width; x++)
++            dst[x] = EPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
++    }
++}
++
++// Uni-prediction using only the horizontal epel filter (my is unused).
++// Each output is ((filtered >> (BIT_DEPTH - 8)) + offset) >> (14 - BIT_DEPTH),
++// clipped by av_clip_pixel().
++static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_epel_filters[mx - 1];
++    const int shift          = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, in += in_step, out += out_step)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) + offset) >> shift);
++}
++
++// Bi-prediction using only the horizontal epel filter (my is unused).
++// Each output is ((filtered >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift,
++// clipped by av_clip_pixel(); src2 advances by MAX_PB_SIZE per row.
++static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_epel_filters[mx - 1];
++    const int shift          = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, out += out_step, in += in_step, src2 += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++}
++
++// Uni-prediction using only the vertical epel filter (mx is unused).
++// Each output is ((filtered >> (BIT_DEPTH - 8)) + offset) >> (14 - BIT_DEPTH),
++// clipped by av_clip_pixel().
++static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_epel_filters[my - 1];
++    const int shift          = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    // The taps step through rows, so the source row pitch is the tap stride.
++    for (row = 0; row < height; row++, in += in_step, out += out_step)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((EPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8)) + offset) >> shift);
++}
++
++// Bi-prediction using only the vertical epel filter (mx is unused).
++// Each output is ((filtered >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift,
++// clipped by av_clip_pixel(); src2 advances by MAX_PB_SIZE per row.
++static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                     int16_t *src2,
++                                     int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_epel_filters[my - 1];
++    const int shift          = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    // The taps step through rows, so the source row pitch is the tap stride.
++    for (row = 0; row < height; row++, out += out_step, in += in_step, src2 += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((EPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8)) + src2[x] + offset) >> shift);
++}
++
++// Uni-prediction with horizontal then vertical epel filtering.
++// Each output is ((vertical result >> 6) + offset) >> (14 - BIT_DEPTH), clipped
++// by av_clip_pixel().
++static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int shift = 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++
++    int16_t hbuf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *mid = hbuf;
++    // Pass 1 begins EPEL_EXTRA_BEFORE rows above the block.
++    pixel *in  = (pixel *)_src - EPEL_EXTRA_BEFORE * in_step;
++    pixel *out = (pixel *)_dst;
++    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++    int x, row;
++
++    // Pass 1: horizontally filter height + EPEL_EXTRA rows into hbuf.
++    for (row = 0; row < height + EPEL_EXTRA; row++, in += in_step, mid += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            mid[x] = EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++
++    // Pass 2: start EPEL_EXTRA_BEFORE rows into hbuf and filter vertically
++    // with the my-selected coefficients.
++    mid    = hbuf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_epel_filters[my - 1];
++    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_step) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((EPEL_FILTER(mid, MAX_PB_SIZE) >> 6) + offset) >> shift);
++    }
++}
++
++// Bi-prediction with horizontal then vertical epel filtering.
++// Each output is ((vertical result >> 6) + src2[x] + offset) >> shift, clipped
++// by av_clip_pixel(); src2 advances by MAX_PB_SIZE per output row.
++static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                      int16_t *src2,
++                                      int height, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int shift = 14 + 1 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++
++    int16_t hbuf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *mid = hbuf;
++    // Pass 1 begins EPEL_EXTRA_BEFORE rows above the block.
++    pixel *in  = (pixel *)_src - EPEL_EXTRA_BEFORE * in_step;
++    pixel *out = (pixel *)_dst;
++    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++    int x, row;
++
++    // Pass 1: horizontally filter height + EPEL_EXTRA rows into hbuf.
++    for (row = 0; row < height + EPEL_EXTRA; row++, in += in_step, mid += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            mid[x] = EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++
++    // Pass 2: start EPEL_EXTRA_BEFORE rows into hbuf, filter vertically and
++    // combine with src2.
++    mid    = hbuf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_epel_filters[my - 1];
++    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_step, src2 += MAX_PB_SIZE) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((EPEL_FILTER(mid, MAX_PB_SIZE) >> 6) + src2[x] + offset) >> shift);
++    }
++}
++
++// Weighted uni-prediction using only the horizontal epel filter (my is unused).
++// Each output is (((filtered >> (BIT_DEPTH - 8)) * wx + offset) >> shift) plus
++// the scaled ox, clipped by av_clip_pixel().
++static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_epel_filters[mx - 1];
++    const int shift          = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    // ox is scaled by 2^(BIT_DEPTH - 8) before use.
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    for (row = 0; row < height; row++, out += out_step, in += in_step)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox_scaled);
++}
++
++// Weighted bi-prediction using only the horizontal epel filter (my is unused).
++// Each output is (filtered * wx1 + src2[x] * wx0 + rounding) >> (log2Wd + 1),
++// clipped by av_clip_pixel(); src2 advances by MAX_PB_SIZE per row.
++static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src = (pixel *)_src;
++    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    // ox0 / ox1 are signed ints, so every scaling below uses multiplication:
++    // left-shifting a negative int is undefined behaviour in C, and
++    // ox0 + ox1 + 1 in the rounding term can be negative.
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Weighted uni-prediction using only the vertical epel filter (mx is unused).
++// Each output is (((filtered >> (BIT_DEPTH - 8)) * wx + offset) >> shift) plus
++// the scaled ox, clipped by av_clip_pixel().
++static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int8_t *filter     = ff_hevc_rpi_epel_filters[my - 1];
++    const int shift          = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    // ox is scaled by 2^(BIT_DEPTH - 8) before use.
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++    pixel *in  = (pixel *)_src;
++    pixel *out = (pixel *)_dst;
++    int x, row;
++
++    // The taps step through rows, so the source row pitch is the tap stride.
++    for (row = 0; row < height; row++, out += out_step, in += in_step)
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((EPEL_FILTER(in, in_step) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox_scaled);
++}
++
++// Weighted bi-prediction using only the vertical epel filter (mx is unused).
++// Each output is (filtered * wx1 + src2[x] * wx0 + rounding) >> (log2Wd + 1),
++// clipped by av_clip_pixel(); src2 advances by MAX_PB_SIZE per row.
++static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                       int16_t *src2,
++                                       int height, int denom, int wx0, int wx1,
++                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    int x, y;
++    pixel *src = (pixel *)_src;
++    ptrdiff_t srcstride  = _srcstride / sizeof(pixel);
++    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
++    pixel *dst          = (pixel *)_dst;
++    ptrdiff_t dststride = _dststride / sizeof(pixel);
++    int shift = 14 + 1 - BIT_DEPTH;
++    int log2Wd = denom + shift - 1;
++
++    // ox0 / ox1 are signed ints, so every scaling below uses multiplication:
++    // left-shifting a negative int is undefined behaviour in C, and
++    // ox0 + ox1 + 1 in the rounding term can be negative.
++    ox0     = ox0 * (1 << (BIT_DEPTH - 8));
++    ox1     = ox1 * (1 << (BIT_DEPTH - 8));
++    for (y = 0; y < height; y++) {
++        for (x = 0; x < width; x++)
++            dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
++                                    ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++        src  += srcstride;
++        dst  += dststride;
++        src2 += MAX_PB_SIZE;
++    }
++}
++
++// Weighted uni-prediction with horizontal then vertical epel filtering.
++// Each output is ((((vertical result >> 6) * wx) + offset) >> shift) plus the
++// scaled ox, clipped by av_clip_pixel().
++static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                         int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int shift = denom + 14 - BIT_DEPTH;
++#if BIT_DEPTH < 14
++    const int offset = 1 << (shift - 1);
++#else
++    const int offset = 0;
++#endif
++    // ox is scaled by 2^(BIT_DEPTH - 8) before use.
++    const int ox_scaled = ox * (1 << (BIT_DEPTH - 8));
++
++    int16_t hbuf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *mid = hbuf;
++    // Pass 1 begins EPEL_EXTRA_BEFORE rows above the block.
++    pixel *in  = (pixel *)_src - EPEL_EXTRA_BEFORE * in_step;
++    pixel *out = (pixel *)_dst;
++    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++    int x, row;
++
++    // Pass 1: horizontally filter height + EPEL_EXTRA rows into hbuf.
++    for (row = 0; row < height + EPEL_EXTRA; row++, in += in_step, mid += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            mid[x] = EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++
++    // Pass 2: start EPEL_EXTRA_BEFORE rows into hbuf and filter vertically
++    // with the my-selected coefficients.
++    mid    = hbuf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_epel_filters[my - 1];
++    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_step) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel((((EPEL_FILTER(mid, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox_scaled);
++    }
++}
++
++// Weighted bi-prediction with horizontal then vertical epel filtering.
++// Each output is ((vertical result >> 6) * wx1 + src2[x] * wx0 + rounding)
++// >> (log2Wd + 1), clipped by av_clip_pixel(); src2 advances by MAX_PB_SIZE
++// per output row.
++static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
++                                        int16_t *src2,
++                                        int height, int denom, int wx0, int wx1,
++                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
++{
++    const ptrdiff_t in_step  = _srcstride / sizeof(pixel);
++    const ptrdiff_t out_step = _dststride / sizeof(pixel);
++    const int shift  = 14 + 1 - BIT_DEPTH;
++    const int log2Wd = denom + shift - 1;
++    // Both offsets are scaled by 2^(BIT_DEPTH - 8) before use.
++    const int ox0_scaled = ox0 * (1 << (BIT_DEPTH - 8));
++    const int ox1_scaled = ox1 * (1 << (BIT_DEPTH - 8));
++
++    int16_t hbuf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
++    int16_t *mid = hbuf;
++    // Pass 1 begins EPEL_EXTRA_BEFORE rows above the block.
++    pixel *in  = (pixel *)_src - EPEL_EXTRA_BEFORE * in_step;
++    pixel *out = (pixel *)_dst;
++    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
++    int x, row;
++
++    // Pass 1: horizontally filter height + EPEL_EXTRA rows into hbuf.
++    for (row = 0; row < height + EPEL_EXTRA; row++, in += in_step, mid += MAX_PB_SIZE)
++        for (x = 0; x < width; x++)
++            mid[x] = EPEL_FILTER(in, 1) >> (BIT_DEPTH - 8);
++
++    // Pass 2: start EPEL_EXTRA_BEFORE rows into hbuf, filter vertically and
++    // blend with the weighted src2.
++    mid    = hbuf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
++    filter = ff_hevc_rpi_epel_filters[my - 1];
++    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_step, src2 += MAX_PB_SIZE) {
++        for (x = 0; x < width; x++)
++            out[x] = av_clip_pixel(((EPEL_FILTER(mid, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
++                                    ((ox0_scaled + ox1_scaled + 1) * (1 << log2Wd))) >> (log2Wd + 1));
++    }
++}
++
++// Line zero of the current segment. pix addresses Q0; P0..P3 are the samples
++// at -1..-4 * xstride and Q1..Q3 those at +1..+3 * xstride. These macros rely
++// on locals named pix and xstride being in scope.
++#define P3 pix[-4 * xstride]
++#define P2 pix[-3 * xstride]
++#define P1 pix[-2 * xstride]
++#define P0 pix[-1 * xstride]
++#define Q0 pix[0 * xstride]
++#define Q1 pix[1 * xstride]
++#define Q2 pix[2 * xstride]
++#define Q3 pix[3 * xstride]
++
++// Line three: the same eight samples, 3 * ystride further on. Used only for
++// the deblocking decision. Also requires a local named ystride.
++#define TP3 pix[-4 * xstride + 3 * ystride]
++#define TP2 pix[-3 * xstride + 3 * ystride]
++#define TP1 pix[-2 * xstride + 3 * ystride]
++#define TP0 pix[-1 * xstride + 3 * ystride]
++#define TQ0 pix[0  * xstride + 3 * ystride]
++#define TQ1 pix[1  * xstride + 3 * ystride]
++#define TQ2 pix[2  * xstride + 3 * ystride]
++#define TQ3 pix[3  * xstride + 3 * ystride]
++
++// Luma deblocking core shared by the h/v wrappers.
++//
++// _pix addresses the first Q-side sample (Q0); P-side samples lie at negative
++// multiples of the x stride. _xstride and _ystride are byte strides: the
++// former steps across the edge, the latter from one filtered line to the next.
++// Two segments of four lines each are processed; segment j uses _tc[j],
++// _no_p[j] and _no_q[j]. beta and tc are scaled up by BIT_DEPTH - 8 bits.
++// A non-zero _no_p[j] / _no_q[j] leaves the P / Q side of that segment
++// unwritten.
++static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
++                                        ptrdiff_t _xstride, ptrdiff_t _ystride,
++                                        int beta, int *_tc,
++                                        uint8_t *_no_p, uint8_t *_no_q)
++{
++    int d, j;
++    pixel *pix        = (pixel *)_pix;
++    ptrdiff_t xstride = _xstride / sizeof(pixel);
++    ptrdiff_t ystride = _ystride / sizeof(pixel);
++
++    beta <<= BIT_DEPTH - 8;
++
++    for (j = 0; j < 2; j++) {
++        // Second differences on each side of the edge, taken on line 0
++        // (P*/Q*) and line 3 (TP*/TQ*) of this segment.
++        const int dp0  = abs(P2  - 2 * P1  + P0);
++        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
++        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
++        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
++        const int d0   = dp0 + dq0;
++        const int d3   = dp3 + dq3;
++        const int tc   = _tc[j]   << (BIT_DEPTH - 8);
++        const int no_p = _no_p[j];
++        const int no_q = _no_q[j];
++
++        if (d0 + d3 >= beta) {
++            // Activity at or above beta: skip the four lines of this segment.
++            pix += 4 * ystride;
++            continue;
++        } else {
++            const int beta_3 = beta >> 3;
++            const int beta_2 = beta >> 2;
++            const int tc25   = ((tc * 5 + 1) >> 1);
++
++            // Strong filtering requires all three tests to pass on both
++            // line 0 and line 3.
++            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
++                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
++                // strong filtering
++                const int tc2 = tc << 1;
++                for (d = 0; d < 4; d++) {
++                    // Snapshot the line first: the P*/Q* macros are written
++                    // below and later expressions need the unfiltered values.
++                    const int p3 = P3;
++                    const int p2 = P2;
++                    const int p1 = P1;
++                    const int p0 = P0;
++                    const int q0 = Q0;
++                    const int q1 = Q1;
++                    const int q2 = Q2;
++                    const int q3 = Q3;
++                    if (!no_p) {
++                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
++                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
++                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
++                    }
++                    if (!no_q) {
++                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
++                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
++                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
++                    }
++                    pix += ystride;
++                }
++            } else { // normal filtering
++                // nd_p / nd_q == 2 means P1 / Q1 is modified in addition to
++                // P0 / Q0.
++                int nd_p = 1;
++                int nd_q = 1;
++                const int tc_2 = tc >> 1;
++                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++                    nd_p = 2;
++                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++                    nd_q = 2;
++
++                for (d = 0; d < 4; d++) {
++                    const int p2 = P2;
++                    const int p1 = P1;
++                    const int p0 = P0;
++                    const int q0 = Q0;
++                    const int q1 = Q1;
++                    const int q2 = Q2;
++                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++                    // Lines whose delta is 10 * tc or more are left untouched.
++                    if (abs(delta0) < 10 * tc) {
++                        delta0 = av_clip(delta0, -tc, tc);
++                        if (!no_p)
++                            P0 = av_clip_pixel(p0 + delta0);
++                        if (!no_q)
++                            Q0 = av_clip_pixel(q0 - delta0);
++                        if (!no_p && nd_p > 1) {
++                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++                            P1 = av_clip_pixel(p1 + deltap1);
++                        }
++                        if (!no_q && nd_q > 1) {
++                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++                            Q1 = av_clip_pixel(q1 + deltaq1);
++                        }
++                    }
++                    pix += ystride;
++                }
++            }
++        }
++    }
++}
++
++// Chroma deblocking core shared by the h/v wrappers.
++// _pix addresses Q0; _xstride (bytes) steps across the edge and _ystride
++// (bytes) from one filtered line to the next. Two segments of four lines are
++// handled; segment j is skipped when _tc[j] scaled by BIT_DEPTH - 8 bits is
++// <= 0. A non-zero _no_p[j] / _no_q[j] leaves P0 / Q0 of that segment unwritten.
++static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
++                                          ptrdiff_t _ystride, int *_tc,
++                                          uint8_t *_no_p, uint8_t *_no_q)
++{
++    const ptrdiff_t xstride = _xstride / sizeof(pixel);
++    const ptrdiff_t ystride = _ystride / sizeof(pixel);
++    pixel *pix = (pixel *)_pix;
++    int seg, line;
++
++    for (seg = 0; seg < 2; seg++) {
++        const int tc = _tc[seg] << (BIT_DEPTH - 8);
++        int skip_p, skip_q;
++
++        if (tc <= 0) {
++            // Nothing to filter here: step over the segment's four lines.
++            pix += 4 * ystride;
++            continue;
++        }
++        skip_p = _no_p[seg];
++        skip_q = _no_q[seg];
++
++        for (line = 0; line < 4; line++, pix += ystride) {
++            // Read all four samples before writing any of them back.
++            const int p1 = P1;
++            const int p0 = P0;
++            const int q0 = Q0;
++            const int q1 = Q1;
++            const int delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++
++            if (!skip_p)
++                P0 = av_clip_pixel(p0 + delta0);
++            if (!skip_q)
++                Q0 = av_clip_pixel(q0 - delta0);
++        }
++    }
++}
++
++// Chroma deblock wrapper: samples across the edge are `stride` bytes apart and
++// successive filtered lines are one pixel apart.
++static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                            int32_t *tc, uint8_t *no_p,
++                                            uint8_t *no_q)
++{
++    const ptrdiff_t across_edge = stride;
++    const ptrdiff_t along_edge  = sizeof(pixel);
++
++    FUNC(hevc_loop_filter_chroma)(pix, across_edge, along_edge, tc, no_p, no_q);
++}
++
++// Chroma deblock wrapper: samples across the edge are one pixel apart and
++// successive filtered lines are `stride` bytes apart.
++static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
++                                            int32_t *tc, uint8_t *no_p,
++                                            uint8_t *no_q)
++{
++    const ptrdiff_t across_edge = sizeof(pixel);
++    const ptrdiff_t along_edge  = stride;
++
++    FUNC(hevc_loop_filter_chroma)(pix, across_edge, along_edge, tc, no_p, no_q);
++}
++
++// Luma deblock wrapper: samples across the edge are `stride` bytes apart and
++// successive filtered lines are one pixel apart.
++static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                          int beta, int32_t *tc, uint8_t *no_p,
++                                          uint8_t *no_q)
++{
++    const ptrdiff_t across_edge = stride;
++    const ptrdiff_t along_edge  = sizeof(pixel);
++
++    FUNC(hevc_loop_filter_luma)(pix, across_edge, along_edge, beta, tc, no_p, no_q);
++}
++// Luma deblock wrapper: calls hevc_loop_filter_luma with (sizeof(pixel), stride) as its two strides
++static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
++                                          int beta, int32_t *tc, uint8_t *no_p,
++                                          uint8_t *no_q)
++{
++    FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
++                                beta, tc, no_p, no_q);
++}
++
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
++
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
++
++// line zero
++#define P3 pix_l[0 * xstride]
++#define P2 pix_l[1 * xstride]
++#define P1 pix_l[2 * xstride]
++#define P0 pix_l[3 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++#define Q2 pix_r[2 * xstride]
++#define Q3 pix_r[3 * xstride]
++
++// line three. used only for deblocking decision
++#define TP3 pix_l[0 * xstride + 3 * ystride]
++#define TP2 pix_l[1 * xstride + 3 * ystride]
++#define TP1 pix_l[2 * xstride + 3 * ystride]
++#define TP0 pix_l[3 * xstride + 3 * ystride]
++#define TQ0 pix_r[0 * xstride + 3 * ystride]
++#define TQ1 pix_r[1 * xstride + 3 * ystride]
++#define TQ2 pix_r[2 * xstride + 3 * ystride]
++#define TQ3 pix_r[3 * xstride + 3 * ystride]
++
++// This is identical to hevc_loop_filter_luma except that the P/Q
++// components are on separate pointers (P taps via _pix_l, Q taps via _pix_r)
++static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
++                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
++                                 uint8_t * _pix_l)
++{
++    int d, j;
++    pixel *pix_l        = (pixel *)_pix_l;
++    pixel *pix_r        = (pixel *)_pix_r;
++    const ptrdiff_t xstride = 1;   // taps on each side are adjacent pixels
++    const ptrdiff_t ystride = _stride / sizeof(pixel);   // byte stride converted to pixel units
++
++    beta <<= BIT_DEPTH - 8;
++
++    for (j = 0; j < 2; j++) {   // two segments of 4 lines each
++        const int dp0  = abs(P2  - 2 * P1  + P0);
++        const int dq0  = abs(Q2  - 2 * Q1  + Q0);
++        const int dp3  = abs(TP2 - 2 * TP1 + TP0);
++        const int dq3  = abs(TQ2 - 2 * TQ1 + TQ0);
++        const int d0   = dp0 + dq0;
++        const int d3   = dp3 + dq3;
++        const int tc   = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8);   // tc2 packs one 16-bit tc per segment
++        const int no_p = no_f & 1;   // no_f bit 0: leave the P side unmodified
++        const int no_q = no_f & 2;   // no_f bit 1: leave the Q side unmodified
++
++        if (d0 + d3 >= beta) {
++            pix_l += 4 * ystride;   // segment is not filtered: step over its 4 lines
++            pix_r += 4 * ystride;
++            continue;
++        } else {
++            const int beta_3 = beta >> 3;
++            const int beta_2 = beta >> 2;
++            const int tc25   = ((tc * 5 + 1) >> 1);
++
++            if (abs(P3  -  P0) + abs(Q3  -  Q0) < beta_3 && abs(P0  -  Q0) < tc25 &&
++                abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
++                                      (d0 << 1) < beta_2 &&      (d3 << 1) < beta_2) {
++                // strong filtering
++                const int tc_x2 = tc << 1;   // clip range; named so it cannot hide the packed tc2 parameter
++                for (d = 0; d < 4; d++) {
++                    const int p3 = P3;
++                    const int p2 = P2;
++                    const int p1 = P1;
++                    const int p0 = P0;
++                    const int q0 = Q0;
++                    const int q1 = Q1;
++                    const int q2 = Q2;
++                    const int q3 = Q3;
++                    if (!no_p) {
++                        P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc_x2, tc_x2);
++                        P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc_x2, tc_x2);
++                        P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc_x2, tc_x2);
++                    }
++                    if (!no_q) {
++                        Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc_x2, tc_x2);
++                        Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc_x2, tc_x2);
++                        Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc_x2, tc_x2);
++                    }
++                    pix_l += ystride;
++                    pix_r += ystride;
++                }
++            } else { // normal filtering
++                int nd_p = 1;   // nd_p / nd_q == 2 additionally enables the P1 / Q1 update below
++                int nd_q = 1;
++                const int tc_2 = tc >> 1;
++                if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
++                    nd_p = 2;
++                if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
++                    nd_q = 2;
++
++                for (d = 0; d < 4; d++) {
++                    const int p2 = P2;
++                    const int p1 = P1;
++                    const int p0 = P0;
++                    const int q0 = Q0;
++                    const int q1 = Q1;
++                    const int q2 = Q2;
++                    int delta0   = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
++                    if (abs(delta0) < 10 * tc) {   // lines with a larger step are left untouched
++                        delta0 = av_clip(delta0, -tc, tc);
++                        if (!no_p)
++                            P0 = av_clip_pixel(p0 + delta0);
++                        if (!no_q)
++                            Q0 = av_clip_pixel(q0 - delta0);
++                        if (!no_p && nd_p > 1) {
++                            const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
++                            P1 = av_clip_pixel(p1 + deltap1);
++                        }
++                        if (!no_q && nd_q > 1) {
++                            const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
++                            Q1 = av_clip_pixel(q1 + deltaq1);
++                        }
++                    }
++                    pix_l += ystride;
++                    pix_r += ystride;
++                }
++            }
++        }
++    }
++}
++// tc2 packs two 16-bit tc values; no_f bit 0 is the no_p flag and bit 1 the no_q flag
++static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
++                                 unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
++{
++    // Just call the non-2 function having massaged the parameters
++    int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16};
++    uint8_t no_p[2] = {no_f & 1, no_f & 1};   // the same flag is used for both segments
++    uint8_t no_q[2] = {no_f & 2, no_f & 2};
++    FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
++}
++
++#undef TP3
++#undef TP2
++#undef TP1
++#undef TP0
++#undef TQ0
++#undef TQ1
++#undef TQ2
++#undef TQ3
++
++#undef P3
++#undef P2
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++#undef Q2
++#undef Q3
++
++#define P1 pix_l[0 * xstride]
++#define P0 pix_l[1 * xstride]
++#define Q0 pix_r[0 * xstride]
++#define Q1 pix_r[1 * xstride]
++// Chroma filter with the two sides on separate pointers: P1/P0 come from pix_l, Q0/Q1 from pix_r
++static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
++                                          ptrdiff_t _ystride, const int32_t *_tc,
++                                          const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
++{
++    int d, j, no_p, no_q;
++    pixel *pix_l        = (pixel *)_pix_l;
++    pixel *pix_r        = (pixel *)_pix_r;
++    ptrdiff_t xstride = _xstride / sizeof(pixel);   // byte strides converted to pixel units
++    ptrdiff_t ystride = _ystride / sizeof(pixel);
++
++    for (j = 0; j < 2; j++) {   // two segments of 4 lines, one _tc / _no_p / _no_q entry each
++        const int tc = _tc[j] << (BIT_DEPTH - 8);
++        if (tc <= 0) {
++            pix_l += 4 * ystride;   // segment is not filtered: step over its 4 lines
++            pix_r += 4 * ystride;
++            continue;
++        }
++        no_p = _no_p[j];
++        no_q = _no_q[j];
++
++        for (d = 0; d < 4; d++) {
++            int delta0;
++            const int p1 = P1;
++            const int p0 = P0;
++            const int q0 = Q0;
++            const int q1 = Q1;
++            delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
++            if (!no_p)
++                P0 = av_clip_pixel(p0 + delta0);
++            if (!no_q)
++                Q0 = av_clip_pixel(q0 - delta0);
++            pix_l += ystride;
++            pix_r += ystride;
++        }
++    }
++}
++// tc4 packs four 8-bit tc values; no_f bits 0-1 give no_p and bits 2-3 give no_q for the two segments
++static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
++                                 unsigned int no_f)
++{
++    uint8_t no_p[2] = {no_f & 1, no_f & 2};
++    uint8_t no_q[2] = {no_f & 4, no_f & 8};
++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
++    FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q);   // first of two interleaved components, tc[0..1]
++    FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q);   // second component, tc[2..3]
++}
++// tc4 packs four 8-bit tc values; no_f bits 0-1 -> no_p, bits 2-3 -> no_q; src_l / src_r address the P / Q sides
++static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
++                                 uint8_t * src_l,
++                                 unsigned int no_f)
++{
++    uint8_t no_p[2] = {no_f & 1, no_f & 2};
++    uint8_t no_q[2] = {no_f & 4, no_f & 8};
++    int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24};
++    FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r);   // first of two interleaved components, tc[0..1]
++    FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel));   // second component, tc[2..3]
++}
++
++#undef P1
++#undef P0
++#undef Q0
++#undef Q1
++
+diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c
+new file mode 100644
+index 0000000000..0aa8809a4b
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.c
+@@ -0,0 +1,161 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ * Copyright (C) 2018 John Cox for Raspberry Pi (Trading)
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "rpi_hevcdec.h"
++
++#include "rpi_hevcpred.h"
++#if (ARCH_ARM)
++#include "arm/rpi_hevcpred_arm.h"
++#endif
++
++#define PRED_C 0
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
++
++#define PRED_C 1
++#define BIT_DEPTH 8
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 9
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 10
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++
++#define BIT_DEPTH 12
++#include "rpi_hevcpred_template.c"
++#undef BIT_DEPTH
++#undef PRED_C
++// Fill hpc with the C intra-prediction functions for bit_depth (8 if unrecognised), then call the arch-specific init
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
++{
++#undef FUNC
++#define FUNC(a, depth) a ## _ ## depth
++
++#undef FUNCC
++#define FUNCC(a, depth) a ## _ ## depth ## _c
++// Luma table; the vertical and horizontal entries reuse the generic angular predictors
++#define HEVC_PRED_Y(depth)                                \
++    hpc->intra_pred      = FUNC(intra_pred, depth);     \
++    hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
++    hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
++    hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
++    hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
++    hpc->pred_planar[0]  = FUNC(pred_planar_0, depth);  \
++    hpc->pred_planar[1]  = FUNC(pred_planar_1, depth);  \
++    hpc->pred_planar[2]  = FUNC(pred_planar_2, depth);  \
++    hpc->pred_planar[3]  = FUNC(pred_planar_3, depth);  \
++    hpc->pred_dc[0]      = FUNC(pred_dc_0, depth);      \
++    hpc->pred_dc[1]      = FUNC(pred_dc_1, depth);      \
++    hpc->pred_dc[2]      = FUNC(pred_dc_2, depth);      \
++    hpc->pred_dc[3]      = FUNC(pred_dc_3, depth);      \
++    hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
++    hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
++    hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
++    hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
++    hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
++    hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
++    hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
++    hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
++    hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
++    hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
++    hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
++    hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
++    hpc->pred_dc0[0]     = FUNC(pred_dc0_0, depth);     \
++    hpc->pred_dc0[1]     = FUNC(pred_dc0_1, depth);     \
++    hpc->pred_dc0[2]     = FUNC(pred_dc0_2, depth);     \
++    hpc->pred_dc0[3]     = FUNC(pred_dc0_3, depth);
++// Chroma table: same layout as the luma one, using the _c suffixed functions
++#define HEVC_PRED_C(depth)                                \
++    hpc->intra_pred_c      = FUNCC(intra_pred, depth);     \
++    hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
++    hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
++    hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
++    hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
++    hpc->pred_planar_c[0]  = FUNCC(pred_planar_0, depth);  \
++    hpc->pred_planar_c[1]  = FUNCC(pred_planar_1, depth);  \
++    hpc->pred_planar_c[2]  = FUNCC(pred_planar_2, depth);  \
++    hpc->pred_planar_c[3]  = FUNCC(pred_planar_3, depth);  \
++    hpc->pred_dc_c[0]      = FUNCC(pred_dc_0, depth);      \
++    hpc->pred_dc_c[1]      = FUNCC(pred_dc_1, depth);      \
++    hpc->pred_dc_c[2]      = FUNCC(pred_dc_2, depth);      \
++    hpc->pred_dc_c[3]      = FUNCC(pred_dc_3, depth);      \
++    hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
++    hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
++    hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
++    hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
++    hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
++    hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
++    hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
++    hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
++    hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
++    hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
++    hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
++    hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
++    hpc->pred_dc0_c[0]     = FUNCC(pred_dc0_0, depth);     \
++    hpc->pred_dc0_c[1]     = FUNCC(pred_dc0_1, depth);     \
++    hpc->pred_dc0_c[2]     = FUNCC(pred_dc0_2, depth);     \
++    hpc->pred_dc0_c[3]     = FUNCC(pred_dc0_3, depth);
++
++#define HEVC_PRED(depth) \
++    HEVC_PRED_Y(depth); \
++    HEVC_PRED_C(depth);
++
++    switch (bit_depth) {
++    case 9:
++        HEVC_PRED(9);
++        break;
++    case 10:
++        HEVC_PRED(10);
++        break;
++    case 12:
++        HEVC_PRED(12);
++        break;
++    default:   // any other depth gets the 8-bit functions
++        HEVC_PRED(8);
++        break;
++    }
++
++#if (ARCH_ARM)
++    ff_hevc_rpi_pred_init_arm(hpc, bit_depth);
++#elif (ARCH_MIPS)   // NOTE(review): no declaration of the mips init is visible in this file - confirm it exists
++    ff_hevc_rpi_pred_init_mips(hpc, bit_depth);
++#endif
++}
+diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
+new file mode 100644
+index 0000000000..9f0edb8798
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred.h
+@@ -0,0 +1,123 @@
++/*
++ * HEVC video Decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#ifndef AVCODEC_RPI_HEVCPRED_H
++#define AVCODEC_RPI_HEVCPRED_H
++
++#include <stddef.h>
++#include <stdint.h>
++#include "config.h"
++
++struct HEVCRpiContext;
++struct HEVCRpiLocalContext;
++
++enum IntraPredMode {
++    INTRA_PLANAR = 0,
++    INTRA_DC,
++    INTRA_ANGULAR_2,
++    INTRA_ANGULAR_3,
++    INTRA_ANGULAR_4,
++    INTRA_ANGULAR_5,
++    INTRA_ANGULAR_6,
++    INTRA_ANGULAR_7,
++    INTRA_ANGULAR_8,
++    INTRA_ANGULAR_9,
++    INTRA_ANGULAR_10,
++    INTRA_ANGULAR_11,
++    INTRA_ANGULAR_12,
++    INTRA_ANGULAR_13,
++    INTRA_ANGULAR_14,
++    INTRA_ANGULAR_15,
++    INTRA_ANGULAR_16,
++    INTRA_ANGULAR_17,
++    INTRA_ANGULAR_18,
++    INTRA_ANGULAR_19,
++    INTRA_ANGULAR_20,
++    INTRA_ANGULAR_21,
++    INTRA_ANGULAR_22,
++    INTRA_ANGULAR_23,
++    INTRA_ANGULAR_24,
++    INTRA_ANGULAR_25,
++    INTRA_ANGULAR_26,
++    INTRA_ANGULAR_27,
++    INTRA_ANGULAR_28,
++    INTRA_ANGULAR_29,
++    INTRA_ANGULAR_30,
++    INTRA_ANGULAR_31,
++    INTRA_ANGULAR_32,
++    INTRA_ANGULAR_33,
++    INTRA_ANGULAR_34,
++};
++#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
++#define INTRA_ANGULAR_VERTICAL   INTRA_ANGULAR_26
++
++typedef void intra_filter_fn_t(
++        uint8_t * const left, uint8_t * const top,
++        const unsigned int req, const unsigned int avail,
++        const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur,
++        const unsigned int stride,
++        const unsigned int top_right_size, const unsigned int down_left_size);
++
++typedef struct HEVCRpiPredContext {
++    void (*intra_pred)(const struct HEVCRpiContext * const s,
++                          const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
++                          const unsigned int avail, const unsigned int log2_size);
++
++    intra_filter_fn_t *intra_filter[4];
++    void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
++                           const uint8_t *left, ptrdiff_t stride);
++    void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++                    ptrdiff_t stride);
++    void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
++                            const uint8_t *left, ptrdiff_t stride,
++                            int mode);
++    void (*pred_vertical[4])(uint8_t *src, const uint8_t *top,
++                            const uint8_t *left, ptrdiff_t stride,
++                            int mode);
++    void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top,
++                            const uint8_t *left, ptrdiff_t stride,
++                            int mode);
++    void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride);
++
++    void (*intra_pred_c)(const struct HEVCRpiContext * const s,
++                          const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0,
++                          const unsigned int avail, const unsigned int log2_size);
++    intra_filter_fn_t *intra_filter_c[4];
++    void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
++                           const uint8_t *left, ptrdiff_t stride);
++    void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
++                    ptrdiff_t stride);
++    void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
++                            const uint8_t *left, ptrdiff_t stride,
++                            int mode);
++    void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
++                            const uint8_t *left, ptrdiff_t stride,
++                            int mode);
++    void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
++                            const uint8_t *left, ptrdiff_t stride,
++                            int mode);
++    void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
++} HEVCRpiPredContext;
++
++void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
++
++#endif /* AVCODEC_RPI_HEVCPRED_H */
+diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
+new file mode 100644
+index 0000000000..f2ebcad332
+--- /dev/null
++++ b/libavcodec/rpi_hevcpred_template.c
+@@ -0,0 +1,1407 @@
++/*
++ * HEVC video decoder
++ *
++ * Copyright (C) 2012 - 2013 Guillaume Martres
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "config.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/rpi_sand_fns.h"
++#include "bit_depth_template.c"
++
++#include "rpi_hevcdec.h"
++#include "rpi_hevcpred.h"
++
++#define DUMP_PRED 0
++
++#define POS(x, y) src[(x) + stride * (y)]
++
++// INCLUDED_ONCE defined at EOF
++#ifndef INCLUDED_ONCE
++typedef uint8_t (* c8_dst_ptr_t)[2];
++typedef const uint8_t (* c8_src_ptr_t)[2];
++typedef uint16_t (* c16_dst_ptr_t)[2];
++typedef const uint16_t (* c16_src_ptr_t)[2];
++
++// *** On ARM make these NEON registers
++typedef struct pixel4_16 {
++    uint16_t x[4];
++} pixel4_16;
++typedef struct pixel4_32 {
++    uint32_t x[4];
++} pixel4_32;
++static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
++{
++    // Broadcast x into all four 16-bit lanes
++    return (pixel4_16){{x, x, x, x}};
++}
++static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
++{
++    // Broadcast x into all four 32-bit lanes
++    return (pixel4_32){{x, x, x, x}};
++}
++#endif
++
++#if PRED_C
++// For chroma we double the pixel size so that one "pixel" is a pair of cpel samples copied together
++#undef pixel
++#undef pixel2
++#undef pixel4
++#undef dctcoef
++#undef INIT_CLIP
++#undef no_rnd_avg_pixel4
++#undef rnd_avg_pixel4
++#undef AV_RN2P
++#undef AV_RN4P
++#undef AV_RN4PA
++#undef AV_WN2P
++#undef AV_WN4P
++#undef AV_WN4PA
++#undef CLIP
++#undef FUNC
++#undef FUNCC
++#undef av_clip_pixel
++#undef PIXEL_SPLAT_X4
++
++#if BIT_DEPTH == 8
++#define pixel uint16_t
++#define pixel4 pixel4_16
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
++#define cpel uint8_t
++#define c_src_ptr_t  c8_src_ptr_t
++#define c_dst_ptr_t  c8_dst_ptr_t
++#else
++#define pixel uint32_t
++#define pixel4 pixel4_32
++#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
++#define cpel uint16_t
++#define c_src_ptr_t c16_src_ptr_t
++#define c_dst_ptr_t c16_dst_ptr_t
++#endif
++#define AV_RN4P(p) (*(pixel4*)(p))
++#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
++#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
++#endif
++
++
++// Get PW prior to horrid PRED_C trickery
++#if BIT_DEPTH == 8
++#define PW 1
++#else
++#define PW 2
++#endif
++
++
++#if DUMP_PRED && !defined(INCLUDED_ONCE)
++static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
++{   // Debug aid: prints a size x size grid of every second byte; rows advance by stride * 2 bytes
++    for (unsigned int y = 0; y != size; y++, data += stride * 2) {
++        for (unsigned int x = 0; x != size; x++) {
++            printf("%4d", data[x * 2]);
++        }
++        printf("\n");
++    }
++    printf("\n");
++}
++#endif
++
++#ifndef INCLUDED_ONCE
++static inline void extend_8(void * ptr, const unsigned int v, unsigned int n)
++{
++    // Replicate v into all four byte lanes of a word, then store n / 4 words
++    const uint32_t pair = v | (v << 8);
++    const uint32_t quad = pair | (pair << 16);
++    uint32_t * dst = (uint32_t *)ptr;
++    unsigned int words;
++
++    for (words = n >> 2; words != 0; --words)
++        *dst++ = quad;
++}
++
++static inline void extend_16(void * ptr, const unsigned int v, unsigned int n)
++{
++    // Two 16-bit copies of v per word; every group of 4 elements takes 2 words
++    const uint32_t pair = v | (v << 16);
++    uint32_t * dst = (uint32_t *)ptr;
++    unsigned int groups;
++    for (groups = n >> 2; groups != 0; --groups, dst += 2) {
++        dst[0] = pair;
++        dst[1] = pair;
++    }
++}
++
++static inline void extend_32(void * ptr, const unsigned int v, unsigned int n)
++{
++    // One word per element; elements are written in groups of 4
++    uint32_t * dst = (uint32_t *)ptr;
++    unsigned int groups;
++    for (groups = n >> 2; groups != 0; --groups, dst += 4) {
++        dst[0] = v;
++        dst[1] = v;
++        dst[2] = v;
++        dst[3] = v;
++    }
++}
++
++// Beware that this inverts the avail ordering
++// For CIP it seems easier this way round
++static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask,
++                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
++                              unsigned int s0, unsigned int odd_s)
++{
++    const unsigned int n = 1 << log2_intra_bits;   // number of flag bits governed by one is_intra entry
++    unsigned int fa = 0;
++    unsigned int i;
++
++    size >>= 2;   // Now in 4-pel units
++    s0 >>= 2;
++    // Bit layout of fa, LSB first: DL (top s0 of the low `size` bits), L (next `size` bits), UL (1 bit)
++    if ((avail & AVAIL_DL) != 0)
++        fa |= ((1 << s0) - 1) << (size - s0);
++    if ((avail & AVAIL_L) != 0)
++        fa |= ((1 << size) - 1) << size;
++    if ((avail & AVAIL_UL) != 0)
++        fa |= 1 << (size << 1);
++    // With odd_s set, the first is_intra entry governs only flag bit 0
++    if (odd_s) {
++        if ((fa & 1) != 0 && (*is_intra & i_mask) == 0)
++            fa &= ~1;
++        is_intra += i_stride;
++    }
++    // Clear each n-bit group whose is_intra entry has no i_mask bit set; is_intra is only read for groups still set
++    for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) {
++        const unsigned int m = ((1 << n) - 1) << i;
++        if ((fa & m) != 0 && (*is_intra & i_mask) == 0)
++            fa &= ~m;
++    }
++
++    return fa;
++}
++// Flag mask in 4-pel units: U in the low `size` bits, UR in the next s1 bits; returns 0 if neither is available
++static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift,
++                                const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
++                                unsigned int s1, unsigned int odd_s)
++{
++    if ((avail & (AVAIL_U | AVAIL_UR)) == 0)
++    {
++        return 0;
++    }
++    else
++    {
++        const unsigned int n = 1 << log2_intra_bits;   // number of flag bits governed by one bit of im
++        unsigned int fa = 0;
++        unsigned int i;
++        unsigned int im = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift;   // 16 bits from two bytes, aligned by i_shift
++
++        size >>= 2;   // Now in 4-pel units
++        s1 >>= 2;
++
++        if ((avail & AVAIL_U) != 0)
++            fa |= ((1 << size) - 1);
++        if ((avail & AVAIL_UR) != 0)
++            fa |= ((1 << s1) - 1) << size;
++
++        if (odd_s) {   // the first bit of im governs only flag bit 0
++            fa &= im | ~1;
++            im >>= 1;
++        }
++
++        for (i = odd_s; (fa >> i) != 0; i += n, im >>= 1) {   // clear each n-bit group whose im bit is 0
++            const unsigned int m = ((1 << n) - 1) << i;
++            if ((im & 1) == 0)
++                fa &= ~m;
++        }
++        return fa;
++    }
++}
++
++
++// Index of the lowest set bit of x; the builtin's result is undefined for x == 0, so callers must pass non-zero
++static inline unsigned int rmbd(unsigned int x)
++{
++#if 1
++    return __builtin_ctz(x);
++#else
++    unsigned int n = 0;   // portable fallback: narrow the search by halves (16, 8, 4, 2, 1 bits)
++    if ((x & 0xffff) == 0) {
++        x >>= 16;
++        n += 16;
++    }
++    if ((x & 0xff) == 0) {
++        x >>= 8;
++        n += 8;
++    }
++    if ((x & 0xf) == 0) {
++        x >>= 4;
++        n += 4;
++    }
++    if ((x & 0x3) == 0) {
++        x >>= 2;
++        n += 2;
++    }
++
++    return (x & 1) == 0 ? n + 1 : n;
++#endif
++}
++#endif
++
++// Build left[-1 .. 2*size-1] and top[0 .. 2*size-1]; unavailable 4-sample groups repeat the last available sample
++static void FUNC(cip_fill)(pixel * const left, pixel * const top,
++    const unsigned int avail_l, const unsigned int avail_u,
++    const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++    const unsigned int stride,
++    const unsigned int size)
++{
++    pixel a;   // last available sample seen so far; set below or by the first (available) left group
++    unsigned int i;
++
++    // 1st find DL value
++    if ((avail_l & 1) == 0) {
++        if (avail_l != 0)
++            a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride];   // bottom row of the lowest available left group
++        else
++        {
++            // (avail_l | avail_u) != 0 so this must be good
++            const unsigned int n = rmbd(avail_u)*4;
++            a = (n >= size) ? src_ur[n - size] : src_u[n];
++        }
++    }
++
++    // L
++    {
++        pixel * d = left + size * 2 - 1;   // written from the bottom entry upwards
++        const pixel * s = src_l + (size * 2 - 1) * stride;
++        unsigned int x = avail_l;   // bit 0 describes the bottom-most group of 4 rows
++        for (i = 0; i < size * 2; i += 4, x >>= 1)
++        {
++            if ((x & 1) != 0) {
++                // Avail
++                *d-- = *s;
++                s -= stride;
++                *d-- = *s;
++                s -= stride;
++                *d-- = *s;
++                s -= stride;
++                *d-- = a = *s;
++                s -= stride;
++            }
++            else
++            {
++                *d-- = a;
++                *d-- = a;
++                *d-- = a;
++                *d-- = a;
++                s -= stride * 4;
++            }
++        }
++        // UL
++        *d = a = (x & 1) != 0 ? *s : a;   // d is now left - 1
++    }
++
++    // U
++    {
++        pixel * d = top;
++        const pixel * s = src_u;
++        unsigned int x = avail_u;   // bit 0 describes the left-most group of 4 columns
++
++        for (i = 0; i < size; i += 4, x >>= 1)
++        {
++            if ((x & 1) != 0) {
++                // Avail
++                *d++ = *s++;
++                *d++ = *s++;
++                *d++ = *s++;
++                *d++ = a = *s++;
++            }
++            else
++            {
++                *d++ = a;
++                *d++ = a;
++                *d++ = a;
++                *d++ = a;
++                s += 4;
++            }
++        }
++
++        // UR
++        s = src_ur;   // d and x carry on from the U loop, so UR lands in top[size .. 2*size-1]
++        for (i = 0; i < size; i += 4, x >>= 1)
++        {
++            if ((x & 1) != 0) {
++                // Avail
++                *d++ = *s++;
++                *d++ = *s++;
++                *d++ = *s++;
++                *d++ = a = *s++;
++            }
++            else
++            {
++                *d++ = a;
++                *d++ = a;
++                *d++ = a;
++                *d++ = a;
++                s += 4;
++            }
++        }
++    }
++}
++
++
++#if !PRED_C && PW == 1
++#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
++#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1)
++#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
++#else
++#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
++#endif
++
++// Reqs:
++//
++// Planar:  DL[0], L, ul, U, UR[0]
++// DC:         dl, L, ul, U, ur
++// A2-9:       DL, L, ul, u, ur
++// A10:        dl, L, ul, u, ur
++// A11-17      dl, L, UL, U, ur
++// A18-25      dl, L, Ul, U, ur
++// A26         dl, l, ul, U, ur
++// A27-34      dl, l, ul, U, UR
++
++#ifndef INCLUDED_ONCE
++
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
++intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
++// Chroma: AVAIL_* edges each intra mode reads, indexed by mode (0 = planar, 1 = DC, 2..34 = angular)
++static const uint8_t req_avail_c[35] =
++{
++    AVAIL_DL | AVAIL_L | 0         |  AVAIL_U | AVAIL_UR,  // Planar (DL[0] & UR[0] only needed)
++               AVAIL_L | 0         |  AVAIL_U,             // DC
++    AVAIL_DL | AVAIL_L,                                    // 2
++    AVAIL_DL | AVAIL_L,                                    // 3
++    AVAIL_DL | AVAIL_L,                                    // 4
++    AVAIL_DL | AVAIL_L,                                    // 5
++    AVAIL_DL | AVAIL_L,                                    // 6
++    AVAIL_DL | AVAIL_L,                                    // 7
++    AVAIL_DL | AVAIL_L,                                    // 8
++    AVAIL_DL | AVAIL_L,                                    // 9
++               AVAIL_L,                                    // 10 (H)
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 11
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 12
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 13
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 14
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 15
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 16
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 17
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 18
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 19
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 20
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 21
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 22
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 23
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 24
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 25
++                                    AVAIL_U,               // 26 (V)
++                                    AVAIL_U | AVAIL_UR,    // 27
++                                    AVAIL_U | AVAIL_UR,    // 28
++                                    AVAIL_U | AVAIL_UR,    // 29
++                                    AVAIL_U | AVAIL_UR,    // 30
++                                    AVAIL_U | AVAIL_UR,    // 31
++                                    AVAIL_U | AVAIL_UR,    // 32
++                                    AVAIL_U | AVAIL_UR,    // 33
++                                    AVAIL_U | AVAIL_UR     // 34
++};
++// Luma: AVAIL_* edges each mode reads plus FILTER_* smoothing flags; indexed [log2_size - 2][mode]
++static const uint8_t req_avail[4][35] = {
++{  // log2_size 2 (4x4): no FILTER_* flags set
++    AVAIL_DL | AVAIL_L | 0         |  AVAIL_U | AVAIL_UR,  // Planar (DL[0] & UR[0] only needed)
++               AVAIL_L | 0         |  AVAIL_U,             // DC
++    AVAIL_DL | AVAIL_L,                                    // 2
++    AVAIL_DL | AVAIL_L,                                    // 3
++    AVAIL_DL | AVAIL_L,                                    // 4
++    AVAIL_DL | AVAIL_L,                                    // 5
++    AVAIL_DL | AVAIL_L,                                    // 6
++    AVAIL_DL | AVAIL_L,                                    // 7
++    AVAIL_DL | AVAIL_L,                                    // 8
++    AVAIL_DL | AVAIL_L,                                    // 9
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 10 (H)
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 11
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 12
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 13
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 14
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 15
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 16
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 17
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 18
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 19
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 20
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 21
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 22
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 23
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 24
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 25
++               AVAIL_L | AVAIL_UL | AVAIL_U,               // 26 (V)
++                                    AVAIL_U | AVAIL_UR,    // 27
++                                    AVAIL_U | AVAIL_UR,    // 28
++                                    AVAIL_U | AVAIL_UR,    // 29
++                                    AVAIL_U | AVAIL_UR,    // 30
++                                    AVAIL_U | AVAIL_UR,    // 31
++                                    AVAIL_U | AVAIL_UR,    // 32
++                                    AVAIL_U | AVAIL_UR,    // 33
++                                    AVAIL_U | AVAIL_UR     // 34
++},
++{  // log2_size 3 (8x8)
++    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // Planar (DL[0] & UR[0] only needed)
++               AVAIL_L | 0        | AVAIL_U,                            // DC
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 2
++    AVAIL_DL | AVAIL_L                                 | 0,             // 3
++    AVAIL_DL | AVAIL_L                                 | 0,             // 4
++    AVAIL_DL | AVAIL_L                                 | 0,             // 5
++    AVAIL_DL | AVAIL_L                                 | 0,             // 6
++    AVAIL_DL | AVAIL_L                                 | 0,             // 7
++    AVAIL_DL | AVAIL_L                                 | 0,             // 8
++    AVAIL_DL | AVAIL_L                                 | 0,             // 9
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 10 (H)
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 11
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 12
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 13
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 14
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 15
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 16
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 17
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 18
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 19
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 20
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 21
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 22
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 23
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 24
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 25
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 26 (V)
++                                    AVAIL_U | AVAIL_UR | 0,             // 27
++                                    AVAIL_U | AVAIL_UR | 0,             // 28
++                                    AVAIL_U | AVAIL_UR | 0,             // 29
++                                    AVAIL_U | AVAIL_UR | 0,             // 30
++                                    AVAIL_U | AVAIL_UR | 0,             // 31
++                                    AVAIL_U | AVAIL_UR | 0,             // 32
++                                    AVAIL_U | AVAIL_UR | 0,             // 33
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT   // 34
++},
++{  // log2_size 4 (16x16)
++    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // Planar (DL[0] & UR[0] only needed)
++               AVAIL_L | 0        | AVAIL_U,                            // DC
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 2
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 3
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 4
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 5
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 6
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 7
++    AVAIL_DL | AVAIL_L                                 | FILTER_LIGHT,  // 8
++    AVAIL_DL | AVAIL_L                                 | 0,             // 9
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 10 (H)
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 11
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 12
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 13
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 14
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 15
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 16
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 17
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 18
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 19
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 20
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 21
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 22
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 23
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_LIGHT,  // 24
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 25
++               AVAIL_L | AVAIL_UL | AVAIL_U            | 0,             // 26 (V)
++                                    AVAIL_U | AVAIL_UR | 0,             // 27
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 28
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 29
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 30
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 31
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 32
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT,  // 33
++                                    AVAIL_U | AVAIL_UR | FILTER_LIGHT   // 34
++},
++{  // log2_size 5 (32x32)
++    AVAIL_DL | AVAIL_L | 0        | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
++               AVAIL_L | 0        | AVAIL_U,                            // DC
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 2
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 3
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 4
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 5
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 6
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 7
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 8
++    AVAIL_DL | AVAIL_L                                 | FILTER_EITHER, // 9
++               AVAIL_L                                 | 0,             // 10 (H)
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 11
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 12
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 13
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 14
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 15
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 16
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 17
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 18
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 19
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 20
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 21
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 22
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 23
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 24
++               AVAIL_L | AVAIL_UL | AVAIL_U            | FILTER_EITHER, // 25
++                                    AVAIL_U            | 0,             // 26 (V)
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
++                                    AVAIL_U | AVAIL_UR | FILTER_EITHER  // 34
++}
++};
++
++
++#endif
++// One [1 2 1] / 4 smoothing tap with round-to-nearest; b is the centre pel, a and c its neighbours
++#define filter_light1 FUNC(filter_light1)
++static inline pixel filter_light1(pixel a, pixel b, pixel c)
++{
++    return (a + c + 2 * b + 2) >> 2;
++}
++// Smooth n pels read from src (step sstride) into dst; p1 is the pel before src[0], pn the pel after the last one
++#define filter_light FUNC(filter_light)
++static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
++{
++    pixel p0;
++    pixel p2 = *src;
++    // Allow for final pel - it is just clearer to have the call take the actual number of output pels
++    unsigned int n_minus_1 = n - 1;
++    // n must be >= 2: the do/while body always runs once and a count of 0 would wrap round
++    do
++    {
++        src += sstride;
++        p0 = p1;
++        p1 = p2;
++        p2 = *src;  // next pel is fetched before the store below, so the in-place call in intra_pred (dst == src) is safe
++        *dst++ = filter_light1(p0, p1, p2);
++    } while (--n_minus_1 != 0);
++    *dst = filter_light1(p1, p2, pn);  // last output uses the caller-supplied following pel
++}
++// Linear ramp between p0 and p1: output i (0-based) is (64 * p0 + (i + 1) * (p1 - p0) + 32) >> 6, for n outputs
++#define filter_strong FUNC(filter_strong)
++static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
++{
++    const int step = p1 - p0;
++    unsigned int acc = 64 * p0 + 32;  // scaled by 64, pre-biased so the shift rounds to nearest
++    do
++    {
++        acc += step;
++        *dst++ = acc >> 6;
++    } while (--n != 0);
++}
++// Fill left[] / top[] (left[-1] is UL) with the reference pels req asks for: copy or substitute each edge, then smooth
++#define intra_filter FUNC(intra_filter)
++static av_always_inline void intra_filter(
++    pixel * const left, pixel * const top,
++    const unsigned int req, const unsigned int avail,
++    const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
++    const unsigned int stride,
++    const unsigned int top_right_size, const unsigned int down_left_size,
++    const unsigned int log2_size)
++{
++    const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
++    const unsigned int size = 1 << log2_size;
++
++    // a_ is the first pel in a section working round dl -> ur
++    // b_ is the last
++    // Beware that top & left work out from UL so usage of a_ & b_ may
++    // swap between them.  It is a bad naming scheme but I have found no
++    // better
++    const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
++    const pixel * b_dl = src_l + size * stride;
++    const pixel * a_l  = src_l + (size - 1) * stride;
++    const pixel * b_l  = src_l;
++    const pixel * ab_ul = src_l - stride;
++    const pixel * a_u = src_u;
++    const pixel * b_u = src_u + size - 1;
++    const pixel * a_ur = src_ur;
++    const pixel * b_ur = src_ur + top_right_size - 1;
++
++    const unsigned int want = req & ~avail;  // edges needed but not available: must be synthesised
++    const unsigned int have = req & avail;   // edges needed and available: read from the source pointers
++    unsigned int i;
++
++    if ((avail & AVAIL_DL) == 0)  // DL missing: borrow from the nearest available edge (L, else UL, else U, else UR)
++    {
++        a_dl = a_ur;
++        if ((avail & AVAIL_U) != 0)
++            a_dl = a_u;
++        if ((avail & AVAIL_UL) != 0)
++            a_dl = ab_ul;
++        if ((avail & AVAIL_L) != 0)
++            a_dl = a_l;
++        b_dl = a_dl;
++    }
++
++    if ((avail & AVAIL_L) == 0)  // each later missing edge collapses onto the last pel of the edge before it
++    {
++        a_l = b_dl;
++        b_l = b_dl;
++    }
++    if ((avail & AVAIL_UL) == 0)
++    {
++        ab_ul = b_l;
++    }
++    if ((avail & AVAIL_U) == 0)
++    {
++        a_u = ab_ul;
++        b_u = ab_ul;
++    }
++    if ((avail & AVAIL_UR) == 0)
++    {
++        a_ur = b_u;
++        b_ur = b_u;
++    }
++
++    if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2)  // PRED_C, log2_size compiler opt hints
++    {
++        if ((req & AVAIL_UL) != 0)
++            left[-1] = *ab_ul;
++
++        if ((want & AVAIL_L) != 0)
++            EXTEND(left, *a_l, size);
++        if ((want & AVAIL_DL) != 0)
++            EXTEND(left + size, *a_dl, size);
++        if ((want & AVAIL_U) != 0)
++            EXTEND(top, *a_u, size);
++        if ((want & AVAIL_UR) != 0)
++            EXTEND(top + size, *a_ur, size);
++
++        if ((have & AVAIL_U) != 0)
++            // Always good - even with sand
++            memcpy(top, a_u, size * sizeof(pixel));
++        if ((have & AVAIL_UR) != 0)
++        {
++            memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
++            EXTEND(top + size + top_right_size, *b_ur,
++                   size - top_right_size);
++        }
++        if ((have & AVAIL_L) != 0)
++        {
++            for (i = 0; i < size; i++)
++                left[i] = b_l[stride * i];
++        }
++        if ((have & AVAIL_DL) != 0)
++        {
++            for (i = 0; i < down_left_size; i++)
++                left[i + size] = b_dl[stride * i];
++            EXTEND(left + size + down_left_size, *a_dl,
++                   size - down_left_size);
++        }
++    }
++    else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
++            FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold &&
++            FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold)
++    {
++        if ((req & (AVAIL_U | AVAIL_UR)) != 0)
++            filter_strong(top, *ab_ul, *b_ur, size * 2);
++        left[-1] = *ab_ul;
++        if ((req & (AVAIL_L | AVAIL_DL)) != 0)
++            filter_strong(left, *ab_ul, *a_dl, size*2);
++    }
++    else
++    {
++        // Same code for both have & want for UL
++        if ((req & AVAIL_UL) != 0)
++        {
++            left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
++        }
++
++        if ((want & AVAIL_L) != 0)
++        {
++            EXTEND(left, *a_l, size);
++            left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
++        }
++        if ((want & AVAIL_DL) != 0)
++        {
++            // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding
++            EXTEND(left + size, *a_l, size);
++        }
++        if ((want & AVAIL_U) != 0)
++        {
++            EXTEND(top, *a_u, size);
++            top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
++        }
++        if ((want & AVAIL_UR) != 0)
++        {
++            // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding
++            EXTEND(top + size, *a_ur, size);
++        }
++
++        if ((have & AVAIL_U) != 0)
++        {
++            filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
++        }
++        if ((have & AVAIL_UR) != 0) {
++            filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
++            top[size*2 - 1] = *b_ur;
++            EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
++        }
++        if ((have & AVAIL_L) != 0)
++        {
++            filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
++        }
++        if ((have & AVAIL_DL) != 0)
++        {
++            filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
++            left[size*2 - 1] = *a_dl;
++            EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
++        }
++    }
++}
++// Emit intra_filter_<log2_size>: byte-pointer entry point that casts to pixel and converts the byte stride to pels
++#define INTRA_FILTER(log2_size) \
++static void FUNC(intra_filter_ ## log2_size)( \
++     uint8_t * const left, uint8_t * const top, \
++     const unsigned int req, const unsigned int avail, \
++     const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
++     const unsigned int stride, \
++     const unsigned int top_right_size, const unsigned int down_left_size) \
++{ \
++    intra_filter((pixel *)left, (pixel *)top, req, avail, \
++        (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
++}
++
++INTRA_FILTER(2)
++INTRA_FILTER(3)
++INTRA_FILTER(4)
++INTRA_FILTER(5)
++
++#undef intra_filter
++#undef INTRA_FILTER
++// Intra-predict one block: gather (and where req asks, filter) the neighbouring pels, then dispatch on mode
++static void FUNC(intra_pred)(const HEVCRpiContext * const s,
++                                              const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
++                                              const unsigned int log2_size)
++{
++    // c_idx will always be 1 for _c versions and 0 for y
++    const unsigned int c_idx = PRED_C;
++    const unsigned int hshift = ctx_hshift(s, c_idx);
++    const unsigned int vshift = ctx_vshift(s, c_idx);
++    const unsigned int size = (1 << log2_size);
++    const unsigned int x = x0 >> hshift;
++    const unsigned int y = y0 >> vshift;
++
++    const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
++    pixel *const src = c_idx == 0 ?
++        (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
++        (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
++
++    // Align so we can do multiple loads in the asm
++    // Padded to 16 byte boundary so as not to confuse anything
++    DECLARE_ALIGNED(16, pixel, top[2 * MAX_TB_SIZE]);
++    DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
++
++    pixel  * const left  = left_array  + 16 / sizeof(pixel);  // room for left[-1] (UL) while left[0] stays 16-byte aligned
++    const pixel * top_pred = top;  // may be redirected to the frame row above the block (see below)
++
++    const pixel * src_l = src - 1;
++    const pixel * src_u = src - stride;
++    const pixel * src_ur = src_u + size;
++#if !PRED_C
++    const unsigned int req = req_avail[log2_size - 2][mode] & ~s->ps.sps->intra_filters_disable;
++#else
++    const unsigned int req = req_avail_c[mode];
++#endif
++
++    // If we have nothing to pred from then fill with grey
++    // This isn't a common case but dealing with it here means we don't have to
++    // test for it later
++    if (avail == 0)
++    {
++dc_only:
++#if !PRED_C
++        s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
++#else
++        s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
++#endif
++        return;
++    }
++
++    {
++        // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
++        const AVFrame * const frame = s->frame;
++        const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
++        const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
++        if ((x & mask) == 0)  // block starts a stripe: the left neighbours are in the previous stripe
++            src_l -= stripe_adj;
++        if (((x + size) & mask) == 0)  // block ends a stripe: the up-right neighbours are in the next stripe
++            src_ur += stripe_adj;
++    }
++
++    // Can deal with I-slices in 'normal' code even if CIP
++    // This also means that we don't need to generate (elsewhere) is_intra
++    // for IRAP frames
++    if (s->ps.pps->constrained_intra_pred_flag == 1 &&
++        s->sh.slice_type != HEVC_SLICE_I)
++    {
++        // * If we ever actually care about CIP performance then we should
++        //   special case out size 4 stuff (can be done by 'normal') and
++        //   have 8-pel avail masks
++        unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
++                                           -(int)(s->ps.sps->pcm_width),
++                                           1 << (((x - 1) >> (3 - hshift)) & 7),
++                                           1 - hshift,
++                                           avail,
++                                           size,
++                                           FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
++                                           vshift != 0 ? 0 : (y >> 2) & 1);
++
++        unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
++                                           (x >> (3 - hshift)) & 7,
++                                           1 - hshift,
++                                           avail,
++                                           size,
++                                           FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
++                                           hshift != 0 ? 0 : (x >> 2) & 1);
++
++        // Anything left?
++        if ((avail_l | avail_u) == 0)
++            goto dc_only;
++
++        FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);
++
++#if !PRED_C
++        if ((req & FILTER_LIGHT) != 0)
++        {
++            const unsigned threshold = 1 << (BIT_DEPTH - 5);
++            if ((req & FILTER_STRONG) != 0 &&
++                (int)(FFABS(left[-1]  + top[63] - 2 * top[31]))  < threshold &&
++                (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
++            {
++                filter_strong(top, left[-1], top[63], 64);
++                filter_strong(left, left[-1], left[63], 64);
++            } else
++            {
++                // LHS writes UL too so copy for top
++                const pixel p_ul = left[-1];
++                filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size);
++                filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1);
++            }
++        }
++#endif
++    }
++    else
++    {
++        const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
++        if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 &&  // only available U / UR wanted and no filter bits set
++            ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size))
++        {
++            top_pred = src_u;  // predict straight from the frame, skipping the copy into top[]
++        }
++        else
++        {
++#if !PRED_C
++            s->hpc.intra_filter[log2_size - 2]
++#else
++            s->hpc.intra_filter_c[log2_size - 2]
++#endif
++                ((uint8_t *)left, (uint8_t *)top, req, avail,
++                 (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel),
++                              ur_size,
++                              FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
++        }
++    }
++
++
++#if !PRED_C
++    switch (mode) {
++    case INTRA_PLANAR:
++        s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                          (uint8_t *)left, stride);
++        break;
++    case INTRA_DC:
++        s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                       (uint8_t *)left, stride);
++        break;
++    case INTRA_ANGULAR_HORIZONTAL:
++        s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                           (uint8_t *)left, stride,
++                                           mode);
++        break;
++    case INTRA_ANGULAR_VERTICAL:
++        s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                           (uint8_t *)left, stride,
++                                           mode);
++        break;
++    default:
++        s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                           (uint8_t *)left, stride,
++                                           mode);
++        break;
++    }
++#else
++    switch (mode) {
++    case INTRA_PLANAR:
++        s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                          (uint8_t *)left, stride);
++        break;
++    case INTRA_DC:
++        s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                       (uint8_t *)left, stride);
++        break;
++    case INTRA_ANGULAR_HORIZONTAL:
++        s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                           (uint8_t *)left, stride,
++                                           mode);
++        break;
++    case INTRA_ANGULAR_VERTICAL:
++        s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                           (uint8_t *)left, stride,
++                                           mode);
++        break;
++    default:
++        s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
++                                           (uint8_t *)left, stride,
++                                           mode);
++        break;
++    }
++
++#if DUMP_PRED
++    printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
++    dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
++    printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
++    dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
++#endif
++#endif
++}
++// Planar prediction: each pel blends left[row] with top[size] and top[col] with left[size]; trafo_size is log2 of the block size
++#if !PRED_C
++static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
++                                  const uint8_t *_left, ptrdiff_t stride,
++                                  int trafo_size)
++{
++    pixel *src        = (pixel *)_src;
++    const pixel *top  = (const pixel *)_top;
++    const pixel *left = (const pixel *)_left;
++    const int size = 1 << trafo_size, shift = trafo_size + 1;
++    int col, row;
++    for (row = 0; row < size; row++)
++        for (col = 0; col < size; col++)
++            POS(col, row) = ((size - 1 - col) * left[row] + (col + 1) * top[size]  +
++                             (size - 1 - row) * top[col]  + (row + 1) * left[size] + size) >> shift;
++}
++#else
++static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
++                                  const uint8_t * _left, ptrdiff_t stride,
++                                  int trafo_size)
++{
++    const int size = 1 << trafo_size, shift = trafo_size + 1;
++    c_dst_ptr_t src = (c_dst_ptr_t)_src;
++    const c_src_ptr_t top = (c_src_ptr_t)_top;
++    const c_src_ptr_t left = (c_src_ptr_t)_left;
++    int row, col, c;
++
++    for (row = 0; row < size; row++, src += stride)
++    {
++        for (col = 0; col < size; col++)
++        {
++            // Same blend for each of the two components stored at every position
++            for (c = 0; c < 2; c++)
++                src[col][c] = ((size - 1 - col) * left[row][c] + (col + 1) * top[size][c]  +
++                               (size - 1 - row) * top[col][c]  + (row + 1) * left[size][c] + size) >> shift;
++        }
++    }
++}
++#endif
++// Emit pred_planar_<size> wrappers; the macro argument is log2 of the block size minus 2
++#define PRED_PLANAR(size)\
++static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top,        \
++                                       const uint8_t *left, ptrdiff_t stride)   \
++{                                                                               \
++    FUNC(pred_planar)(src, top, left, stride, size + 2);                        \
++}
++
++PRED_PLANAR(0)
++PRED_PLANAR(1)
++PRED_PLANAR(2)
++PRED_PLANAR(3)
++
++#undef PRED_PLANAR
++// DC prediction: fill the block with the rounded mean of the size left pels and the size top pels
++#if !PRED_C
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++                          const uint8_t *_left,
++                          ptrdiff_t stride, int log2_size)
++{
++    pixel *src        = (pixel *)_src;
++    const pixel *top  = (const pixel *)_top;
++    const pixel *left = (const pixel *)_left;
++    const int size    = (1 << log2_size);
++    int sum           = size;  // rounding term: half of the 2 * size divisor
++    int dc, row, col, k;
++    pixel4 fill;
++
++    // Accumulate both edges, then divide by 2 * size
++    for (k = 0; k < size; k++)
++        sum += left[k] + top[k];
++    dc = sum >> (log2_size + 1);
++
++    fill = PIXEL_SPLAT_X4(dc);
++    for (row = 0; row < size; row++)
++        for (col = 0; col < size; col += 4)
++            AV_WN4P(&POS(col, row), fill);
++
++    // Blocks smaller than 32 get their first row and first column blended towards the neighbours
++    // (no colour-plane test is needed: luma and chroma have separate functions)
++    if (size < 32)
++    {
++        POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
++        for (k = 1; k < size; k++)
++            POS(k, 0) = (top[k] + 3 * dc + 2) >> 2;
++        for (k = 1; k < size; k++)
++            POS(0, k) = (left[k] + 3 * dc + 2) >> 2;
++    }
++}
++#else
++static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
++                          const uint8_t *_left,
++                          ptrdiff_t stride, int log2_size)
++{
++    c_dst_ptr_t src = (c_dst_ptr_t)_src;
++    const c_src_ptr_t top = (c_src_ptr_t)_top;
++    const c_src_ptr_t left = (c_src_ptr_t)_left;
++    const unsigned int size = (1 << log2_size);
++    unsigned int sum0 = size;  // one running total per component of each stored pair
++    unsigned int sum1 = size;
++    unsigned int row, col, k;
++
++    for (k = 0; k < size; k++)
++    {
++        sum0 += left[k][0] + top[k][0];
++        sum1 += left[k][1] + top[k][1];
++    }
++
++    sum0 >>= log2_size + 1;
++    sum1 >>= log2_size + 1;
++
++    // Flat fill only: this variant does no edge blending
++    for (row = 0; row < size; row++, src += stride)
++    {
++        for (col = 0; col < size; ++col)
++        {
++            src[col][0] = sum0;
++            src[col][1] = sum1;
++        }
++    }
++}
++#endif
++
++#define PRED_DC(size) /* defines pred_dc_<size>(), a fixed-size wrapper */\
++static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top,        \
++                                       const uint8_t *left, ptrdiff_t stride)   \
++{                                                                               \
++    FUNC(pred_dc)(src, top, left, stride, size + 2);                        \
++}
++/* Wrappers 0..3 call pred_dc with log2_size = 2..5, i.e. blocks of 4..32 */
++PRED_DC(0)
++PRED_DC(1)
++PRED_DC(2)
++PRED_DC(3)
++
++#undef PRED_DC
++
++
++
++
++#if !PRED_C
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{   // Non-PRED_C variant: fill the block with the constant 1 << (BIT_DEPTH - 1); no neighbours are read
++    int i, j;
++    int size          = (1 << log2_size);
++    pixel *src        = (pixel *)_src;
++    pixel4 a = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
++    // One AV_WN4P store per 4 columns (j advances by 4)
++    for (i = 0; i < size; i++)
++        for (j = 0; j < size; j+=4)
++            AV_WN4P(&POS(j, i), a);
++}
++#else
++static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
++{   // PRED_C variant: set both components of every element to 1 << (BIT_DEPTH - 1)
++    unsigned int i, j;
++    const unsigned int size = (1 << log2_size);
++    c_dst_ptr_t src = (c_dst_ptr_t)_src;
++    const pixel a = (1 << (BIT_DEPTH - 1));
++    // src advances by stride elements per row
++    for (i = 0; i < size; i++, src += stride)
++    {
++        for (j = 0; j < size; ++j)
++        {
++            src[j][0] = a;
++            src[j][1] = a;
++        }
++    }
++}
++#endif
++
++#define PRED_DC0(size) /* defines pred_dc0_<size>(), a fixed-size wrapper */\
++static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride)   \
++{                                                                               \
++    FUNC(pred_dc0)(src, stride, size + 2);                        \
++}
++/* Wrappers 0..3 call pred_dc0 with log2_size = 2..5 */
++PRED_DC0(0)
++PRED_DC0(1)
++PRED_DC0(2)
++PRED_DC0(3)
++
++#undef PRED_DC0
++
++
++
++
++#ifndef ANGLE_CONSTS  /* guard: the tables are emitted only once however often this template is included */
++#define ANGLE_CONSTS
++static const int intra_pred_angle[] = {  /* 33 entries, read as intra_pred_angle[mode - 2] */
++     32,  26,  21,  17, 13,  9,  5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
++    -26, -21, -17, -13, -9, -5, -2, 0, 2,  5,  9, 13,  17,  21,  26,  32
++};
++static const int inv_angle[] = {  /* 15 entries, read as inv_angle[mode - 11]; used with + 128 >> 8 rounding */
++    -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
++    -630, -910, -1638, -4096
++};
++#endif
++
++#if !PRED_C
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++                                                const uint8_t *_top,
++                                                const uint8_t *_left,
++                                                ptrdiff_t stride,
++                                                int mode, int size)
++{   // Angular intra prediction of a size x size block for one mode (non-PRED_C variant)
++    int x, y;
++    pixel *src        = (pixel *)_src;
++    const pixel *top  = (const pixel *)_top;
++    const pixel *left = (const pixel *)_left;
++    // angle is in 1/32 sample units: >> 5 gives the whole offset, & 31 the fraction
++    int angle = intra_pred_angle[mode - 2];
++    pixel ref_array[3 * MAX_TB_SIZE + 4];   // scratch for an extended reference line
++    pixel *ref_tmp = ref_array + size;      // offset so that negative indices down to 'last' stay inside
++    const pixel *ref;
++    int last = (size * angle) >> 5;         // lowest reference index used when angle < 0
++
++    if (mode >= 18) {                       // modes 18+: rows are predicted from the top reference
++        ref = top - 1;
++
++        if (angle < 0)
++        {
++            memcpy(ref_tmp + 1, top, size * PW);
++            ref_tmp[0] = left[-1];
++            // Fill the negative indices from the left column, projected through inv_angle
++            for (x = last; x <= -1; x++)
++                ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++            ref = ref_tmp;
++        }
++
++        for (y = 0; y < size; y++) {
++            int idx  = ((y + 1) * angle) >> 5;
++            int fact = ((y + 1) * angle) & 31;
++            if (fact) {                     // fractional offset: weighted mean of two samples, 4 pixels per pass
++                for (x = 0; x < size; x += 4) {
++                    POS(x    , y) = ((32 - fact) * ref[x + idx + 1] +
++                                           fact  * ref[x + idx + 2] + 16) >> 5;
++                    POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
++                                           fact  * ref[x + 1 + idx + 2] + 16) >> 5;
++                    POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
++                                           fact  * ref[x + 2 + idx + 2] + 16) >> 5;
++                    POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
++                                           fact  * ref[x + 3 + idx + 2] + 16) >> 5;
++                }
++            } else {                        // whole-sample offset: plain copy
++                for (x = 0; x < size; x += 4)
++                    AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
++            }
++        }
++        if (mode == 26 && size < 32) {      // mode 26 has angle 0: adjust POS(0, y) by half the left-column difference
++            for (y = 0; y < size; y++)
++                POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
++        }
++
++    } else {                                // modes below 18: columns are predicted from the left reference
++        ref = left - 1;
++        if (angle < 0 && last < -1) {
++            for (x = 0; x <= size; x += 4)
++                AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
++            // Inv angle <= -256 so top offset >= 0
++            for (x = last; x <= -1; x++)
++                ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
++            ref = ref_tmp;
++        }
++
++        for (x = 0; x < size; x++) {
++            int idx  = ((x + 1) * angle) >> 5;
++            int fact = ((x + 1) * angle) & 31;
++            if (fact) {                     // fractional offset: weighted mean of two samples
++                for (y = 0; y < size; y++) {
++                    POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
++                                       fact  * ref[y + idx + 2] + 16) >> 5;
++                }
++            } else {                        // whole-sample offset: plain copy
++                for (y = 0; y < size; y++)
++                    POS(x, y) = ref[y + idx + 1];
++            }
++        }
++        if (mode == 10 && size < 32) {      // mode 10 has angle 0: adjust POS(x, 0) by half the top-row difference
++            for (x = 0; x < size; x += 4) {
++                POS(x,     0) = av_clip_pixel(left[0] + ((top[x    ] - left[-1]) >> 1));
++                POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - left[-1]) >> 1));
++                POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - left[-1]) >> 1));
++                POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - left[-1]) >> 1));
++            }
++        }
++    }
++}
++#else
++static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
++                                                const uint8_t *_top,
++                                                const uint8_t *_left,
++                                                ptrdiff_t stride,
++                                                int mode, int size)
++{   // Angular intra prediction (PRED_C variant): every element is a [0]/[1] pair handled component-wise
++    int x, y;
++    c_dst_ptr_t src  = (c_dst_ptr_t)_src;
++    c_src_ptr_t top  = (c_src_ptr_t)_top;
++    c_src_ptr_t left = (c_src_ptr_t)_left;
++    // angle is in 1/32 sample units: >> 5 gives the whole offset, & 31 the fraction
++    const int angle = intra_pred_angle[mode - 2];
++    cpel ref_array[3 * MAX_TB_SIZE + 4][2];  // scratch for an extended reference line of pairs
++    c_dst_ptr_t ref_tmp = ref_array + size;  // offset so that negative indices down to 'last' stay inside
++    c_src_ptr_t ref;
++    const int last = (size * angle) >> 5;    // lowest reference index used when angle < 0
++
++    if (mode >= 18) {                        // modes 18+: rows are predicted from the top reference
++        ref = top - 1;
++        if (angle < 0) {
++            memcpy(ref_tmp + 1, top, size * 2 * PW);
++            ref_tmp[0][0] = left[-1][0];
++            ref_tmp[0][1] = left[-1][1];
++            for (x = last; x <= -1; x++)     // negative indices come from the left column via inv_angle
++            {
++                ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++                ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++            }
++            ref = (c_src_ptr_t)ref_tmp;
++        }
++        // src is advanced one row per iteration, so src[x] addresses the current row
++        for (y = 0; y < size; y++, src += stride) {
++            const int idx  = ((y + 1) * angle) >> 5;
++            const int fact = ((y + 1) * angle) & 31;
++            if (fact) {                      // fractional offset: weighted mean of two samples
++                for (x = 0; x < size; ++x) {
++                    src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
++                                       fact  * ref[x + idx + 2][0] + 16) >> 5;
++                    src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
++                                       fact  * ref[x + idx + 2][1] + 16) >> 5;
++                }
++            } else {                         // whole-sample offset: copy the row of pairs
++                memcpy(src, ref + idx + 1, size * 2 * PW);
++            }
++        }
++    } else {                                 // modes below 18: columns are predicted from the left reference
++        ref = left - 1;
++        if (angle < 0 && last < -1) {
++            memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
++            for (x = last; x <= -1; x++)     // negative indices come from the top row via inv_angle
++            {
++                ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
++                ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
++            }
++            ref = (c_src_ptr_t)ref_tmp;
++        }
++        // src is advanced one column per iteration, so src[y * stride] walks down that column
++        for (x = 0; x < size; x++, src++) {
++            const int idx  = ((x + 1) * angle) >> 5;
++            const int fact = ((x + 1) * angle) & 31;
++            if (fact) {                      // fractional offset: weighted mean of two samples
++                for (y = 0; y < size; y++) {
++                    src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
++                                       fact  * ref[y + idx + 2][0] + 16) >> 5;
++                    src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
++                                       fact  * ref[y + idx + 2][1] + 16) >> 5;
++                }
++            } else {                         // whole-sample offset: plain copy
++                for (y = 0; y < size; y++)
++                {
++                    src[y * stride][0] = ref[y + idx + 1][0];
++                    src[y * stride][1] = ref[y + idx + 1][1];
++                }
++            }
++        }
++    }
++}
++#endif
++
++static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
++                                 const uint8_t *left,
++                                 ptrdiff_t stride, int mode)
++{   // Fixed-size entry point: pred_angular with size = 4
++    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
++}
++
++static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
++                                 const uint8_t *left,
++                                 ptrdiff_t stride, int mode)
++{   // Fixed-size entry point: pred_angular with size = 8
++    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
++}
++
++static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
++                                 const uint8_t *left,
++                                 ptrdiff_t stride, int mode)
++{   // Fixed-size entry point: pred_angular with size = 16
++    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
++}
++
++static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
++                                 const uint8_t *left,
++                                 ptrdiff_t stride, int mode)
++{   // Fixed-size entry point: pred_angular with size = 32
++    FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
++}
++
++#undef cpel
++#undef c_src_ptr_t
++#undef c_dst_ptr_t
++
++#undef EXTEND
++#undef POS
++#undef PW
++
++#undef filter_light1
++#undef filter_light
++#undef filter_strong
++#undef ref_gen
++
++#ifndef INCLUDED_ONCE
++#define INCLUDED_ONCE
++#endif
++
+diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
+new file mode 100644
+index 0000000000..98a0b104b7
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.c
+@@ -0,0 +1,155 @@
++/*
++Copyright (c) 2012, Broadcom Europe Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++*/
++
++#include <stdio.h>
++#include <string.h>
++#include <stdlib.h>
++#include <fcntl.h>
++#include <unistd.h>
++#include <assert.h>
++#include <stdint.h>
++#include <sys/ioctl.h>
++
++#include <linux/ioctl.h>
++
++#define MAJOR_NUM 100
++#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
++#define DEVICE_FILE_NAME "/dev/vcio"
++
++#include "rpi_mailbox.h"
++//#include <interface/vctypes/vc_image_structs.h>
++
++/*
++ * Send the property message in buf to the mailbox driver with ioctl().
++ * Returns the ioctl() result (< 0 on failure, which is also printed).
++ */
++static int mbox_property(int file_desc, void *buf)
++{
++   int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
++
++   if (ret_val < 0) {
++      printf("ioctl_set_msg failed:%d\n", ret_val);
++   }
++
++#ifdef DEBUG
++   unsigned *p = buf; unsigned i; unsigned size = *(unsigned *)buf;
++   for (i=0; i<size/4; i++)  // callers store the message size in bytes in word 0
++      printf("%04zx: 0x%08x\n", i*sizeof *p, p[i]);  // offset is a size_t, so it needs %zx
++#endif
++   return ret_val;
++}
++
++#define GET_VCIMAGE_PARAMS 0x30044
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img)
++{   // Round-trip *img through a GET_VCIMAGE_PARAMS property message; returns mbox_property()'s result
++    uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
++    uint32_t * p = buf;
++    void * rimg;
++    int rv;
++    // Message layout: two header words, one tag carrying a copy of *img, end tag
++    *p++ = 0; // size (patched in below once the length is known)
++    *p++ = 0; // process request
++    *p++ = GET_VCIMAGE_PARAMS;
++    *p++ = sizeof(*img);  // presumably the tag's value buffer size - TODO confirm against the mailbox spec
++    *p++ = sizeof(*img);  // presumably the tag's request length - TODO confirm
++    rimg = p;             // remember where the image copy sits inside buf
++    memcpy(p, img, sizeof(*img));
++    p += sizeof(*img) / sizeof(*p);
++    *p++ = 0;  // End tag
++    buf[0] = (p - buf) * sizeof(*p);  // total message size in bytes
++
++    rv = mbox_property(fd, buf);
++    memcpy(img, rimg, sizeof(*img));  // copied back unconditionally, even when rv < 0
++
++    return rv;
++}
++
++
++#define SET_CLOCK_RATE 0x00038002
++#define GET_MAX_CLOCK 0x00030004
++#define CLOCK_HEVC 11
++
++static int mbox_property_generic(int fd, unsigned command, unsigned *word0, unsigned *word1)
++{   // Send one property tag carrying two 32-bit words; *word0 / *word1 are overwritten from buf afterwards
++    uint32_t buf[32];
++    uint32_t * p = buf;
++    int rv;
++
++    *p++ = 0; // size (patched in below)
++    *p++ = 0; // process request
++    *p++ = command;
++    *p++ = 8;       // presumably value buffer size in bytes (two words) - TODO confirm
++    *p++ = 8;       // presumably request length in bytes - TODO confirm
++    *p++ = *word0;  // stored in buf[5]
++    *p++ = *word1;  // stored in buf[6]
++    *p++ = 0;  // End tag
++    buf[0] = (p - buf) * sizeof(*p);  // total message size in bytes
++
++    rv = mbox_property(fd, buf);
++    *word0 = buf[6];  // NOTE(review): read back one word later than written (buf[6]/buf[7], not buf[5]/buf[6]);
++    *word1 = buf[7];  // mbox_request_clock relies on word0 == buf[6], so confirm before changing either side
++    return rv;
++}
++
++int mbox_open(void) {
++   // Open the char device used for communicating with the kernel mbox driver.
++   // Returns the descriptor, or a negative value after printing a hint on failure.
++   // (void) matches the prototype in rpi_mailbox.h; O_RDONLY names the former bare 0 flag.
++   const int file_desc = open(DEVICE_FILE_NAME, O_RDONLY);
++   if (file_desc < 0) {
++      printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
++      printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
++   }
++   return file_desc;
++}
++
++void mbox_close(int file_desc) {  // closes file_desc; the result of close() is discarded
++  close(file_desc);
++}
++
++int mbox_request_clock(int fd) {
++   // Query GET_MAX_CLOCK for the HEVC clock, then pass the result to SET_CLOCK_RATE
++   unsigned word0 = CLOCK_HEVC, word1 = 0;
++   int rv = mbox_property_generic(fd, GET_MAX_CLOCK, &word0, &word1);
++
++   if (rv != 0)
++      return rv;
++   // mbox_property_generic hands reply word buf[6] back in word0: use it as the rate
++   word1 = word0;
++   word0 = CLOCK_HEVC;
++   return mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
++}
++
++int mbox_release_clock(int fd) {
++  // Issue SET_CLOCK_RATE for the HEVC clock with a rate word of 0
++  unsigned word0 = CLOCK_HEVC;
++  unsigned word1 = 0;
++  int rv;
++  rv = mbox_property_generic(fd, SET_CLOCK_RATE, &word0, &word1);
++  return rv;
++}
+diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
+new file mode 100644
+index 0000000000..b2654ef01e
+--- /dev/null
++++ b/libavcodec/rpi_mailbox.h
+@@ -0,0 +1,58 @@
++#ifndef RPI_MAILBOX_H
++#define RPI_MAILBOX_H
++
++/* The image structure. */
++typedef struct vc_image_extra_uv_s {  /* the only member of VC_IMAGE_EXTRA_T enabled in this header */
++  void *u, *v;   /* presumably the U and V plane pointers - TODO confirm */
++  int vpitch;    /* presumably a pitch for those planes - TODO confirm */
++} VC_IMAGE_EXTRA_UV_T;
++
++typedef union {  /* type of VC_IMAGE_T.extra; every variant except uv is commented out here */
++    VC_IMAGE_EXTRA_UV_T uv;
++//  VC_IMAGE_EXTRA_RGBA_T rgba;
++//  VC_IMAGE_EXTRA_PAL_T pal;
++//  VC_IMAGE_EXTRA_TF_T tf;
++//  VC_IMAGE_EXTRA_BAYER_T bayer;
++//  VC_IMAGE_EXTRA_MSBAYER_T msbayer;
++//  VC_IMAGE_EXTRA_CODEC_T codec;
++//  VC_IMAGE_EXTRA_OPENGL_T opengl;
++} VC_IMAGE_EXTRA_T;
++
++
++typedef struct VC_IMAGE_T {
++  unsigned short                  type;           /* should restrict to 16 bits */
++  unsigned short                  info;           /* format-specific info; zero for VC02 behaviour */
++  unsigned short                  width;          /* width in pixels */
++  unsigned short                  height;         /* height in pixels */
++  int                             pitch;          /* pitch of image_data array in bytes */
++  int                             size;           /* number of bytes available in image_data array */
++  void                           *image_data;     /* pixel data */
++  VC_IMAGE_EXTRA_T                extra;          /* extra data like palette pointer */
++  void                           *metadata;       /* metadata header for the image */
++  void                           *pool_object;    /* non-NULL if image was allocated from a vc_pool */
++  int                             mem_handle;     /* the mem handle for relocatable memory storage */
++  int                             metadata_size;  /* size of metadata of each channel in bytes */
++  int                             channel_offset; /* offset of consecutive channels in bytes */
++  uint32_t                        video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
++  uint8_t                         num_channels;   /* number of channels (2 for stereo) */
++  uint8_t                         current_channel;/* the channel this header is currently pointing to */
++  uint8_t                         linked_multichann_flag;/* Indicates the header has the linked-multichannel structure */
++  uint8_t                         is_channel_linked;     /* Tracks whether the above structure has been used to link the header
++                                                            into a linked-multichannel image */
++  uint8_t                         channel_index;         /* index of the channel this header represents while
++                                                            it is being linked. */
++  uint8_t                         _dummy[3];      /* pad struct to 64 bytes (the total depends on pointer width) */
++} VC_IMAGE_T;
++
++typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];  /* compile-time check: array size is -1 unless sizeof(VC_IMAGE_T) == 64. NOTE(review): the fields sum to 64 only with 4-byte pointers - confirm this is never built for LP64 */
++
++
++extern int mbox_open(void);
++extern void mbox_close(int file_desc);
++
++int mbox_get_image_params(int fd, VC_IMAGE_T * img);
++
++int mbox_request_clock(int fd);
++int mbox_release_clock(int fd);
++
++#endif
+diff --git a/libavcodec/rpi_mem.c b/libavcodec/rpi_mem.c
+new file mode 100644
+index 0000000000..812921f665
+--- /dev/null
++++ b/libavcodec/rpi_mem.c
+@@ -0,0 +1,326 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++
++#include <stdlib.h>
++#include <string.h>
++#include <stddef.h>
++#include <stdint.h>
++
++#include "config.h"
++
++#include "libavutil/avassert.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <bcm_host.h>
++#include <interface/vctypes/vc_image_types.h>
++#include <interface/vcsm/user-vcsm.h>
++#pragma GCC diagnostic pop
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++
++#define OPT_PREFER_CMA 0
++
++struct rpi_cache_flush_env_s {  // flush env: overlaid on a caller-supplied rpi_cache_buf_t by rpi_cache_flush_init()
++  struct vcsm_user_clean_invalid2_s v;  // op_count plus the block list s[] handed to vcsm_clean_invalid2()
++};
++
++
++// GPU memory alloc fns (internal)
++
++static void gpu_free_internal(GPU_MEM_PTR_T * const p)
++{   // Undo whatever gpu_malloc_internal() managed to set up; safe on a zeroed or partly filled *p
++    if (p->arm != NULL)
++        vcsm_unlock_ptr(p->arm);     // arm is only set once vcsm_lock() has succeeded
++    if (p->vcsm_handle != 0)
++        vcsm_free(p->vcsm_handle);
++    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
++}
++
++
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++    const int numbytes, const unsigned int cache_type, const char * const name)
++{   // Allocate, lock and resolve a VCSM block into *p; returns 0, or AVERROR(ENOMEM) with *p zeroed
++    memset(p, 0, sizeof(*p));
++    p->numbytes = (numbytes + 255) & ~255;  // Round up to a multiple of 256
++    // NOTE(review): meaning of the extra 0x80 cache flag is not visible here - confirm against the vcsm headers
++    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0)
++    {
++        av_log(NULL, AV_LOG_ERROR, "Unable to alloc %d bytes from VCSM for %s\n", p->numbytes, name);
++        goto fail;
++    }
++    if ((p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0)
++    {
++        av_log(NULL, AV_LOG_ERROR, "Unable to VC handle from VCSM for %s\n", name);
++        goto fail;
++    }
++    if ((p->arm = vcsm_lock(p->vcsm_handle)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "Unable to lock handle from VCSM for %s\n", name);
++        goto fail;
++    }
++    if ((p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++    {
++        av_log(NULL, AV_LOG_ERROR, "Unable to get VC addr from VCSM for %s\n", name);
++        goto fail;
++    }
++
++    return 0;
++    // Every failure is reported as ENOMEM, whichever step failed
++fail:
++    gpu_free_internal(p);
++    return AVERROR(ENOMEM);
++}
++
++// Public gpu fns
++
++// Allocate GPU memory with cache type VCSM_CACHE_TYPE_NONE.
++// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
++// Returns 0 on success, AVERROR(ENOMEM) on failure (with *p zeroed).
++// This allocates memory that will not be cached in ARM's data cache.
++// Therefore safe to use without data cache flushing.
++int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
++{
++    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached");
++}
++
++// Allocate GPU memory with cache type VCSM_CACHE_TYPE_HOST, i.e. data that will be
++//    Cached in ARM L2
++//    Uncached in VPU L2
++int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
++{   // Same result convention as gpu_malloc_uncached: 0 or AVERROR(ENOMEM)
++    return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached");
++}
++
++void gpu_free(GPU_MEM_PTR_T * const p) {  // public wrapper: unlocks and frees the block, then zeroes *p
++    gpu_free_internal(p);
++}
++
++void rpi_mem_gpu_uninit(void)
++{   // Tear down in the reverse order of rpi_mem_gpu_init(): VCSM first, then bcm_host
++    vcsm_exit();
++    bcm_host_deinit();
++}
++
++int rpi_mem_gpu_init(const unsigned int flags)
++{   // Initialise VCSM, then bcm_host; returns use_cma + 1 (1 or 2), or AVERROR(EINVAL) if both VCSM attempts fail
++    const int wants_cma = bcm_host_is_fkms_active();
++    int use_cma;
++
++    (void)flags;  // currently unused
++
++    if (vcsm_init_ex(wants_cma ? 1 : 0, -1) == 0)       // first attempt: the mode suggested by wants_cma
++        use_cma = 1;   // NOTE(review): set to 1 even when wants_cma == 0 (the call above passed 0) - confirm intent
++    else if (vcsm_init_ex(wants_cma ? 0 : 1, -1) == 0)  // second attempt: the opposite mode
++        use_cma = 0;   // NOTE(review): likewise 0 even when this attempt passed 1
++    else
++        return AVERROR(EINVAL);
++
++    bcm_host_init();
++
++    return use_cma + 1;  // presumably meant to line up with GPU_INIT_GPU (1) / GPU_INIT_CMA (2) in rpi_mem.h - TODO confirm
++}
++
++// ----------------------------------------------------------------------------
++//
++// Cache flush functions
++
++#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))  // number of block entries that fit in an rpi_cache_buf_t after the fixed header
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
++{ // The env lives inside the caller's buf: nothing is allocated here
++  rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf;
++  *rfe = (rpi_cache_flush_env_t){.v={.op_count = 0}};  // start with an empty operation list
++  return rfe;
++}
++
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
++{ // Drops the env without flushing; rfe is deliberately unused
++  // Nothing needed
++}
++
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
++{   // Submit the queued operations (if any) and empty the queue; returns 0 or AVERROR(errno)
++    int rc = 0;
++    if (rfe->v.op_count != 0) {
++        if (vcsm_clean_invalid2(&rfe->v) != 0)
++        {
++          const int err = errno;  // capture before av_log() gets a chance to change it
++          av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", err);
++          rc = AVERROR(err);
++        }
++        rfe->v.op_count = 0;  // the queue is cleared whether or not the call succeeded
++    }
++    return rc;
++}
++
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
++{
++  // Run whatever is queued and hand back its status; the env lives in
++  // the caller's buffer, so there is nothing further to release.
++  return rpi_cache_flush_execute(rfe);
++}
++
++inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
++{ // Queue one operation: 'blocks' runs of block_size bytes, block_stride apart, starting at gm->arm + offset0
++  struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;  // claim the next free slot
++  // Capacity is only checked by av_assert1, and only after the slot has been claimed
++  av_assert1(rfe->v.op_count <= CACHE_EL_MAX);
++
++  b->invalidate_mode = mode;
++  b->block_count = blocks;
++  b->start_address = gm->arm + offset0;
++  b->block_size = block_size;
++  b->inter_block_stride = block_stride;
++}
++
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++  const unsigned int offset, const unsigned int size)
++{ // Queue a single contiguous range [offset, offset + size) of gm
++  // Deal with empty pointer trivially
++  if (gm == NULL || size == 0)
++    return;
++  // Checked individually as well as summed, so an overflowing offset + size is also caught
++  av_assert1(offset <= gm->numbytes);
++  av_assert1(size <= gm->numbytes);
++  av_assert1(offset + size <= gm->numbytes);
++
++  rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);  // one block, no stride
++}
++
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
++{ // Queue the whole of gm; unlike rpi_cache_flush_add_gm_range there is no NULL check on gm
++  rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
++}
++
++
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
++{ // Queue every GPU buffer behind frame: one when gpu_is_buf1(), otherwise buffers 0..2
++#if !RPI_ONE_BUF
++#error Fixme! (NIF)
++#endif
++  if (gpu_is_buf1(frame)) {
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
++  }
++  else
++  {
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
++    rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
++  }
++}
++
++// Flush an area of a frame
++// Width, height, x0, y0 in luma pels
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
++  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++  const unsigned int uv_shift, const int do_luma, const int do_chroma)
++{ // do_luma / do_chroma select which planes get queued; x0 and width are only used for sand frames
++  const unsigned int y_offset = frame->linesize[0] * y0;
++  const unsigned int y_size = frame->linesize[0] * height;
++  // Round UV up/down to get everything
++  const unsigned int uv_rnd = (1U << uv_shift) >> 1;
++  const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
++  const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
++
++#if 0
++  // *** frame->height is cropped height so not good
++  // As all unsigned they will also reject -ve
++  // Test individually as well as added to reject overflow
++  av_assert0(start_line <= (unsigned int)frame->height);  // ***** frame height cropped
++  av_assert0(n <= (unsigned int)frame->height);
++  av_assert0(start_line + n <= (unsigned int)frame->height);
++#endif
++  // Case 1: separate buffers per plane - offsets are relative to each plane's own buffer
++  if (!gpu_is_buf1(frame))
++  {
++    if (do_luma) {
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
++    }
++    if (do_chroma) {
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
++      rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
++    }
++  }
++  else if (!av_rpi_is_sand_frame(frame))  // Case 2: one buffer, non-sand layout - rebase plane pointers onto gm->arm
++  {
++    const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
++    if (do_luma) {
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
++    }
++    if (do_chroma) {
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
++      rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
++    }
++  }
++  else  // Case 3: sand frame - one strided block list per plane
++  {
++    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++    const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
++    const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);  // x0 rounded down to a multiple of (stride1 >> xshl)
++    const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1;  // Same for Y & C
++    av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
++    // NOTE(review): chroma position uses a fixed >> 1 below rather than uv_shift - confirm this is intended
++    if (do_chroma)
++    {
++      struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++      b->invalidate_mode = mode;
++      b->block_count = block_count;
++      b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
++      b->block_size = uv_size;
++      b->inter_block_stride = stride1 * stride2;
++    }
++    if (do_luma)
++    {
++      struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
++      b->invalidate_mode = mode;
++      b->block_count = block_count;
++      b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
++      b->block_size = y_size;
++      b->inter_block_stride = stride1 * stride2;
++    }
++  }
++}
++
++// Flush the whole of one GPU buffer using 'mode': init + add + finish in one call
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
++{
++  rpi_cache_buf_t cbuf;  // stack storage for the temporary flush env
++  rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
++  rpi_cache_flush_add_gm_ptr(rfe, p, mode);
++  rpi_cache_flush_finish(rfe);  // the flush status is discarded
++}
++
+diff --git a/libavcodec/rpi_mem.h b/libavcodec/rpi_mem.h
+new file mode 100644
+index 0000000000..a451079806
+--- /dev/null
++++ b/libavcodec/rpi_mem.h
+@@ -0,0 +1,88 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_MEM_H
++#define RPI_MEM_H
++
++typedef struct gpu_mem_ptr_s {
++  unsigned char *arm; // Pointer to memory mapped on ARM side
++  int vc_handle;   // Videocore handle of relocatable memory
++  int vcsm_handle; // Handle for use by VCSM
++  int vc;       // Address for use in GPU code
++  int numbytes; // Size of memory block
++} GPU_MEM_PTR_T;
++
++// General GPU functions
++
++#define GPU_INIT_GPU 1
++#define GPU_INIT_CMA 2
++
++extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
++extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
++extern void gpu_free(GPU_MEM_PTR_T * const p);
++int rpi_mem_gpu_init(const unsigned int flags);
++void rpi_mem_gpu_uninit(void);
++
++// Cache flush stuff
++
++struct rpi_cache_flush_env_s;
++typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
++
++typedef struct {uint32_t t[33];} rpi_cache_buf_t;
++
++rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
++// Free env without flushing
++void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & clear but do not free the env
++int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
++// Do the accumulated flush & free the env
++int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
++
++typedef enum  // Cache operation requested when adding memory to a flush env
++{
++    RPI_CACHE_FLUSH_MODE_INVALIDATE     = 1,
++    RPI_CACHE_FLUSH_MODE_WRITEBACK      = 2,
++    RPI_CACHE_FLUSH_MODE_WB_INVALIDATE  = 3   // Numerically INVALIDATE | WRITEBACK
++} rpi_cache_flush_mode_t;
++
++struct AVFrame;
++void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
++  const unsigned int offset, const unsigned int size);
++void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
++  const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
++void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode);
++void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const struct AVFrame * const frame, const rpi_cache_flush_mode_t mode,
++  const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
++  const unsigned int uv_shift, const int do_luma, const int do_chroma);
++
++// init, add, finish for one gm ptr
++void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
++
++#endif
+diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
+new file mode 100644
+index 0000000000..cb7b96119e
+--- /dev/null
++++ b/libavcodec/rpi_qpu.c
+@@ -0,0 +1,776 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#include <stddef.h>
++#include <stdint.h>
++#include "libavutil/avassert.h"
++
++#include "config.h"
++
++#include <pthread.h>
++#include <time.h>
++
++#include <interface/vcsm/user-vcsm.h>
++
++#include "rpi_mailbox.h"
++#include "rpi_mem.h"
++#include "rpi_qpu.h"
++#include "rpi_hevc_shader.h"
++#include "rpi_hevc_transform8.h"
++#include "rpi_hevc_transform10.h"
++#include "libavutil/rpi_sand_fns.h"
++
++// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
++#define RPI_TRACE_TIME_VPU_QPU_WAIT     0
++
++// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
++// Beware this is expensive and will probably throw off all other timing by >10%
++#define RPI_TRACE_QPU_PROFILE_ALL       0
++
++// QPU "noflush" flags
++// a mixture of flushing & profiling (bit values, may be ORed together)
++
++#define QPU_FLAGS_NO_FLUSH_VPU          1       // If unset VPU cache will be flushed
++#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2       // Clear & Enable detailed QPU profiling registers
++#define QPU_FLAGS_PROF_OUTPUT_COUNTS    4       // Print the results
++#define QPU_FLAGS_OUTPUT_QPU_TIMES      8       // Print QPU times - independent of the profiling
++#define QPU_FLAGS_NO_FLUSH_QPU          16      // If unset flush QPU caches & TMUs (uniforms always flushed)
++
++#define vcos_verify_ge0(x) ((x)>=0)
++
++// Size in 32bit words
++#define QPU_CODE_SIZE 4098
++#define VPU_CODE_SIZE 16384
++
++static const short rpi_transMatrix2even[32][16] = { // Even rows first
++{64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64},
++{90,  87,  80,  70,  57,  43,  25,   9,  -9, -25, -43, -57, -70, -80, -87, -90},
++{89,  75,  50,  18, -18, -50, -75, -89, -89, -75, -50, -18,  18,  50,  75,  89},
++{87,  57,   9, -43, -80, -90, -70, -25,  25,  70,  90,  80,  43,  -9, -57, -87},
++{83,  36, -36, -83, -83, -36,  36,  83,  83,  36, -36, -83, -83, -36,  36,  83},
++{80,   9, -70, -87, -25,  57,  90,  43, -43, -90, -57,  25,  87,  70,  -9, -80},
++{75, -18, -89, -50,  50,  89,  18, -75, -75,  18,  89,  50, -50, -89, -18,  75},
++{70, -43, -87,   9,  90,  25, -80, -57,  57,  80, -25, -90,  -9,  87,  43, -70},
++{64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64,  64, -64, -64,  64},
++{57, -80, -25,  90,  -9, -87,  43,  70, -70, -43,  87,   9, -90,  25,  80, -57},
++{50, -89,  18,  75, -75, -18,  89, -50, -50,  89, -18, -75,  75,  18, -89,  50},
++{43, -90,  57,  25, -87,  70,   9, -80,  80,  -9, -70,  87, -25, -57,  90, -43},
++{36, -83,  83, -36, -36,  83, -83,  36,  36, -83,  83, -36, -36,  83, -83,  36},
++{25, -70,  90, -80,  43,   9, -57,  87, -87,  57,  -9, -43,  80, -90,  70, -25},
++{18, -50,  75, -89,  89, -75,  50, -18, -18,  50, -75,  89, -89,  75, -50,  18},
++{ 9, -25,  43, -57,  70, -80,  87, -90,  90, -87,  80, -70,  57, -43,  25,  -9},
++// Odd rows
++{90,  90,  88,  85,  82,  78,  73,  67,  61,  54,  46,  38,  31,  22,  13,   4},
++{90,  82,  67,  46,  22,  -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
++{88,  67,  31, -13, -54, -82, -90, -78, -46,  -4,  38,  73,  90,  85,  61,  22},
++{85,  46, -13, -67, -90, -73, -22,  38,  82,  88,  54,  -4, -61, -90, -78, -31},
++{82,  22, -54, -90, -61,  13,  78,  85,  31, -46, -90, -67,   4,  73,  88,  38},
++{78,  -4, -82, -73,  13,  85,  67, -22, -88, -61,  31,  90,  54, -38, -90, -46},
++{73, -31, -90, -22,  78,  67, -38, -90, -13,  82,  61, -46, -88,  -4,  85,  54},
++{67, -54, -78,  38,  85, -22, -90,   4,  90,  13, -88, -31,  82,  46, -73, -61},
++{61, -73, -46,  82,  31, -88, -13,  90,  -4, -90,  22,  85, -38, -78,  54,  67},
++{54, -85,  -4,  88, -46, -61,  82,  13, -90,  38,  67, -78, -22,  90, -31, -73},
++{46, -90,  38,  54, -90,  31,  61, -88,  22,  67, -85,  13,  73, -82,   4,  78},
++{38, -88,  73,  -4, -67,  90, -46, -31,  85, -78,  13,  61, -90,  54,  22, -82},
++{31, -78,  90, -61,   4,  54, -88,  82, -38, -22,  73, -90,  67, -13, -46,  85},
++{22, -61,  85, -90,  73, -38,  -4,  46, -78,  90, -82,  54, -13, -31,  67, -88},
++{13, -38,  61, -78,  88, -90,  85, -73,  54, -31,   4,  22, -46,  67, -82,  90},
++{ 4, -13,  22, -31,  38, -46,  54, -61,  67, -73,  78, -82,  85, -88,  90, -90}
++};
++
++// Code/constants on GPU - layout of the block that gpu_init() fills in code_gm_ptr
++struct GPU
++{
++//  unsigned int qpu_code[QPU_CODE_SIZE];
++    unsigned int vpu_code8[VPU_CODE_SIZE];   // Receives rpi_hevc_transform8 (returned by vpu_get_fn(8))
++    unsigned int vpu_code10[VPU_CODE_SIZE];  // Receives rpi_hevc_transform10 (returned by vpu_get_fn(10))
++    short transMatrix2even[16*16*2];         // Receives a copy of rpi_transMatrix2even
++};
++
++#define WAIT_COUNT_MAX 16
++
++typedef struct trace_time_one_s  // Nested interval timer used by the wait tracing code
++{
++    int count;                      // Number of intervals currently open (next free level)
++    int64_t start[WAIT_COUNT_MAX];  // Start time (ns) of the open interval at each level
++    int64_t total[WAIT_COUNT_MAX];  // Accumulated time (ns) at each level
++} trace_time_one_t;
++
++typedef struct trace_time_wait_s  // Trace state kept in gpu_env_t when RPI_TRACE_TIME_VPU_QPU_WAIT is set
++{
++    unsigned int jcount;      // Incremented once per vq_wait_delete()
++    int64_t start0;           // Reference start time; 0 until set from active.start[0]
++    int64_t last_update;      // Advanced by WAIT_TIME_PRINT_PERIOD each time stats are printed
++    trace_time_one_t active;  // Opened in vq_wait_new(), closed in vq_wait_post()
++    trace_time_one_t wait;    // Opened in vq_wait_wait(), closed in vq_wait_delete()
++} trace_time_wait_t;
++
++typedef struct vq_wait_s  // One wait object: posted by vq_wait_post(), waited on by vq_wait_wait()
++{
++    sem_t sem;
++    struct vq_wait_s * next;  // Free-list link while the object sits in the pool
++} vq_wait_t;
++
++#define VQ_WAIT_POOL_SIZE 16  // Number of wait objects in the fixed pool
++typedef struct vq_wait_pool_s
++{
++    vq_wait_t * head;                   // First free entry, NULL when none are free
++    vq_wait_t pool[VQ_WAIT_POOL_SIZE];  // Backing storage for all entries
++} vq_wait_pool_t;
++
++static void vq_wait_pool_init(vq_wait_pool_t * const pool);
++static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
++
++typedef struct gpu_env_s  // Process-wide GPU state; the single instance is the static 'gpu'
++{
++    int open_count;                 // References: ++ in gpu_lock_ref(), -- in gpu_unlock_unref()
++    int init_count;                 // vpu_qpu_init() calls not yet matched by vpu_qpu_term()
++    int vpu_i_cache_flushed;        // 0 until the first VPU job is queued, then 1
++    GPU_MEM_PTR_T qpu_code_gm_ptr;  // QPU shader code (copy of ff_hevc_rpi_shader)
++    GPU_MEM_PTR_T code_gm_ptr;      // struct GPU: VPU code + transform constants
++    GPU_MEM_PTR_T dummy_gm_ptr;     // 0x4000 byte dummy "frame" filled with 0x80
++    vq_wait_pool_t wait_pool;       // Pool of wait objects for job completion
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    trace_time_wait_t ttw;
++#endif
++} gpu_env_t;
++
++// Stop more than one thread trying to allocate memory or use the processing resources at once
++static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
++static gpu_env_t * gpu = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++
++static int64_t ns_time(void)  // CLOCK_MONOTONIC reading converted to nanoseconds
++{
++    struct timespec ts;
++    clock_gettime(CLOCK_MONOTONIC, &ts);  // Return value is not checked
++    return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
++}
++
++
++#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000  // 2e9 ns between stats printouts
++
++#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)  // Millisecond part of a ns time
++#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)        // Whole seconds of a ns time
++#define T_ARG(t) T_SEC(t), T_MS(t)                              // Argument pair matching T_FMT
++#define T_FMT "%u.%03u"
++
++static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
++{  // Print one line of accumulated times for tto, labelled with prefix
++    // Fold the time so far of still-open levels into their totals and restart them at 'now'
++    for (int i = 0; i < tto->count; ++i) {
++        tto->total[i] += now - tto->start[i];
++        tto->start[i] = now;
++    }
++
++    printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
++         prefix,
++         T_ARG(now - start0 - tto->total[0]),  // Idle = elapsed since start0 minus level-0 total
++         T_ARG(tto->total[0]),                 // Columns 1..4 are total[0]..total[3]
++         T_ARG(tto->total[1]),
++         T_ARG(tto->total[2]),
++         T_ARG(tto->total[3]));
++}
++
++
++static void tto_start(trace_time_one_t * const tto, const int64_t now)
++{  // Open a new interval at the next level, starting at 'now'
++    av_assert0(tto->count < WAIT_COUNT_MAX);
++    tto->start[tto->count++] = now;
++}
++
++static void tto_end(trace_time_one_t * const tto, const int64_t now)
++{  // Close the most recently opened interval and add its duration to that level's total
++    const int n = --tto->count;
++    av_assert0(n >= 0);  // Catches a tto_end() without a matching tto_start()
++    tto->total[n] += now - tto->start[n];
++}
++
++static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
++{  // Print job count, total elapsed time, then the active and wait breakdowns
++    printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
++    tto_print(&ttw->active, now, ttw->start0, "Active");
++    tto_print(&ttw->wait,   now, ttw->start0, "  Wait");
++}
++
++#endif
++
++// GPU memory alloc fns (internal)
++
++static void gpu_free_internal(GPU_MEM_PTR_T * const p)
++{  // Release whichever parts of *p are set, then zero it; harmless on an all-zero *p
++    if (p->arm != NULL)
++        vcsm_unlock_ptr(p->arm);
++    if (p->vcsm_handle != 0)
++        vcsm_free(p->vcsm_handle);
++    memset(p, 0, sizeof(*p));  // Ensure we crash hard if we try and use this again
++}
++
++
++static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
++    const int numbytes, const unsigned int cache_type, const char * const name)
++{  // Allocate and lock a VCSM block, filling in *p; returns 0, or AVERROR(ENOMEM) with *p zeroed
++    memset(p, 0, sizeof(*p));
++    p->numbytes = (numbytes + 255) & ~255;  // Round up to a multiple of 256 bytes
++
++    if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||  // NOTE(review): 0x80 presumably forces eager page mapping - confirm
++        (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
++        (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
++        (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
++    {
++        gpu_free_internal(p);  // Any step failing releases what was obtained so far
++        return AVERROR(ENOMEM);
++    }
++    return 0;
++}
++
++
++// GPU init, free, lock, unlock
++
++static void gpu_term(void)
++{  // Tear down the global env; called from gpu_unlock_unref() when open_count reaches 0
++    gpu_env_t * const ge = gpu;
++
++    // We have to hope that everything has terminated...
++    gpu = NULL;
++
++    vc_gpuserv_deinit();  // NOTE(review): vpu_qpu_term() also calls this - confirm repeated deinit is safe
++
++    gpu_free_internal(&ge->code_gm_ptr);
++    gpu_free_internal(&ge->qpu_code_gm_ptr);
++    gpu_free_internal(&ge->dummy_gm_ptr);
++
++    vcsm_exit();
++
++    vq_wait_pool_deinit(&ge->wait_pool);
++
++    free(ge);
++}
++
++
++// Connect to QPU, returns 0 on success. On failure everything acquired here is released.
++static int gpu_init(gpu_env_t ** const gpu) {
++    volatile struct GPU* ptr;
++    gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
++    int rv;
++    *gpu = NULL;
++
++    if (ge == NULL)
++        return -1;
++
++    vq_wait_pool_init(&ge->wait_pool);
++    vcsm_init();
++
++    // Now copy over the QPU code into GPU memory
++    if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
++        goto fail;
++    {
++        int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
++        av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
++        memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
++        memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
++    }
++
++    // And the VPU code
++    if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
++        goto fail;
++    ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
++
++    // Zero everything so we have zeros between the code bits
++    memset((void *)ptr, 0, sizeof(*ptr));
++    av_assert0(sizeof(rpi_hevc_transform8) <= VPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, sizeof(rpi_hevc_transform8));
++    av_assert0(sizeof(rpi_hevc_transform10) <= VPU_CODE_SIZE*sizeof(unsigned int));
++    memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, sizeof(rpi_hevc_transform10));
++    // And the transform coefficients
++    memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
++
++    // Generate a dummy "frame" & fill with 0x80
++    // * Could reset to 1 <<bit_depth?
++    if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
++        goto fail;
++    memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
++
++    *gpu = ge;
++    return 0;
++
++fail:   // Unwind; gpu_free_internal() is harmless on blocks that are still (or again) all-zero
++    gpu_free_internal(&ge->code_gm_ptr);
++    gpu_free_internal(&ge->qpu_code_gm_ptr);
++    vcsm_exit();
++    vq_wait_pool_deinit(&ge->wait_pool);
++    free(ge);
++    return rv;
++}
++
++
++
++static void gpu_unlock(void) {  // Release gpu_mutex
++    pthread_mutex_unlock(&gpu_mutex);
++}
++
++// Take gpu_mutex and return the existing global env; does not create it or take a reference
++static gpu_env_t * gpu_lock(void) {
++    pthread_mutex_lock(&gpu_mutex);
++
++    av_assert1(gpu != NULL);  // The env must already have been created by gpu_lock_ref()
++    return gpu;
++}
++
++static gpu_env_t * gpu_lock_ref(void)
++{  // Take gpu_mutex, create the env on first use and add a reference to it
++    pthread_mutex_lock(&gpu_mutex);
++
++    if (gpu == NULL) {
++        int rv = gpu_init(&gpu);
++        if (rv != 0) {
++            gpu_unlock();  // Failure path: the mutex is NOT held when NULL is returned
++            return NULL;
++        }
++    }
++
++    ++gpu->open_count;
++    return gpu;  // Success path: returns with gpu_mutex still held
++}
++
++static void gpu_unlock_unref(gpu_env_t * const ge)
++{  // Drop one reference (tearing the env down on the last one), then release gpu_mutex
++    if (--ge->open_count == 0)
++        gpu_term();  // Frees ge; it must not be touched after this
++
++    gpu_unlock();
++}
++
++static inline gpu_env_t * gpu_ptr(void)
++{  // Return the global env without taking gpu_mutex
++    av_assert1(gpu != NULL);
++    return gpu;
++}
++
++unsigned int vpu_get_fn(const unsigned int bit_depth) {  // GPU address of the VPU code for 8 or 10 bit
++  uint32_t a = 0;
++
++  // Make sure that the gpu is initialized
++  av_assert1(gpu != NULL);
++  switch (bit_depth){
++    case 8:
++      a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
++      break;
++    case 10:
++      a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
++      break;
++    default:
++      av_assert0(0);  // Any other bit depth is a caller error
++  }
++  return a;
++}
++
++unsigned int vpu_get_constants(void) {  // GPU address of the transMatrix2even table
++  av_assert1(gpu != NULL);
++  return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
++}
++
++void gpu_ref(void)
++{  // Take a reference on the GPU env, creating it on first use
++  if (gpu_lock_ref() != NULL)  // On failure gpu_lock_ref() has already released the mutex
++    gpu_unlock();
++}
++
++void gpu_unref(void)
++{  // Drop a reference on the GPU env; the env is torn down when the count reaches 0
++  gpu_env_t * const ge = gpu_lock();
++  gpu_unlock_unref(ge);
++}
++
++// ----------------------------------------------------------------------------
++
++
++// Wait abstractions - mostly so we can easily add profile code
++static void vq_wait_pool_init(vq_wait_pool_t * const wp)
++{  // Initialise every semaphore to 0 and chain all entries into the free list
++  unsigned int i;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_init(&wp->pool[i].sem, 0, 0);
++    wp->pool[i].next = wp->pool + i + 1;
++  }
++  wp->head = wp->pool + 0;
++  wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;  // Terminate the list (the loop left a one-past-end link)
++}
++
++static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
++{  // Destroy every semaphore and empty the free list
++  unsigned int i;
++  wp->head = NULL;
++  for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
++    sem_destroy(&wp->pool[i].sem);
++    wp->pool[i].next = NULL;
++  }
++}
++
++
++// Take a wait object from the env's pool, also taking a reference on the env
++static vq_wait_t * vq_wait_new(void)
++{
++  gpu_env_t * const ge = gpu_lock_ref();  // NOTE(review): NULL (init failure) is not checked
++  vq_wait_t * const wait = ge->wait_pool.head;  // NOTE(review): NULL (pool exhausted) is not checked
++  ge->wait_pool.head = wait->next;
++  wait->next = NULL;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  tto_start(&ge->ttw.active, ns_time());
++#endif
++
++  gpu_unlock();  // The reference is kept; vq_wait_delete() drops it
++  return wait;
++}
++
++static void vq_wait_delete(vq_wait_t * const wait)
++{  // Return a wait object to the pool and drop the env reference taken by vq_wait_new()
++  gpu_env_t * const ge = gpu_lock();
++  wait->next = ge->wait_pool.head;
++  ge->wait_pool.head = wait;
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    trace_time_wait_t * const ttw = &ge->ttw;
++    const int64_t now = ns_time();
++    ++ttw->jcount;
++    tto_end(&ttw->wait, now);
++
++    if (ttw->start0 == 0)  // First completed job: establish the reference start time
++    {
++      ttw->start0 = ttw->active.start[0];
++      ttw->last_update = ttw->start0;
++    }
++    if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)  // Print stats at most once per period
++    {
++      ttw->last_update += WAIT_TIME_PRINT_PERIOD;
++      ttw_print(ttw, now);
++    }
++  }
++#endif
++  gpu_unlock_unref(ge);
++}
++
++static void vq_wait_wait(vq_wait_t * const wait)
++{  // Block until the wait object's semaphore is posted
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++      const int64_t now = ns_time();
++      gpu_env_t * const ge = gpu_lock();
++      tto_start(&ge->ttw.wait, now);
++      gpu_unlock();
++  }
++#endif
++
++  while (sem_wait(&wait->sem) == -1 && errno == EINTR)  // Retry if interrupted by a signal
++    /* loop */;
++}
++
++static void vq_wait_post(vq_wait_t * const wait)
++{  // Signal completion: wakes a thread blocked in vq_wait_wait() on this object
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++  {
++    gpu_env_t *const ge = gpu_lock();
++    tto_end(&ge->ttw.active, ns_time());
++    gpu_unlock();
++  }
++#endif
++
++  sem_post(&wait->sem);
++}
++
++
++
++// Header comments were wrong for these two
++#define VPU_QPU_MASK_QPU  1
++#define VPU_QPU_MASK_VPU  2
++
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
++{  // Reset caller-provided storage to an empty job list and return it
++//  vpu_qpu_job_env_t * vqj = calloc(1, sizeof(vpu_qpu_job_env_t));
++  vpu_qpu_job_env_t * vqj = buf;
++//  memset(vqj, 0, sizeof(*vqj));
++  vqj->n = 0;     // Only n and mask are reset; j[] is left as it was
++  vqj->mask = 0;
++  return vqj;
++}
++
++void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
++{  // Currently a no-op: the env lives in storage supplied to vpu_qpu_job_init()
++//  memset(vqj, 0, sizeof(*vqj));
++//  free(vqj);
++}
++
++static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
++{  // Claim the next free slot in vqj->j[]
++  struct gpu_job_s * const j = vqj->j + vqj->n++;
++  av_assert1(vqj->n <= VPU_QPU_JOB_MAX);  // Checked after the increment, and only where av_assert1 is enabled
++  return j;
++}
++
++void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
++{  // Append an EXECUTE_VPU job for vpu_code with r0..r5 in q[1..6]; vpu_code == 0 is ignored
++  if (vpu_code != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_VPU;
++
++    j->command = EXECUTE_VPU;
++    j->callback.func = 0;
++    j->callback.cookie = NULL;
++    // The bottom two bits of the execute address contain no-flush flags
++    // b0 will flush the VPU I-cache if unset so we nearly always want that set
++    // as we never reload code
++    j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;  // NOTE(review): global flag used without gpu_mutex
++    j->u.v.q[1] = r0;
++    j->u.v.q[2] = r1;
++    j->u.v.q[3] = r2;
++    j->u.v.q[4] = r3;
++    j->u.v.q[5] = r4;
++    j->u.v.q[6] = r5;
++    gpu->vpu_i_cache_flushed = 1;  // Later jobs get b0 set, so only the first one requests a flush
++  }
++}
++
++// Append an EXECUTE_QPU job for n QPUs; mail holds n * QPU_MAIL_EL_VALS words. n == 0 is ignored
++void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
++{
++  if (n != 0) {
++    struct gpu_job_s *const j = new_job(vqj);
++    vqj->mask |= VPU_QPU_MASK_QPU;
++
++    j->command = EXECUTE_QPU;
++    j->callback.func = 0;
++    j->callback.cookie = NULL;
++
++    j->u.q.jobs = n;
++#if RPI_TRACE_QPU_PROFILE_ALL
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
++#else
++    j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;  // noflush takes QPU_FLAGS_xxx bits
++#endif
++    j->u.q.timeout = 5000;  // NOTE(review): units not visible here (presumably ms) - confirm
++    memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
++  }
++}
++
++// Job-completion callback: v is the vq_wait_t to post
++static void vpu_qpu_job_callback_wait(void * v)
++{
++  vq_wait_post(v);
++}
++
++// Job-completion callback: v is a user-supplied sem_t to post
++static void vpu_qpu_job_callback_sem(void * v)
++{
++  sem_post((sem_t *)v);
++}
++
++void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
++{  // Arrange for *wait_h to be signalled when the jobs queued so far finish (NULL if none pending)
++  vq_wait_t * wait;
++
++  if (vqj->mask == 0) {  // Nothing queued since the last sync
++    *wait_h = NULL;
++    return;
++  }
++
++  // We are going to want a sync object
++  wait = vq_wait_new();
++
++  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++  // If we only posted one thing or only QPU jobs
++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++  {
++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);  // Attach the callback to the last job
++    av_assert1(j->callback.func == 0);
++
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
++  else
++  {
++    struct gpu_job_s *const j = new_job(vqj);  // Otherwise append an explicit sync job
++
++    j->command = EXECUTE_SYNC;
++    j->u.s.mask = vqj->mask;
++    j->callback.func = vpu_qpu_job_callback_wait;
++    j->callback.cookie = wait;
++  }
++
++  vqj->mask = 0;  // Everything queued so far is now covered by a sync
++  *wait_h = wait;
++}
++
++// Post sem when the jobs queued so far finish. Returns 0 if no sync added ('cos Q empty), 1 if sync added
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
++{
++  // If nothing on q then just return
++  if (vqj->mask == 0)
++    return 0;
++
++  // There are 2 VPU Qs & 1 QPU Q so we can collapse sync
++  // If we only posted one thing or only QPU jobs
++  if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
++  {
++    struct gpu_job_s * const j = vqj->j + (vqj->n - 1);  // Attach the callback to the last job
++    av_assert1(j->callback.func == 0);
++
++    j->callback.func = vpu_qpu_job_callback_sem;
++    j->callback.cookie = sem;
++  }
++  else
++  {
++    struct gpu_job_s *const j = new_job(vqj);  // Otherwise append an explicit sync job
++
++    j->command = EXECUTE_SYNC;
++    j->u.s.mask = vqj->mask;
++    j->callback.func = vpu_qpu_job_callback_sem;
++    j->callback.cookie = sem;
++  }
++
++  vqj->mask = 0;  // Everything queued so far is now covered by a sync
++  return 1;
++}
++
++
++int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
++{  // Submit all queued jobs; returns 0 if there are none, else vc_gpuserv_execute_code()'s result
++  if (vqj->n == 0)
++    return 0;
++
++  return vc_gpuserv_execute_code(vqj->n, vqj->j);
++}
++
++// Simple wrapper of start + delete; returns the result of vpu_qpu_job_start()
++int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
++{
++  int rv;
++  rv = vpu_qpu_job_start(vqj);
++  vpu_qpu_job_delete(vqj);
++  return rv;
++}
++
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
++{  // Wait for the sync behind *wait_h, then recycle it; no-op if wait_h or *wait_h is NULL
++  if (wait_h != NULL)
++  {
++    vq_wait_t * const wait = *wait_h;
++    if (wait != NULL) {
++      *wait_h = NULL;  // Cleared before waiting so the handle cannot be waited on twice
++      vq_wait_wait(wait);
++      vq_wait_delete(wait);
++    }
++  }
++}
++
++int vpu_qpu_init(void)
++{  // Take a reference on the GPU env and start gpuserv on first use; returns 0, or -1 on failure
++  gpu_env_t * const ge = gpu_lock_ref();
++  if (ge == NULL)
++    return -1;
++
++  if (ge->init_count++ == 0)
++  {
++    vc_gpuserv_init();
++  }
++
++  gpu_unlock();  // The reference is kept until vpu_qpu_term()
++  return 0;
++}
++
++void vpu_qpu_term(void)
++{  // Undo one vpu_qpu_init(): stop gpuserv on the last call and drop the env reference
++  gpu_env_t * const ge = gpu_lock();
++
++  if (--ge->init_count == 0) {
++    vc_gpuserv_deinit();
++
++#if RPI_TRACE_TIME_VPU_QPU_WAIT
++    ttw_print(&ge->ttw, ns_time());
++#endif
++  }
++
++  gpu_unlock_unref(ge);
++}
++
++uint32_t qpu_fn(const int * const mc_fn)
++{  // GPU address of mc_fn: its byte offset within ff_hevc_rpi_shader added to the uploaded copy's base
++  return gpu->qpu_code_gm_ptr.vc + ((const char *)mc_fn - (const char *)ff_hevc_rpi_shader);
++}
++
++uint32_t qpu_dummy(void)
++{  // GPU address of the dummy frame buffer allocated in gpu_init()
++  return gpu->dummy_gm_ptr.vc;
++}
++
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
++{  // Fill *qf with the QPU filter addresses for bit_depth (8 or 10); returns 0, or -1 if unsupported
++  // Dummy values we can catch with emulation (every field, so none is left uninitialised on failure)
++  qf->y_pxx = ~1U;
++  qf->y_bxx = ~2U;
++  qf->y_p00 = ~3U;
++  qf->y_b00 = ~4U;
++  qf->c_pxx = ~5U;
++  qf->c_bxx = ~6U;
++  qf->c_pxx_l1 = ~7U;
++
++  switch (bit_depth) {
++    case 8:
++      qf->y_pxx = qpu_fn(mc_filter_y_pxx);
++      qf->y_bxx = qpu_fn(mc_filter_y_bxx);
++      qf->y_p00 = qpu_fn(mc_filter_y_p00);
++      qf->y_b00 = qpu_fn(mc_filter_y_b00);
++      qf->c_pxx = qpu_fn(mc_filter_c_p);
++      qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
++      qf->c_bxx = qpu_fn(mc_filter_c_b);
++      break;
++    case 10:
++      qf->c_pxx = qpu_fn(mc_filter_c10_p);
++      qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
++      qf->c_bxx = qpu_fn(mc_filter_c10_b);
++      qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
++      qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
++      qf->y_p00 = qpu_fn(mc_filter_y10_p00);
++      qf->y_b00 = qpu_fn(mc_filter_y10_b00);
++      break;
++    default:
++      return -1;
++  }
++  return 0;
++}
++
+diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
+new file mode 100644
+index 0000000000..8777687021
+--- /dev/null
++++ b/libavcodec/rpi_qpu.h
+@@ -0,0 +1,103 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_QPU_H
++#define RPI_QPU_H
++
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#pragma GCC diagnostic ignored "-Wstrict-prototypes"
++#include "interface/vmcs_host/vc_vchi_gpuserv.h"  // for gpu_job_s
++#pragma GCC diagnostic pop
++
++// QPU specific functions
++
++typedef struct HEVCRpiQpu {  // QPU filter addresses (from qpu_fn()) filled by rpi_hevc_qpu_init_fn()
++    uint32_t c_pxx;     // mc_filter_c_p / mc_filter_c10_p
++    uint32_t c_pxx_l1;  // mc_filter_c_p_l1 / mc_filter_c10_p_l1
++    uint32_t c_bxx;     // mc_filter_c_b / mc_filter_c10_b
++    uint32_t y_pxx;     // mc_filter_y_pxx / mc_filter_y10_pxx
++    uint32_t y_bxx;     // mc_filter_y_bxx / mc_filter_y10_bxx
++    uint32_t y_p00;     // mc_filter_y_p00 / mc_filter_y10_p00
++    uint32_t y_b00;     // mc_filter_y_b00 / mc_filter_y10_b00
++} HEVCRpiQpu;
++
++int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
++
++uint32_t qpu_fn(const int * const mc_fn);
++uint32_t qpu_dummy(void);
++
++#define QPU_N_GRP    4
++#define QPU_N_MAX    12
++
++#define QPU_MAIL_EL_VALS  2
++
++struct vpu_qpu_wait_s;
++typedef struct vq_wait_s * vpu_qpu_wait_h;
++
++// VPU specific functions
++
++struct vpu_qpu_job_env_s;
++typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
++
++#define VPU_QPU_JOB_MAX 4  // Capacity of the job array in vpu_qpu_job_env_s
++struct vpu_qpu_job_env_s
++{
++  unsigned int n;     // Number of entries of j[] in use
++  unsigned int mask;  // Kinds of job (VPU and/or QPU bits) queued since the last sync was added
++  struct gpu_job_s j[VPU_QPU_JOB_MAX];
++};
++typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
++
++vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
++void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
++void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
++  const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
++void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
++void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
++int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
++int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
++
++extern unsigned int vpu_get_fn(const unsigned int bit_depth);
++extern unsigned int vpu_get_constants(void);
++
++// Waits for the sync attached to *wait_h to complete and nulls out *wait_h; no-op if it is already NULL
++void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
++int vpu_qpu_init(void);
++void vpu_qpu_term(void);
++
++void gpu_ref(void);
++void gpu_unref(void);
++
++#endif
+diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
+new file mode 100644
+index 0000000000..37be9a0f49
+--- /dev/null
++++ b/libavcodec/rpi_zc.c
+@@ -0,0 +1,1227 @@
++#include "config.h"
++
++#include "libavcodec/avcodec.h"
++#include "rpi_mem.h"
++#include "rpi_mailbox.h"
++#include "rpi_zc.h"
++#include "libavutil/avassert.h"
++#include <pthread.h>
++
++#include "libavutil/buffer_internal.h"
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <interface/vctypes/vc_image_types.h>
++#include <interface/vcsm/user-vcsm.h>
++#pragma GCC diagnostic pop
++
++#define TRACE_ALLOC 0
++#define DEBUG_ALWAYS_KEEP_LOCKED 0
++
++struct ZcPoolEnt;
++
++typedef struct ZcPool
++{
++    size_t numbytes;
++    struct ZcPoolEnt * head;
++    pthread_mutex_t lock;
++} ZcPool;
++
++typedef struct ZcPoolEnt
++{
++    size_t numbytes;
++
++    unsigned int vcsm_handle;  // Handle returned by vcsm_malloc_cache() in zc_pool_ent_alloc()
++    unsigned int vc_handle;
++    void * map_arm;            // NOTE(review): presumably the ARM-side mapping - confirm
++    unsigned int map_vc;       // NOTE(review): presumably the GPU-side address - confirm
++
++    struct ZcPoolEnt * next;
++    struct ZcPool * pool;
++} ZcPoolEnt;
++
++typedef struct ZcOldCtxVals
++{
++    int thread_safe_callbacks;
++    int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
++    void * opaque;
++} ZcOldCtxVals;
++
++typedef struct AVZcEnv
++{
++    unsigned int refcount;      // Changed via atomic_fetch_add(); env is torn down when it reaches 0
++    ZcOldCtxVals old;
++
++    void * pool_env;            // Opaque pool context handed to alloc_buf / free_pool
++    av_rpi_zc_alloc_buf_fn_t * alloc_buf;  // Allocates the backing buffer when a frame is resolved
++    av_rpi_zc_free_pool_fn_t * free_pool;  // Called with pool_env on the final release
++
++    unsigned int pool_size;     // Stored/returned by the decoder_pool_size accessors
++} ZcEnv;
++
++typedef struct ZcUserBufEnv {
++    void * v;                           // User context passed to every fn-table callback
++    const av_rpi_zc_buf_fn_tab_t * fn;  // Callbacks that yield handles/mappings and free v
++    size_t numbytes;                    // Size as given to av_rpi_zc_buf()
++    int offset;                         // Added to the ARM mapping to form the buffer's data pointer
++} ZcUserBufEnv;
++
++#define ZC_BUF_INVALID  0   // State of a zero-initialised ZcBufEnv: contents not valid yet
++#define ZC_BUF_VALID    1   // Set by av_rpi_zc_set_valid_frame()
++#define ZC_BUF_NEVER    2   // Set by av_rpi_zc_set_broken_frame(); resolving then returns EINVAL
++
++typedef struct ZcBufEnv {
++    GPU_MEM_PTR_T gmem;         // Must stay first: pic_gm_ptr() relies on &zbe->gmem == zbe
++    AVZcEnvPtr zc;              // Env this buffer holds a reference on
++    int is_valid;               // One of ZC_BUF_INVALID / ZC_BUF_VALID / ZC_BUF_NEVER
++    AVBufferRef * user;         // Backing allocation; NULL until the buffer is resolved
++    AVRpiZcFrameGeometry geo;   // Geometry noted by av_rpi_zc_get_buffer()
++    size_t size_y;              // stride_y * height_y
++    size_t size_c;              // stride_c * height_c
++    size_t size_pic;            // (size_y + size_c * planes_c) * stripes
++    ssize_t offset;             // Copied from the user buffer's offset on resolve
++    pthread_mutex_t lock;       // With cond: lets a resolver wait until is_valid is set
++    pthread_cond_t cond;
++} ZcBufEnv;
++
++
++
++
++
++
++#define ALLOC_PAD       0       // Extra bytes added to each pool allocation before rounding
++#define ALLOC_ROUND     0x1000  // Pool allocations are rounded up to a multiple of this
++#define STRIDE_ROUND    64      // Planar luma stride is rounded up to a multiple of this
++#define STRIDE_OR       0       // Bits OR-ed into the planar luma stride
++
++#define DEBUG_ZAP0_BUFFERS 0    // NOTE(review): no use visible in this part of the file
++
++static inline int av_rpi_is_sand_format(const int format)
++{   // Non-zero for the two RPI4 formats and for the SAND128..SAND64_16 range
++    return format == AV_PIX_FMT_RPI4_8 || format == AV_PIX_FMT_RPI4_10 ||
++        (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)  // av_rpi_is_sand_format() on frame->format
++{
++    return av_rpi_is_sand_format(frame->format);
++}
++
++//----------------------------------------------------------------------------
++//
++// Internal pool stuff
++
++// Pool entry functions
++
++static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const size_t req_size)
++{
++    // Create a pool entry backed by a new VCSM allocation of at least
++    // req_size bytes. Logs and returns NULL on failure.
++    // GPU allocation size = req_size + ALLOC_PAD, rounded up to ALLOC_ROUND
++    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
++    ZcPoolEnt * const zp = av_mallocz(sizeof(ZcPoolEnt));
++    if (zp == NULL) {
++        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
++        goto fail0;
++    }
++
++    // The 0x80 here maps all pages here rather than waiting for lazy mapping
++    // BEWARE that in GPU land a later unlock/lock pair will put us back into
++    // lazy mode - which will also break cache invalidate calls.
++    if ((zp->vcsm_handle = vcsm_malloc_cache(alloc_size, VCSM_CACHE_TYPE_HOST | 0x80, "ffmpeg_rpi_zc")) == 0)
++    {
++        av_log(NULL, AV_LOG_ERROR, "av_gpu_malloc_cached(%u) failed\n", alloc_size);
++        goto fail1;
++    }
++
++#if TRACE_ALLOC
++    printf("%s: Alloc %#x bytes @ h=%d\n", __func__, alloc_size, zp->vcsm_handle);
++#endif
++    // Keep the requested size, not the rounded one: zc_pool_put_ent() recycles only if it equals pool->numbytes
++    zp->numbytes = req_size;
++    zp->pool = pool;
++    return zp;
++
++fail1:
++    av_free(zp);
++fail0:
++    return NULL;
++}
++
++static void zc_pool_ent_free(ZcPoolEnt * const zp)
++{
++#if TRACE_ALLOC
++    printf("%s: Free %#zx bytes @ h=%u\n", __func__, zp->numbytes, zp->vcsm_handle);
++#endif
++    // Release the GPU allocation (if there is one), then the entry itself
++    if (zp->vcsm_handle != 0)
++    {
++        // VC addr & handle need no dealloc; only the vcsm_lock() mapping is undone
++        if (zp->map_arm != NULL)
++            vcsm_unlock_hdl(zp->vcsm_handle);
++        vcsm_free(zp->vcsm_handle);
++    }
++    av_free(zp);
++}
++
++//----------------------------------------------------------------------------
++//
++// Pool functions
++
++static void zc_pool_free_ent_list(ZcPoolEnt * p)
++{
++    ZcPoolEnt * next;  // Link is read before each node is freed
++    for (; p != NULL; p = next)
++    {
++        next = p->next;
++        zc_pool_ent_free(p);
++    }
++}
++
++static void zc_pool_flush(ZcPool * const pool)  // Free all idle entries; pool->lock is not taken here
++{
++    ZcPoolEnt * p = pool->head;
++    pool->head = NULL;
++    pool->numbytes = ~0U;  // Back to an "unsized" sentinel (zc_pool_new() uses -1)
++    zc_pool_free_ent_list(p);
++}
++
++static ZcPoolEnt * zc_pool_get_ent(ZcPool * const pool, const size_t req_bytes)
++{
++    ZcPoolEnt * zp = NULL;
++    ZcPoolEnt * flush_list = NULL;
++    size_t numbytes;
++    // Get an entry of the pool's current size, reusing an idle one when there is one
++    pthread_mutex_lock(&pool->lock);
++
++    numbytes = pool->numbytes;
++
++    // If size isn't close then dump the pool
++    // Close in this context means within 128k
++    if (req_bytes > numbytes || req_bytes + 0x20000 < numbytes)
++    {
++        flush_list = pool->head;
++        pool->head = NULL;
++        pool->numbytes = numbytes = req_bytes;
++    }
++    else if (pool->head != NULL)
++    {
++        zp = pool->head;
++        pool->head = zp->next;
++    }
++
++    pthread_mutex_unlock(&pool->lock);
++    // Stale entries are freed, and any new entry allocated, outside the lock
++    zc_pool_free_ent_list(flush_list);
++
++    if (zp == NULL)
++        zp = zc_pool_ent_alloc(pool, numbytes);
++
++    return zp;
++}
++
++static void zc_pool_put_ent(ZcPoolEnt * const zp)
++{
++    // Hand an entry back to its pool. An entry whose size differs from the
++    // pool's current size is freed instead. NULL is ignored.
++    ZcPool * owner;
++    int recycle;
++
++    if (zp == NULL)
++        return;
++    owner = zp->pool;
++    pthread_mutex_lock(&owner->lock);
++#if TRACE_ALLOC
++    printf("%s: Recycle %#x, %#x\n", __func__, owner->numbytes, zp->numbytes);
++#endif
++    if ((recycle = (owner->numbytes == zp->numbytes)) != 0) {
++        zp->next = owner->head;
++        owner->head = zp;
++    }
++    pthread_mutex_unlock(&owner->lock);
++
++    if (!recycle)
++        zc_pool_ent_free(zp);
++}
++
++static ZcPool *
++zc_pool_new(void)
++{
++    // Create an empty pool with no size chosen yet; NULL if allocation fails
++    ZcPool * const pool = av_mallocz(sizeof(*pool));
++    if (pool != NULL) {
++        pool->head = NULL;
++        pool->numbytes = -1;
++        pthread_mutex_init(&pool->lock, NULL);
++    }
++    return pool;
++}
++
++static void
++zc_pool_delete(ZcPool * const pool)
++{
++    // Free every idle entry and then the pool itself; NULL is a no-op
++    if (pool == NULL)
++        return;
++    pool->numbytes = -1;
++    zc_pool_flush(pool);
++    pthread_mutex_destroy(&pool->lock);
++    av_free(pool);
++}
++
++//============================================================================
++//
++// ZC implementation using above pool implementation
++//
++// Fn table fns...
++
++static void zc_pool_free_v(void * v)  // Fn-table .free: v is the ZcPoolEnt to hand back to its pool
++{
++    zc_pool_put_ent(v);
++}
++
++// Fn-table .vcsm_handle: VCSM handle of the ZcPoolEnt passed as v
++static unsigned int zc_pool_ent_vcsm_handle_v(void * v)
++{
++    return ((ZcPoolEnt *)v)->vcsm_handle;
++}
++
++// Fn-table .vc_handle: look the VC handle up on first use and cache it (0 on failure)
++static unsigned int zc_pool_ent_vc_handle_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    if (ent->vc_handle == 0 &&
++        (ent->vc_handle = vcsm_vc_hdl_from_hdl(ent->vcsm_handle)) == 0)
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC handle\n",
++               __func__, ent->vcsm_handle);
++    return ent->vc_handle;
++}
++
++// Fn-table .map_arm: vcsm_lock() the entry on first use and cache the address (NULL on failure)
++static void * zc_pool_ent_map_arm_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    if (ent->map_arm == NULL &&
++        (ent->map_arm = vcsm_lock(ent->vcsm_handle)) == NULL)
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to ARM address\n",
++               __func__, ent->vcsm_handle);
++    return ent->map_arm;
++}
++
++// Fn-table .map_vc: look the VC address up on first use and cache it (0 on failure)
++static unsigned int zc_pool_ent_map_vc_v(void * v)
++{
++    ZcPoolEnt * const ent = v;
++
++    if (ent->map_vc == 0 &&
++        (ent->map_vc = vcsm_vc_addr_from_hdl(ent->vcsm_handle)) == 0)
++        av_log(NULL, AV_LOG_ERROR, "%s: Failed to map VCSM handle %d to VC address\n",
++               __func__, ent->vcsm_handle);
++    return ent->map_vc;
++}
++
++static const av_rpi_zc_buf_fn_tab_t zc_pool_buf_fns = {  // Callbacks for buffers backed by a ZcPoolEnt
++    .free        = zc_pool_free_v,
++    .vcsm_handle = zc_pool_ent_vcsm_handle_v,
++    .vc_handle   = zc_pool_ent_vc_handle_v,
++    .map_arm     = zc_pool_ent_map_arm_v,
++    .map_vc      = zc_pool_ent_map_vc_v,
++};
++
++// ZC Env fns
++
++// Delete pool
++// All buffers guaranteed freed by now
++static void
++zc_pool_delete_v(void * v)  // free_pool callback installed by av_rpi_zc_int_env_alloc(); v is the ZcPool
++{
++    zc_pool_delete((ZcPool *)v);
++    rpi_mem_gpu_uninit();  // Pairs with rpi_mem_gpu_init() in av_rpi_zc_int_env_alloc()
++}
++
++// Allocate a new ZC buffer
++static AVBufferRef *
++zc_pool_buf_alloc(void * v, size_t size, const AVRpiZcFrameGeometry * geo)
++{
++    ZcPool * const pool = v;
++    ZcPoolEnt *const zp = zc_pool_get_ent(pool, size);
++    AVBufferRef * buf;
++    // v is the ZcPool. Returns a buffer wrapping a pool entry, or NULL on failure
++    (void)geo;  // geo ignored here
++
++    if (zp == NULL) {
++        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%zu) failed\n", size);  // size_t needs %zu
++        goto fail0;
++    }
++
++    if ((buf = av_rpi_zc_buf(size, 0, zp, &zc_pool_buf_fns)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_buf() failed\n");
++        goto fail2;
++    }
++
++    return buf;
++
++fail2:
++    zc_pool_put_ent(zp);  // Entry goes back to the pool rather than being leaked
++fail0:
++    return NULL;
++}
++
++// Init wrappers - the public fns
++
++AVZcEnvPtr
++av_rpi_zc_int_env_alloc(void * logctx)
++{
++    ZcEnv * zc;
++    ZcPool * pool_env;
++    // Build a ZC env backed by the internal pool; returns NULL if any step fails
++    if (rpi_mem_gpu_init(0) < 0)
++        return NULL;
++
++    if ((pool_env = zc_pool_new()) == NULL)
++        goto fail1;
++
++    if ((zc = av_rpi_zc_env_alloc(logctx, pool_env, zc_pool_buf_alloc, zc_pool_delete_v)) == NULL)
++        goto fail2;
++
++    return zc;
++
++fail2:
++    zc_pool_delete(pool_env);
++fail1:
++    rpi_mem_gpu_uninit();  // Undo the init above
++    return NULL;
++}
++
++void
++av_rpi_zc_int_env_freep(AVZcEnvPtr * zcp)  // zcp itself must be non-NULL; *zcp may be NULL
++{
++    const AVZcEnvPtr zc = *zcp;
++    *zcp = NULL;  // Caller's pointer is cleared before the reference is dropped
++    if (zc != NULL)
++        av_rpi_zc_env_release(zc);
++}
++
++//============================================================================
++//
++// Geometry
++//
++// This is a separate chunk from the rest
++
++// Get mailbox fd - should be in a lock when called
++// Rely on process close to close it
++static int mbox_fd(void)
++{
++    // Opened on first use and cached; while the cached value is -1 every call tries mbox_open() again
++    static int cached_fd = -1;
++    if (cached_fd == -1)
++        cached_fd = mbox_open();
++    return cached_fd;
++}
++
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format, const unsigned int video_width, const unsigned int video_height)
++{
++    static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;  // Single lock for all mailbox queries & caches
++    // Returns the buffer layout for format/size; unsupported formats leave every layout field 0
++    AVRpiZcFrameGeometry geo = {
++        .format       = format,
++        .video_width  = video_width,
++        .video_height = video_height
++    };
++
++    switch (format)
++    {
++        case AV_PIX_FMT_YUV420P:
++            geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++            geo.stride_c = geo.stride_y / 2;
++            geo.height_y = (video_height + 32 + 31) & ~31;
++            geo.height_c = geo.height_y / 2;
++            geo.planes_c = 2;
++            geo.stripes = 1;
++            geo.bytes_per_pel = 1;
++            geo.stripe_is_yc = 1;
++            break;
++
++        case AV_PIX_FMT_YUV420P10:
++            geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
++            geo.stride_c = geo.stride_y / 2;
++            geo.height_y = (video_height + 32 + 31) & ~31;
++            geo.height_c = geo.height_y / 2;
++            geo.planes_c = 2;
++            geo.stripes = 1;
++            geo.bytes_per_pel = 2;
++            geo.stripe_is_yc = 1;
++            break;
++
++        case AV_PIX_FMT_SAND128:
++        case AV_PIX_FMT_RPI4_8:
++        {
++            const unsigned int stripe_w = 128;
++
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV_UV,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.stripe_is_yc = 1;
++            if (geo.height_y * stripe_w > img.pitch)
++            {
++                // "tall" sand - all C blocks now follow Y
++                geo.height_y = img.pitch / stripe_w;
++                geo.height_c = geo.height_y;
++                geo.stripe_is_yc = 0;
++            }
++            geo.planes_c = 1;
++            geo.stripes = (video_width + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++#if 0
++            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++                   video_width, video_height,
++                   geo.stride_y, geo.stride_c,
++                   geo.height_y, geo.height_c,
++                   geo.stripes, img.pitch);
++#endif
++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++            break;
++        }
++
++        case AV_PIX_FMT_RPI4_10:
++        {
++            const unsigned int stripe_w = 128;  // bytes
++
++            // Uses the function-level sand_lock so mbox_fd() is serialised across all formats
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV10COL,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.planes_c = 1;
++            geo.stripes = ((video_width * 4 + 2) / 3 + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 1;
++            geo.stripe_is_yc = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++
++#if 0
++            printf("Req: %dx%d: stride=%d/%d, height=%d/%d, stripes=%d, img.pitch=%d\n",
++                   video_width, video_height,
++                   geo.stride_y, geo.stride_c,
++                   geo.height_y, geo.height_c,
++                   geo.stripes, img.pitch);
++#endif
++            av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
++            av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
++            break;
++        }
++
++        case AV_PIX_FMT_SAND64_16:
++        case AV_PIX_FMT_SAND64_10:
++        {
++            const unsigned int stripe_w = 128;  // bytes
++
++            // Uses the function-level sand_lock so mbox_fd() is serialised across all formats
++            static VC_IMAGE_T img = {0};
++
++            // Given the overhead of calling the mailbox keep a stashed
++            // copy as we will almost certainly just want the same numbers again
++            // but that means we need a lock
++            pthread_mutex_lock(&sand_lock);
++
++            if (img.width != video_width || img.height != video_height)
++            {
++                VC_IMAGE_T new_img = {
++                    .type = VC_IMAGE_YUV_UV_16,
++                    .width = video_width,
++                    .height = video_height
++                };
++
++                mbox_get_image_params(mbox_fd(), &new_img);
++                img = new_img;
++            }
++
++            geo.stride_y = stripe_w;
++            geo.stride_c = stripe_w;
++            geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
++            geo.height_c = img.pitch / stripe_w - geo.height_y;
++            geo.planes_c = 1;
++            geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
++            geo.bytes_per_pel = 2;
++            geo.stripe_is_yc = 1;
++
++            pthread_mutex_unlock(&sand_lock);
++            break;
++        }
++
++        default:
++            break;
++    }
++    return geo;
++}
++
++//============================================================================
++//
++// ZC Env fns
++//
++// Frame copy fns
++
++static AVBufferRef * zc_copy(const AVZcEnvPtr zc,
++    const AVFrame * const src)
++{
++    AVFrame dest_frame;
++    AVFrame * const dest = &dest_frame;
++    unsigned int i;
++    uint8_t * psrc, * pdest;
++    // Copy the three planes of src into a newly allocated ZC buffer; NULL on failure
++    dest->format = src->format;
++    dest->width = src->width;
++    dest->height = src->height;
++    if (av_rpi_zc_get_buffer(zc, dest) != 0)
++        return NULL;
++    if (av_rpi_zc_resolve_frame(dest, ZC_RESOLVE_ALLOC_VALID) != 0) {
++        av_buffer_unref(&dest->buf[0]);  // dest is a stack frame: drop the env buffer or it leaks
++        return NULL;
++    }
++
++    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
++         i != dest->height;
++         ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
++    {
++        memcpy(pdest, psrc, dest->width);
++    }
++    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
++         i != dest->height / 2;
++         ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
++    {
++        memcpy(pdest, psrc, dest->width / 2);
++    }
++    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
++         i != dest->height / 2;
++         ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
++    {
++        memcpy(pdest, psrc, dest->width / 2);
++    }
++
++    return dest->buf[0];
++}
++
++
++static AVBufferRef * zc_420p10_to_sand128(const AVZcEnvPtr zc,
++    const AVFrame * const src)
++{
++    assert(0);  // Conversion is not implemented; if assert() is compiled out this just returns NULL
++    return NULL;
++}
++
++
++static AVBufferRef * zc_sand64_16_to_sand128(const AVZcEnvPtr zc,
++    const AVFrame * const src, const unsigned int src_bits)
++{
++    assert(0);  // Conversion is not implemented; if assert() is compiled out this just returns NULL
++    return NULL;
++}
++
++//----------------------------------------------------------------------------
++//
++// Public info extraction calls
++
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data);
++
++static inline ZcBufEnv * pic_zbe_ptr(AVBufferRef *const buf)
++{
++    // Returns the ZcBufEnv behind buf, or NULL if buf is NULL or not ours.
++    // Kludge: a buffer counts as ours if its free fn is zc_buf_env_free_cb
++    return buf == NULL || buf->buffer->free != zc_buf_env_free_cb ? NULL :
++        av_buffer_get_opaque(buf);
++}
++
++static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
++{
++    // gmem is the first member of ZcBufEnv, so a NULL env yields a NULL result
++    return &pic_zbe_ptr(buf)->gmem;
++}
++
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? 0 : p->vcsm_handle;  // 0 when fr_ref is not one of our buffers
++}
++
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? -1 : p->vc_handle;  // -1 (not 0) when fr_ref is not one of our buffers
++}
++
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++    return zbe == NULL ? 0 : zbe->offset;  // Offset copied from the user buffer on resolve; 0 if not ours
++}
++
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++    return zbe == NULL ? 0 : zbe->size_pic;  // Picture size from the geometry; 0 if not ours
++}
++
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
++{
++    const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
++    return p == NULL ? 0 : p->numbytes;  // gmem.numbytes is filled in on resolve; 0 if not ours
++}
++
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref)
++{
++    const ZcBufEnv * const zbe = pic_zbe_ptr(fr_ref);
++    return zbe == NULL ? NULL : &zbe->geo;  // Points into the buffer's env; NULL if not ours
++}
++
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logctx, const AVZcEnvPtr zc,
++    const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
++{
++    av_assert0(!maycopy || zc != NULL);
++    // Returns a new ref to frame's ZC buffer, a copy in a new ZC buffer if maycopy allows, or NULL
++    if (frame->format != AV_PIX_FMT_YUV420P &&
++        frame->format != AV_PIX_FMT_YUV420P10 &&
++        !av_rpi_is_sand_frame(frame))
++    {
++        av_log(logctx, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
++        return NULL;
++    }
++
++    if (frame->buf[1] != NULL || frame->format != expected_format)
++    {
++#if RPI_ZC_SAND_8_IN_10_BUF
++        if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
++        {
++//            av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
++            return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
++        }
++#endif
++
++        if (maycopy)
++        {
++            if (frame->buf[1] != NULL)
++                av_log(logctx, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
++            else
++                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
++
++            switch (frame->format)
++            {
++                case AV_PIX_FMT_YUV420P10:
++                    return zc_420p10_to_sand128(zc, frame);
++
++                case AV_PIX_FMT_SAND64_10:
++                    return zc_sand64_16_to_sand128(zc, frame, 10);
++
++                default:
++                    return zc_copy(zc, frame);
++            }
++        }
++        else
++        {
++            if (frame->buf[1] != NULL)
++                av_log(logctx, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
++            else
++                av_log(logctx, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
++            return NULL;
++        }
++    }
++    // Single-buffer frame in the expected format: check the buffer is one of ours
++    if (pic_gm_ptr(frame->buf[0]) == NULL)
++    {
++        if (maycopy)
++        {
++            av_log(logctx, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
++            return zc_copy(zc, frame);
++        }
++        else
++        {
++            av_log(logctx, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
++            return NULL;
++        }
++    }
++
++    return av_buffer_ref(frame->buf[0]);
++}
++
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
++{
++    // Drop one buffer reference; NULL is ignored
++    if (fr_ref == NULL)
++        return;
++    av_buffer_unref(&fr_ref);
++}
++
++//----------------------------------------------------------------------------
++
++// Extract user environment from an AVBufferRef
++void * av_rpi_zc_buf_v(AVBufferRef * const buf)
++{
++    // NULL unless buf is one of ours and already has a user buffer attached
++    ZcBufEnv * const env = pic_zbe_ptr(buf);
++    const ZcUserBufEnv * ub;
++    if (env == NULL || env->user == NULL)
++        return NULL;
++    ub = (const ZcUserBufEnv *)env->user->data;
++    return ub != NULL ? ub->v : NULL;
++}
++
++// AV buffer pre-free callback
++static void zc_user_buf_free_cb(void * opaque, uint8_t * data)
++{
++    // opaque is the ZcUserBufEnv; data is not used here
++    ZcUserBufEnv * const ub = opaque;
++
++    if (ub == NULL)
++        return;
++
++    if (ub->fn->free)
++        ub->fn->free(ub->v);
++    av_free(ub);
++}
++
++static void zc_buf_env_free_cb(void * opaque, uint8_t * data)
++{
++    // Buffer free callback: drop the user buffer and the env reference,
++    // then destroy the sync objects and the ZcBufEnv itself
++    ZcBufEnv * const env = opaque;
++
++    if (env == NULL)
++        return;
++
++    av_buffer_unref(&env->user);
++    if (env->zc != NULL)
++        av_rpi_zc_env_release(env->zc);
++    pthread_cond_destroy(&env->cond);
++    pthread_mutex_destroy(&env->lock);
++    av_free(env);
++}
++
++
++// Wrap the various ZC bits in an AV Buffer and resolve those things we want
++// resolved now.
++// Currently we resolve everything, but in future we might not
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab)
++{
++    AVBufferRef *buf;
++    ZcUserBufEnv * zub;
++    // Wrap (v, fn_tab) in an AVBufferRef; nothing is resolved here. Returns NULL on failure
++    if ((zub = av_malloc(sizeof(ZcUserBufEnv))) == NULL)
++        return NULL;
++
++    zub->fn = fn_tab;
++    zub->v = v;
++    zub->numbytes = numbytes;
++    zub->offset = addr_offset;
++
++    if ((buf = av_buffer_create((uint8_t*)zub, sizeof(*zub), zc_user_buf_free_cb, zub, 0)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "ZC: Failed av_buffer_create\n");
++        av_free(zub);  // v is left for the caller to dispose of
++        return NULL;
++    }
++
++    return buf;
++}
++
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int alloc_mode)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(buf);
++    // Make buf usable as alloc_mode dictates: fail if not valid, wait for validity, or allocate backing store
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++
++    if (alloc_mode == ZC_RESOLVE_FAIL && !zbe->is_valid)
++        return AVERROR(EAGAIN);
++
++    if (alloc_mode == ZC_RESOLVE_WAIT_VALID && !zbe->is_valid)
++    {
++        pthread_mutex_lock(&zbe->lock);
++        while (!zbe->is_valid)
++            pthread_cond_wait(&zbe->cond, &zbe->lock);
++        pthread_mutex_unlock(&zbe->lock);
++    }
++
++    if (zbe->is_valid == ZC_BUF_NEVER)
++        return AVERROR(EINVAL);
++
++    // Do alloc if we need it
++    if (zbe->user == NULL)
++    {
++        ZcEnv * const zc = zbe->zc;
++        const ZcUserBufEnv * zub;
++
++        av_assert0(alloc_mode == ZC_RESOLVE_ALLOC || alloc_mode == ZC_RESOLVE_ALLOC_VALID);
++
++        if ((zbe->user = zc->alloc_buf(zc->pool_env, zbe->size_pic, &zbe->geo)) == NULL)
++        {
++            av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++            goto fail;
++        }
++        zub = (const ZcUserBufEnv *)zbe->user->data;
++
++        // Track
++
++        zbe->offset = zub->offset;
++        zbe->gmem.numbytes = zub->numbytes;
++        if ((zbe->gmem.arm =  zub->fn->map_arm(zub->v)) == NULL)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to lock vcsm_handle %u\n", zbe->gmem.vcsm_handle);  // NOTE(review): handle is only assigned just below
++            goto fail;
++        }
++
++        if ((zbe->gmem.vcsm_handle = zub->fn->vcsm_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vcsm_handle\n");
++            goto fail;
++        }
++
++        if ((zbe->gmem.vc_handle = zub->fn->vc_handle(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc handle from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++        if ((zbe->gmem.vc = zub->fn->map_vc(zub->v)) == 0)
++        {
++            av_log(NULL, AV_LOG_ERROR, "ZC: Failed to get vc addr from vcsm_handle %u\n", zbe->gmem.vcsm_handle);
++            goto fail;
++        }
++        // Point the underlying AVBuffer at the mapped picture memory
++        buf->buffer->data = zbe->gmem.arm + zbe->offset;
++        buf->buffer->size = zbe->size_pic;
++
++        // In this mode we shouldn't have anyone waiting for us
++        // so no need to signal
++        if (alloc_mode == ZC_RESOLVE_ALLOC_VALID)
++            zbe->is_valid = 1;
++    }
++
++    // Just overwrite - no point in testing
++    buf->data = zbe->gmem.arm + zbe->offset;
++    buf->size = zbe->size_pic;
++    return 0;
++
++fail:
++    av_buffer_unref(&zbe->user);  // Leaves zbe->user NULL again
++    return AVERROR(ENOMEM);
++}
++
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc)
++{
++    int rv;
++    // Resolve frame->buf[0], then fill in data[] / linesize[] if they are not set yet
++    // Do alloc if we need it
++    if ((rv = av_rpi_zc_resolve_buffer(frame->buf[0], may_alloc)) != 0)
++        return rv;
++
++    // If we are a framebuf copy then the alloc can be done but we haven't
++    // imported its results yet
++    if (frame->data[0] == NULL)
++    {
++        const ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);  // Non-NULL: resolve_buffer succeeded above
++
++        frame->linesize[0] = zbe->geo.stride_y;
++        frame->linesize[1] = zbe->geo.stride_c;
++        frame->linesize[2] = zbe->geo.stride_c;
++        // abuse: linesize[3] = "stripe stride"
++        // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++        // In a general case this makes the calculation an xor and multiply rather
++        // than a divide and multiply
++        if (zbe->geo.stripes > 1)
++            frame->linesize[3] = zbe->geo.stripe_is_yc ? zbe->geo.height_y + zbe->geo.height_c : zbe->geo.height_y;
++
++        frame->data[0] = frame->buf[0]->data;
++        frame->data[1] = frame->data[0] + (zbe->geo.stripe_is_yc ? zbe->size_y : zbe->size_y * zbe->geo.stripes);
++        if (zbe->geo.planes_c > 1)
++            frame->data[2] = frame->data[1] + zbe->size_c;
++
++        frame->extended_data = frame->data;
++        // Leave extended buf alone
++    }
++
++    return 0;
++}
++
++int av_rpi_zc_set_valid_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++    // Mark the frame's buffer valid and wake any av_rpi_zc_resolve_buffer() waiters
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++    pthread_mutex_lock(&zbe->lock);  // Flag must change under the lock or a waiter can miss the wakeup
++    zbe->is_valid = ZC_BUF_VALID;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++    return 0;
++}
++
++int av_rpi_zc_set_broken_frame(AVFrame * const frame)
++{
++    ZcBufEnv * const zbe = pic_zbe_ptr(frame->buf[0]);
++    // Mark the frame's buffer as never-valid and wake any av_rpi_zc_resolve_buffer() waiters
++    if (zbe == NULL)
++        return AVERROR(EINVAL);
++    pthread_mutex_lock(&zbe->lock);  // Flag must change under the lock or a waiter can miss the wakeup
++    zbe->is_valid = ZC_BUF_NEVER;
++    pthread_cond_broadcast(&zbe->cond);
++    pthread_mutex_unlock(&zbe->lock);
++    return 0;
++}
++
++void av_rpi_zc_set_decoder_pool_size(ZcEnv *const zc, const unsigned int pool_size)
++{
++    zc->pool_size = pool_size;  // Only stored here; zc must be non-NULL
++}
++
++unsigned int av_rpi_zc_get_decoder_pool_size(ZcEnv *const zc)
++{
++    return zc->pool_size;  // 0 unless av_rpi_zc_set_decoder_pool_size() changed it
++}
++
++int av_rpi_zc_get_buffer(ZcEnv *const zc, AVFrame * const frame)
++{
++#if 1  // Deferred allocation: only the env is created here; the #else branch (eager alloc) is compiled out
++    ZcBufEnv * zbe = av_mallocz(sizeof(*zbe));
++    // Frame is cleared before the NULL check so it is in a known state on every return path
++    for (unsigned int i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++        frame->buf[i] = NULL;
++        frame->data[i] = NULL;
++        frame->linesize[i] = 0;
++    }
++
++    if (zbe == NULL)
++        return AVERROR(ENOMEM);
++
++    if ((frame->buf[0] = av_buffer_create((uint8_t *)zbe, sizeof(*zbe), zc_buf_env_free_cb, zbe, 0)) == NULL)
++    {
++        av_free(zbe);
++        return AVERROR(ENOMEM);
++    }
++
++    pthread_mutex_init(&zbe->lock, NULL);
++    pthread_cond_init(&zbe->cond, NULL);
++    zbe->zc = zc;
++    atomic_fetch_add(&zc->refcount, 1);  // Released again in zc_buf_env_free_cb()
++
++    zbe->geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);  // Note geometry for later use
++    zbe->size_y = zbe->geo.stride_y * zbe->geo.height_y;
++    zbe->size_c = zbe->geo.stride_c * zbe->geo.height_c;
++    zbe->size_pic = (zbe->size_y + zbe->size_c * zbe->geo.planes_c) * zbe->geo.stripes;
++
++#else
++    const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
++    const unsigned int size_y = geo.stride_y * geo.height_y;
++    const unsigned int size_c = geo.stride_c * geo.height_c;
++    const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
++    AVBufferRef * buf;
++    unsigned int i;
++
++//    printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
++
++    if ((buf = zc->alloc_buf(zc->pool_env, size_pic, &geo)) == NULL)
++    {
++        av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
++        return AVERROR(ENOMEM);
++    }
++
++    // Track
++    atomic_fetch_add(&zc->refcount, 1);
++    pic_zbe_ptr(buf)->zc = zc;
++
++    for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
++        frame->buf[i] = NULL;
++        frame->data[i] = NULL;
++        frame->linesize[i] = 0;
++    }
++
++    frame->buf[0] = buf;
++
++    frame->linesize[0] = geo.stride_y;
++    frame->linesize[1] = geo.stride_c;
++    frame->linesize[2] = geo.stride_c;
++    // abuse: linesize[3] = "stripe stride"
++    // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
++    // In a general case this makes the calculation an xor and multiply rather
++    // than a divide and multiply
++    if (geo.stripes > 1)
++        frame->linesize[3] = geo.stripe_is_yc ? geo.height_y + geo.height_c : geo.height_y;
++
++    frame->data[0] = buf->data;
++    frame->data[1] = frame->data[0] + (geo.stripe_is_yc ? size_y : size_y * geo.stripes);
++    if (geo.planes_c > 1)
++        frame->data[2] = frame->data[1] + size_c;
++
++    frame->extended_data = frame->data;
++    // Leave extended buf alone
++
++#if RPI_ZC_SAND_8_IN_10_BUF != 0
++    // *** If we intend to use this for real we will want a 2nd buffer pool
++    frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = zc_pool_buf_alloc(&zc->pool, size_pic);  // *** 2 * wanted size - kludge
++#endif
++#endif
++
++    return 0;
++}
++
++void av_rpi_zc_env_release(const AVZcEnvPtr zc)
++{
++    // Drop one reference; the caller that removes the last one tears the env down
++    const int prev = atomic_fetch_add(&zc->refcount, -1);
++    if (prev != 1)  // others still hold references
++        return;
++    zc->free_pool(zc->pool_env);
++    av_free(zc);
++}
++
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++                    void * pool_env,
++                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++    ZcEnv * zc;
++    // Create an env with refcount 1 around the given pool callbacks; NULL if allocation fails
++    if ((zc = av_mallocz(sizeof(ZcEnv))) == NULL)
++    {
++        av_log(logctx, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
++        return NULL;
++    }
++
++    *zc = (ZcEnv){
++        .refcount = ATOMIC_VAR_INIT(1),
++        .pool_env = pool_env,
++        .alloc_buf = alloc_buf_fn,
++        .free_pool = free_pool_fn,
++        .pool_size = 0
++    };
++
++    return zc;
++}
++
++//============================================================================
++//
++// External ZC initialisation
++
++#define RPI_GET_BUFFER2 1
++
++
++static int zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
++{   // get_buffer2 callback installed by av_rpi_zc_init2; s->opaque is passed on as the ZC env
++#if !RPI_GET_BUFFER2
++    return avcodec_default_get_buffer2(s, frame, flags);
++#else
++    int rv;
++
++    if ((s->codec->capabilities & AV_CODEC_CAP_DR1) == 0)  // codec does not advertise DR1
++    {
++//        printf("Do default alloc: format=%#x\n", frame->format);
++        rv = avcodec_default_get_buffer2(s, frame, flags);
++    }
++    else if (frame->format == AV_PIX_FMT_YUV420P ||
++             av_rpi_is_sand_frame(frame))  // the only formats that get a ZC buffer here
++    {
++        if ((rv = av_rpi_zc_get_buffer(s->opaque, frame)) == 0)
++            rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_ALLOC_VALID);
++    }
++    else  // every other pixel format uses the default allocator
++    {
++        rv = avcodec_default_get_buffer2(s, frame, flags);
++    }
++
++#if 0
++    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
++        frame->format, frame->width, frame->height,
++        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
++        frame->data[0], frame->data[1], frame->data[2],
++        frame->buf[0], frame->buf[1], frame->buf[2],
++        av_buffer_get_opaque(frame->buf[0]));
++#endif
++    return rv;  // 0 on success or the error from whichever allocator ran
++#endif
++}
++
++int av_rpi_zc_in_use(const struct AVCodecContext * const s)
++{   // Non-zero iff the context's get_buffer2 is our ZC allocator
++    return s->get_buffer2 == zc_get_buffer2;
++}
++
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++                    void * pool_env,
++                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn)
++{
++    ZcEnv * env;
++
++    av_assert0(!av_rpi_zc_in_use(s));  // ZC must not already be installed
++
++    env = av_rpi_zc_env_alloc(s, pool_env, alloc_buf_fn, free_pool_fn);
++    if (env == NULL)
++        return AVERROR(ENOMEM);
++    // Stash the context fields we are about to overwrite so uninit2 can restore them
++    env->old = (ZcOldCtxVals){
++        .thread_safe_callbacks = s->thread_safe_callbacks,
++        .get_buffer2 = s->get_buffer2,
++        .opaque = s->opaque
++    };
++    s->thread_safe_callbacks = 1;
++    s->get_buffer2 = zc_get_buffer2;
++    s->opaque = env;
++    return 0;
++}
++
++void av_rpi_zc_uninit2(struct AVCodecContext * const s)
++{
++    ZcEnv * env;
++
++    av_assert0(av_rpi_zc_in_use(s));
++    env = s->opaque;
++    // Put back the values init2 saved, then drop the context's ref on the env
++    s->thread_safe_callbacks = env->old.thread_safe_callbacks;
++    s->opaque = env->old.opaque;
++    s->get_buffer2 = env->old.get_buffer2;
++    av_rpi_zc_env_release(env);
++}
++
+diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
+new file mode 100644
+index 0000000000..f00a7c962c
+--- /dev/null
++++ b/libavcodec/rpi_zc.h
+@@ -0,0 +1,228 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef LIBAVCODEC_RPI_ZC_H
++#define LIBAVCODEC_RPI_ZC_H
++
++// Zero-Copy frame code for RPi
++// RPi needs Y/U/V planes to be contiguous for display.  By default
++// ffmpeg will allocate separated planes so a memcpy is needed before
++// display.  This code provides a method of making ffmpeg allocate a single
++// bit of memory for the frame which can then be reference counted until
++// display has finished with it.
++
++// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
++// 0 disables
++// *** This option still in development
++//     Only works if SAO active
++//     Allocates buffers that are twice the required size
++#define RPI_ZC_SAND_8_IN_10_BUF  0
++
++struct AVBufferRef;
++struct AVFrame;
++struct AVCodecContext;
++enum AVPixelFormat;
++
++// "Opaque" pointer to whatever we are using as a buffer reference
++typedef struct AVBufferRef * AVRpiZcRefPtr;
++
++struct AVZcEnv;
++typedef struct AVZcEnv * AVZcEnvPtr;
++
++typedef struct AVRpiZcFrameGeometry
++{
++    unsigned int stride_y;  // Luma stride (bytes)
++    unsigned int height_y;  // Luma height (lines)
++    unsigned int stride_c;  // Chroma stride (bytes)
++    unsigned int height_c;  // Chroma height (lines)
++    unsigned int planes_c;  // Chroma plane count (U, V = 2, interleaved = 1)
++    unsigned int stripes;   // Number of stripes (sand)
++    unsigned int bytes_per_pel;
++    int stripe_is_yc;       // A single stripe is Y then C (false for tall sand)
++
++    int format;                 // Requested format
++    unsigned int video_width;   // Requested width
++    unsigned int video_height;  // Requested height
++} AVRpiZcFrameGeometry;
++
++// Get expected MMAL geometry for a given format, width & height
++AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
++    const int format,
++    const unsigned int video_width, const unsigned int video_height);
++
++//----------------------------------------------------------------------------
++//
++// Calls that extract info from a ZC frame whether internally or externally
++// allocated
++
++// Generate a ZC reference to the buffer(s) in this frame
++// If the buffer doesn't appear to be one allocated by ZC
++// then the behaviour depends on maycopy:
++//   If maycopy=0 then return NULL
++//   If maycopy=1 && the src frame is in a form where we can easily copy
++//     the data, then allocate a new buffer and copy the data into it
++//   Otherwise return NULL
++// If maycopy == 0 then ZC may be NULL
++AVRpiZcRefPtr av_rpi_zc_ref(void * const logging_context, const AVZcEnvPtr zc,
++    const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
++
++// Unreference the buffer refed/allocated by _zc_ref
++// If fr_ref is NULL then this will NOP
++void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
++
++// Get the vc_handle from the frame ref
++// Returns -1 if ref doesn't look valid
++int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
++// Get the vcsm_handle from the frame ref
++// Returns 0 if ref doesn't look valid
++unsigned int av_rpi_zc_vcsm_handle(const AVRpiZcRefPtr fr_ref);
++// Get offset from the start of the memory referenced
++// by the vc_handle to valid data
++int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
++// Length of buffer data
++int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
++// Get the number of bytes allocated from the frame ref
++// Returns 0 if ref doesn't look valid
++int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
++// Geometry this frame was allocated with
++const AVRpiZcFrameGeometry * av_rpi_zc_geometry(const AVRpiZcRefPtr fr_ref);
++
++//----------------------------------------------------------------------------
++//
++// Calls for external frame allocation
++
++// Callbacks registered in av_rpi_zc_init2
++
++// Callback to allocate a buf for a frame
++// The frame itself is generated in the calling code
++//
++// Parameters:
++//   pool_env  value passed to av_rpi_zc_init2
++//   size      size wanted
++//   geo       geometry of the frame to be allocated
++// Returns:
++//   NULL      Alloc failed
++//   ptr       AVBufferRef* of allocated buffer
++//             In most cases av_rpi_zc_buf will be called by this function
++//             and this will be the buf returned by that.
++typedef AVBufferRef * av_rpi_zc_alloc_buf_fn_t(void * pool_env, size_t size,
++                                               const AVRpiZcFrameGeometry * geo);
++
++// Callback once ffmpeg is completely done with this pool
++// Called once all allocated buffers have been derefed and ffmpegs ref to this
++// pool has been dropped
++typedef void av_rpi_zc_free_pool_fn_t(void * pool_env);
++
++// Init ZC into a context
++// Sets opaque, get_buffer2, thread_safe_callbacks
++// Use if you want to allocate your own pools and/or create ZC buffers for
++// all decoders
++// RPI HEVC decoders will allocate appropriate VCSM buffers which can be taken
++// apart by av_rpi_zc_xxx calls without this
++int av_rpi_zc_init2(struct AVCodecContext * const s,
++                    void * pool_env, av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
++
++// Free ZC from a context
++void av_rpi_zc_uninit2(struct AVCodecContext * const s);
++
++// Get minimum pool size in frames - valid by the time the first alloc request
++// occurs.  Takes into account thread requests and DPB sizes derived from SPS
++// rather than just adding a worst case DPB size.
++unsigned int av_rpi_zc_get_decoder_pool_size(const AVZcEnvPtr zc);
++
++typedef struct av_rpi_zc_buf_fn_tab_s {
++    // This AVBuffer is being freed by ffmpeg - return memory
++    // to external pool. Memory may be, but need not be, unmapped.
++    // v is the ptr passed in av_rpi_zc_buf
++    void (* free)(void * v);
++
++    // Return appropriate handles / mappings
++    // v is the ptr passed in av_rpi_zc_buf
++    unsigned int (* vcsm_handle)(void * v);
++    unsigned int (* vc_handle)(void * v);
++    void * (* map_arm)(void * v);
++    unsigned int (* map_vc)(void * v);
++} av_rpi_zc_buf_fn_tab_t;
++
++// Allocate a ZC AVBufferRef and set its callback table
++// Doesn't take a buffer address directly - relies on callbacks to return
++// addresses as they are required.  Mappings need not be generated until
++// the map callbacks are called but they should persist from then until
++// the buffer is freed.
++//
++// Parameters:
++//   numbytes    Size of the buffer
++//   addr_offset Offset to first usable byte of buffer (for alignment)
++//               normally 0
++//   v           Pointer passed to callbacks
++//   fn_tab      Function table
++AVBufferRef * av_rpi_zc_buf(size_t numbytes, int addr_offset, void * v, const av_rpi_zc_buf_fn_tab_t * fn_tab);
++
++// Get v ptr set in av_rpi_zc_buf
++void * av_rpi_zc_buf_v(AVBufferRef * const buf);
++
++//----------------------------------------------------------------------------
++//
++// Mostly internal calls but might possibly be wanted by outside code
++
++void av_rpi_zc_int_env_freep(AVZcEnvPtr * zc);
++AVZcEnvPtr av_rpi_zc_int_env_alloc(void * const logctx);
++void av_rpi_zc_set_decoder_pool_size(const AVZcEnvPtr zc, const unsigned int pool_size);
++
++// Test to see if the context is using zc (checks get_buffer2)
++int av_rpi_zc_in_use(const struct AVCodecContext * const s);
++
++// Get buffer generates placeholders for later alloc
++int av_rpi_zc_get_buffer(const AVZcEnvPtr zc, AVFrame * const frame);
++// Resolve actually does the alloc (noop if already alloced)
++// Set data pointers on a buffer/frame that was copied before the alloc
++// occurred
++#define ZC_RESOLVE_FAIL         0  // return error on invalid
++#define ZC_RESOLVE_ALLOC        1  // alloc as invalid
++#define ZC_RESOLVE_WAIT_VALID   2  // wait for valid
++#define ZC_RESOLVE_ALLOC_VALID  3  // alloc as valid
++int av_rpi_zc_resolve_buffer(AVBufferRef * const buf, const int may_alloc);
++int av_rpi_zc_resolve_frame(AVFrame * const frame, const int may_alloc);
++
++int av_rpi_zc_set_valid_frame(AVFrame * const frame);
++int av_rpi_zc_set_broken_frame(AVFrame * const frame);
++
++
++
++
++AVZcEnvPtr av_rpi_zc_env_alloc(void * logctx,
++                    void * pool_env,
++                    av_rpi_zc_alloc_buf_fn_t * alloc_buf_fn,
++                    av_rpi_zc_free_pool_fn_t * free_pool_fn);
++void av_rpi_zc_env_release(const AVZcEnvPtr zc);
++
++
++#endif
++
+diff --git a/libavcodec/rpi_zc_frames.h b/libavcodec/rpi_zc_frames.h
+new file mode 100644
+index 0000000000..9b7b6536a4
+--- /dev/null
++++ b/libavcodec/rpi_zc_frames.h
+@@ -0,0 +1,142 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox, Ben Avison
++*/
++
++#ifndef RPI_ZC_FRAMES_H
++#define RPI_ZC_FRAMES_H
++
++#define RPI_ONE_BUF 1
++
++#include "rpi_mem.h"  // for GPU_MEM_PTR_T
++#include "libavutil/frame.h"
++
++#if !RPI_ONE_BUF
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {  // built only when RPI_ONE_BUF is 0: one pool buffer per plane
++    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[0]);
++    return p->vc;
++}
++
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {  // as _y but for buf[1]
++    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[1]);
++    return p->vc;
++}
++
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {  // as _y but for buf[2]
++    GPU_MEM_PTR_T *p = av_buffer_pool_buffer_get_opaque(frame->buf[2]);
++    return p->vc;
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {  // copy of buf[0]'s descriptor, returned by value
++    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[0]);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {  // copy of buf[1]'s descriptor
++    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[1]);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {  // copy of buf[2]'s descriptor
++    return *(GPU_MEM_PTR_T *)av_buffer_pool_buffer_get_opaque(frame->buf[2]);
++}
++
++#else
++
++static inline int gpu_is_buf1(const AVFrame * const frame)
++{   // Non-zero when buf[1] is NULL, which this header treats as the single-buffer layout
++    return frame->buf[1] == NULL;
++}
++
++static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
++{   // Single-buffer layout: the descriptor is the opaque attached to buf[0]
++    return av_buffer_get_opaque(frame->buf[0]);
++}
++
++static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
++{   // Per-plane layout: the descriptor is the pool-buffer opaque of buf[n]
++    return av_buffer_pool_buffer_get_opaque(frame->buf[n]);
++}
++
++static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
++{   // VC address of plane n: descriptor's vc base plus data[n]'s offset from its arm base
++    const GPU_MEM_PTR_T * const desc = !gpu_is_buf1(frame) ? gpu_buf3_gmem(frame, n) : gpu_buf1_gmem(frame);
++    return (frame->data[n] - desc->arm) + desc->vc;
++}
++
++
++static inline uint32_t get_vc_address_y(const AVFrame * const frame) {  // VC address of data[0]
++    return get_vc_address3(frame, 0);
++}
++
++static inline uint32_t get_vc_address_u(const AVFrame * const frame) {  // VC address of data[1]
++    return get_vc_address3(frame, 1);
++}
++
++static inline uint32_t get_vc_address_v(const AVFrame * const frame) {  // VC address of data[2]
++    return get_vc_address3(frame, 2);
++}
++
++#if 0
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
++    if (gpu_is_buf1(frame))
++    {
++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++        g.numbytes = frame->data[1] - frame->data[0];
++        return g;
++    }
++    else
++        return *gpu_buf3_gmem(frame, 0);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
++    if (gpu_is_buf1(frame))
++    {
++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++        g.arm += frame->data[1] - frame->data[0];
++        g.vc += frame->data[1] - frame->data[0];
++        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
++        return g;
++    }
++    else
++        return *gpu_buf3_gmem(frame, 1);
++}
++
++static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
++    if (gpu_is_buf1(frame))
++    {
++        GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
++        g.arm += frame->data[2] - frame->data[0];
++        g.vc += frame->data[2] - frame->data[0];
++        g.numbytes = frame->data[2] - frame->data[1];  // chroma size
++        return g;
++    }
++    else
++        return *gpu_buf3_gmem(frame, 2);
++}
++#endif
++#endif
++
++#endif
+diff --git a/libavcodec/rpivid_hevc.c b/libavcodec/rpivid_hevc.c
+new file mode 100644
+index 0000000000..85c5b46d75
+--- /dev/null
++++ b/libavcodec/rpivid_hevc.c
+@@ -0,0 +1,2128 @@
++// FFMPEG HEVC decoder hardware accelerator
++// Andrew Holme, Argon Design Ltd
++// Copyright (c) June 2017 Raspberry Pi Ltd
++
++#include <stdio.h>
++#include <fcntl.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <unistd.h>
++#include <sys/mman.h>
++
++#include "fftools/ffmpeg.h"
++#include "libavutil/avassert.h"
++#include "libavutil/imgutils.h"
++#include "avcodec.h"
++#include "hwconfig.h"
++#include "decode.h"
++
++#include "hevc.h"
++#include "hevcdec.h"
++#include "rpi_zc.h"
++#include "rpi_mem.h"
++#include "rpi_zc_frames.h"
++#include "rpi_mailbox.h"
++
++
++#define OPT_PHASE_TIMING 0      // Generate stats for phase usage
++
++#define OPT_EMU 0
++
++#define TRACE_DEV 0
++#define TRACE_ENTRY 0
++
++#define NUM_SCALING_FACTORS 4064
++
++#define AXI_BASE64 0
++
++#define PROB_BACKUP ((20<<12) + (20<<6) + (0<<0))
++#define PROB_RELOAD ((20<<12) + (20<<0) + (0<<6))
++
++#define RPIVID_COL_PICS 17                 // 16 ref & current
++
++#define RPIVID_BITBUFS          2          // Bit + Cmd bufs (phase 0 & 1)
++#define RPIVID_BITBUF_SIZE      (4 << 20)  // Bit + Cmd buf size
++
++#define RPIVID_COEFFBUFS        3          // PU + Coeff bufs (phase 1 & 2)
++#define RPIVID_COEFFBUF_SIZE    (16 << 20) // PU + Coeff buf size
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// Register offsets
++
++#define RPI_SPS0         0
++#define RPI_SPS1         4
++#define RPI_PPS          8
++#define RPI_SLICE        12
++#define RPI_TILESTART    16
++#define RPI_TILEEND      20
++#define RPI_SLICESTART   24
++#define RPI_MODE         28
++#define RPI_LEFT0        32
++#define RPI_LEFT1        36
++#define RPI_LEFT2        40
++#define RPI_LEFT3        44
++#define RPI_QP           48
++#define RPI_CONTROL      52
++#define RPI_STATUS       56
++#define RPI_VERSION      60
++#define RPI_BFBASE       64
++#define RPI_BFNUM        68
++#define RPI_BFCONTROL    72
++#define RPI_BFSTATUS     76
++#define RPI_PUWBASE      80
++#define RPI_PUWSTRIDE    84
++#define RPI_COEFFWBASE   88
++#define RPI_COEFFWSTRIDE 92
++#define RPI_SLICECMDS    96
++#define RPI_BEGINTILEEND 100
++#define RPI_TRANSFER     104
++#define RPI_CFBASE       108
++#define RPI_CFNUM        112
++#define RPI_CFSTATUS     116
++
++#define RPI_PURBASE       0x8000
++#define RPI_PURSTRIDE     0x8004
++#define RPI_COEFFRBASE    0x8008
++#define RPI_COEFFRSTRIDE  0x800C
++#define RPI_NUMROWS       0x8010
++#define RPI_CONFIG2       0x8014
++#define RPI_OUTYBASE      0x8018
++#define RPI_OUTYSTRIDE    0x801C
++#define RPI_OUTCBASE      0x8020
++#define RPI_OUTCSTRIDE    0x8024
++#define RPI_STATUS2       0x8028
++#define RPI_FRAMESIZE     0x802C
++#define RPI_MVBASE        0x8030
++#define RPI_MVSTRIDE      0x8034
++#define RPI_COLBASE       0x8038
++#define RPI_COLSTRIDE     0x803C
++#define RPI_CURRPOC       0x8040
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Unused but left here to illustrate the differences between FFmpeg's prob
++// structure and the rpivid one
++
++struct FFM_PROB {
++    uint8_t  sao_merge_flag                   [ 1];
++    uint8_t  sao_type_idx                     [ 1];
++    uint8_t  split_coding_unit_flag           [ 3];
++    uint8_t  cu_transquant_bypass_flag        [ 1];
++    uint8_t  skip_flag                        [ 3];
++    uint8_t  cu_qp_delta                      [ 3];
++    uint8_t  pred_mode_flag                   [ 1];
++    uint8_t  part_mode                        [ 4];
++    uint8_t  prev_intra_luma_pred_flag        [ 1];
++    uint8_t  intra_chroma_pred_mode           [ 2];
++    uint8_t  merge_flag                       [ 1];
++    uint8_t  merge_idx                        [ 1];
++    uint8_t  inter_pred_idc                   [ 5];
++    uint8_t  ref_idx_l0                       [ 2];
++    uint8_t  ref_idx_l1                       [ 2];
++    uint8_t  abs_mvd_greater0_flag            [ 2];
++    uint8_t  abs_mvd_greater1_flag            [ 2];
++    uint8_t  mvp_lx_flag                      [ 1];
++    uint8_t  no_residual_data_flag            [ 1];
++    uint8_t  split_transform_flag             [ 3];
++    uint8_t  cbf_luma                         [ 2];
++    uint8_t  cbf_cb_cr                        [ 4];
++    uint8_t  transform_skip_flag/*[][]*/      [ 2];
++    uint8_t  explicit_rdpcm_flag/*[][]*/      [ 2];
++    uint8_t  explicit_rdpcm_dir_flag/*[][]*/  [ 2];
++    uint8_t  last_significant_coeff_x_prefix  [18];
++    uint8_t  last_significant_coeff_y_prefix  [18];
++    uint8_t  significant_coeff_group_flag     [ 4];
++    uint8_t  significant_coeff_flag           [44];
++    uint8_t  coeff_abs_level_greater1_flag    [24];
++    uint8_t  coeff_abs_level_greater2_flag    [ 6];
++    uint8_t  log2_res_scale_abs               [ 8];
++    uint8_t  res_scale_sign_flag              [ 2];
++    uint8_t  cu_chroma_qp_offset_flag         [ 1];
++    uint8_t  cu_chroma_qp_offset_idx          [ 1];
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_PROB {  // packed byte arrays: 154 used + 2 unused = 156 bytes (cf. RPI_PROB_VALS, RPI_PROB_ARRAY_SIZE)
++    uint8_t  SAO_MERGE_FLAG             [ 1];
++    uint8_t  SAO_TYPE_IDX               [ 1];
++    uint8_t  SPLIT_FLAG                 [ 3];
++    uint8_t  CU_SKIP_FLAG               [ 3];
++    uint8_t  CU_TRANSQUANT_BYPASS_FLAG  [ 1];
++    uint8_t  PRED_MODE                  [ 1];
++    uint8_t  PART_SIZE                  [ 4];
++    uint8_t  INTRA_PRED_MODE            [ 1];
++    uint8_t  CHROMA_PRED_MODE           [ 1];
++    uint8_t  MERGE_FLAG_EXT             [ 1];
++    uint8_t  MERGE_IDX_EXT              [ 1];
++    uint8_t  INTER_DIR                  [ 5];
++    uint8_t  REF_PIC                    [ 2];
++    uint8_t  MVP_IDX                    [ 1];
++    uint8_t  MVD                        [ 2];
++    uint8_t  QT_ROOT_CBF                [ 1];
++    uint8_t  TRANS_SUBDIV_FLAG          [ 3];
++    uint8_t  QT_CBF                     [ 6];
++    uint8_t  DQP                        [ 2];
++    uint8_t  ONE_FLAG                   [24];
++    uint8_t  LASTX                      [18];
++    uint8_t  LASTY                      [18];
++    uint8_t  SIG_CG_FLAG                [ 4];
++    uint8_t  ABS_FLAG                   [ 6];
++    uint8_t  TRANSFORMSKIP_FLAG         [ 2];
++    uint8_t  SIG_FLAG                   [42];
++    uint8_t  SIG_FLAG_unused            [ 2];  // brings the total from 154 to 156 bytes
++} __attribute__((packed));
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_CMD {
++    uint32_t addr;
++    uint32_t data;
++} __attribute__((packed));
++
++struct RPI_BIT {
++    int         cmd;
++    const void *ptr;
++    int         len;
++};
++
++//////////////////////////////////////////////////////////////////////////////
++
++struct RPI_T;
++
++// Actual addressability is 38bits but we can only alloc in the bottom 32
++// currently - when passed to rpivid h/w the address is always >> 6 so will
++// fit in 32 bit there
++// At some point we may want to make this uint64_t
++typedef uint32_t vid_vc_addr_t;
++
++typedef enum rpivid_decode_state_e {
++    RPIVID_DECODE_NEW = 0,
++    RPIVID_DECODE_START,
++    RPIVID_DECODE_SLICE,
++    RPIVID_DECODE_END,
++} rpivid_decode_state_t;
++
++#define RPI_PROB_VALS 154U
++#define RPI_PROB_ARRAY_SIZE ((154 + 3) & ~3)
++
++typedef struct dec_env_s {
++    const AVCodecContext * avctx;
++
++    rpivid_decode_state_t state;
++    unsigned int    decode_order;
++
++    int             phase_no;           // Current phase (i.e. the last one we waited for)
++    struct dec_env_s * phase_wait_q_next;
++    sem_t           phase_wait;
++
++    struct RPI_BIT *bit_fifo;
++    struct RPI_CMD *cmd_fifo;
++    unsigned int    bit_len, bit_max;
++    unsigned int    cmd_len, cmd_max;
++    unsigned int    num_slice_msgs;
++    unsigned int    PicWidthInCtbsY;
++    unsigned int    PicHeightInCtbsY;
++    unsigned int    dpbno_col;
++    uint32_t        reg_slicestart;
++    unsigned int    wpp_entry_x;
++    unsigned int    wpp_entry_y;
++
++    const uint8_t * nal_buffer;
++    size_t          nal_size;
++
++    uint16_t        slice_msgs[2*HEVC_MAX_REFS*8+3];
++    uint8_t         scaling_factors[NUM_SCALING_FACTORS];
++//    unsigned int    RefPicList[2][HEVC_MAX_REFS];
++} dec_env_t;
++
++#define RPIVID_PHASES 3
++#define RPIVID_PHASE_NEW (RPIVID_PHASES) // Phase before we have inced decode order
++#define RPIVID_PHASE_START (-1)          // Phase after we have inced decode_order
++
++#if OPT_PHASE_TIMING
++static const unsigned int time_thresholds[8] = {
++    10, 15, 20, 30, 45, 60, 75, 90
++};
++#endif
++
++typedef struct phase_wait_env_s {
++    unsigned int    last_order;
++    dec_env_t *     q;
++#if OPT_PHASE_TIMING
++    uint64_t phase_time;
++    uint64_t max_phase_time;
++    uint64_t time_in_phase;
++    uint64_t time_out_phase;
++    unsigned int max_time_decode_order;
++    unsigned int time_bins[9];
++    unsigned int time_bins3[9];
++    unsigned int time_bins5[9];
++    uint64_t time_stash[16];
++    unsigned int i3;
++#endif
++} phase_wait_env_t;                      // Single linked list of threads waiting for this phase
++
++typedef struct RPI_T {
++    atomic_int      ref_count;
++    sem_t           ref_zero;
++
++    dec_env_t **    dec_envs;
++    AVZcEnvPtr      zc;
++
++    pthread_mutex_t phase_lock;
++    phase_wait_env_t phase_reqs[RPIVID_PHASES];
++
++    volatile uint32_t * regs;
++    volatile uint32_t * ints;
++
++    GPU_MEM_PTR_T   gcolbuf;
++    unsigned int    col_stride;
++    size_t          col_picsize;
++
++    unsigned int    bitbuf_no;
++    sem_t           bitbuf_sem;
++    GPU_MEM_PTR_T   gbitbufs[RPIVID_BITBUFS];
++
++    unsigned int    max_pu_msgs;
++    unsigned int    coeffbuf_no;
++    sem_t           coeffbuf_sem;
++    GPU_MEM_PTR_T   gcoeffbufs[RPIVID_COEFFBUFS];
++
++    unsigned int    decode_order;
++    int             mbox_fd;
++    int             gpu_init_type;
++} RPI_T;
++
++#if OPT_PHASE_TIMING
++static uint64_t tus64(void)
++{   // CLOCK_MONOTONIC reading expressed in microseconds
++    struct timespec now;
++    clock_gettime(CLOCK_MONOTONIC, &now);
++    return now.tv_nsec / 1000 + (uint64_t)now.tv_sec * 1000000;
++}
++#endif
++
++static inline unsigned int rnd64(unsigned int x)
++{   // Round x up to the next multiple of 64
++    return (x + 63u) & ~63u;
++}
++
++static inline int rpi_sem_wait(sem_t * const sem)
++{   // sem_wait that is retried for as long as it fails with EINTR
++    int rv;
++    do rv = sem_wait(sem);
++    while (rv != 0 && errno == EINTR);
++    return rv;
++}
++
++//============================================================================
++
++#define REGS_NAME "/dev/rpivid-hevcmem"
++#define REGS_SIZE 0x10000
++#define INTS_NAME "/dev/rpivid-intcmem"
++#define INTS_SIZE 0x10000  // 4 is probably enough but we are going to alloc a page anyway
++
++static volatile uint32_t * map_dev(AVCodecContext * const avctx, const char * const dev_name, size_t size)
++{   // Map `size` bytes of device node dev_name read/write; returns NULL (after a warning) on failure
++    void *gpio_map;
++    int  mem_fd;
++
++    /* open the requested device node */
++    if ((mem_fd = open(dev_name, O_RDWR|O_SYNC) ) < 0) {
++        av_log(avctx, AV_LOG_WARNING, "can't open %s\n", dev_name);
++        return NULL;
++    }
++
++    // Now map it (shared mapping, from offset 0)
++    gpio_map = mmap(
++       NULL,
++       size,
++       PROT_READ|PROT_WRITE,
++       MAP_SHARED,
++       mem_fd,
++       0
++    );
++
++    close(mem_fd);  // No longer need the FD
++
++    if (gpio_map == MAP_FAILED) {
++        av_log(avctx, AV_LOG_WARNING, "GPIO mapping failed\n");  // newline was missing
++        return NULL;
++    }
++
++    return (volatile uint32_t *)gpio_map;
++}
++
++static void unmap_devp(volatile uint32_t ** const p_gpio_map, size_t size)
++{   // Unmap *p_gpio_map if it is set, leaving NULL behind; does nothing when already NULL
++    volatile uint32_t * const mapping = *p_gpio_map;
++    if (mapping == NULL)
++        return;
++    *p_gpio_map = NULL;
++    munmap((void *)mapping, size);
++}
++
++#define MANGLE(x) ((x) &~0xc0000000)          // ** If x is ever a 64 bit thing this will need fixing!
++#define MANGLE64(x) (uint32_t)(MANGLE(x)>>6)
++
++static inline void apb_write_vc_addr(const RPI_T *const rpi, const uint32_t addr, const vid_vc_addr_t data)
++{   // Write a VC address to the register at byte-offset addr, in the form MANGLE64 produces
++    const uint32_t reg_val = MANGLE64(data);
++#if TRACE_DEV
++    printf("W %x %08x\n", addr, reg_val);
++#endif
++    rpi->regs[addr >> 2] = reg_val;
++}
++
++static inline void apb_write_vc_len(const RPI_T *const rpi, const uint32_t addr, const unsigned int data)
++{   // Write a length to the register at byte-offset addr as data >> 6
++    const unsigned int len64 = data >> 6;  // ?? rnd64 - but not currently needed
++#if TRACE_DEV
++    printf("W %x %08x\n", addr, len64);
++#endif
++    rpi->regs[addr >> 2] = len64;
++}
++
++static inline void apb_write(const RPI_T * const rpi, const uint32_t addr, const uint32_t data)
++{   // Store data, unmodified, in the 32-bit register at byte-offset addr
++    volatile uint32_t * const reg = rpi->regs + (addr >> 2);
++#if TRACE_DEV
++    printf("W %x %08x\n", addr, data);
++#endif
++    *reg = data;
++}
++
++static inline uint32_t apb_read(const RPI_T * const rpi, const uint32_t addr)
++{   // Read once from the 32-bit register at byte-offset addr and return that value
++    const uint32_t reg_val = *(rpi->regs + (addr >> 2));
++#if TRACE_DEV
++    printf("R %x (=%x)\n", addr, reg_val);
++#endif
++    return reg_val;
++}
++
++#define ARG_IC_ICTRL_ACTIVE1_INT_SET                   0x00000001
++#define ARG_IC_ICTRL_ACTIVE1_EDGE_SET                  0x00000002
++#define ARG_IC_ICTRL_ACTIVE1_EN_SET                    0x00000004
++#define ARG_IC_ICTRL_ACTIVE1_STATUS_SET                0x00000008
++#define ARG_IC_ICTRL_ACTIVE2_INT_SET                   0x00000010
++#define ARG_IC_ICTRL_ACTIVE2_EDGE_SET                  0x00000020
++#define ARG_IC_ICTRL_ACTIVE2_EN_SET                    0x00000040
++#define ARG_IC_ICTRL_ACTIVE2_STATUS_SET                0x00000080
++
++static inline void int_wait(const RPI_T * const rpi, const unsigned int phase)
++{   // Poll rpi->ints[0] at 1000us intervals until the awaited INT bit is set, then write the value back with one INT bit cleared
++    const uint32_t mask_reset = phase == 1 ? ~ARG_IC_ICTRL_ACTIVE2_INT_SET : ~ARG_IC_ICTRL_ACTIVE1_INT_SET;  // bit dropped from the write-back
++    const uint32_t mask_done = phase == 1 ? ARG_IC_ICTRL_ACTIVE1_INT_SET : ARG_IC_ICTRL_ACTIVE2_INT_SET;  // bit awaited
++    uint32_t ival;
++    while (((ival = rpi->ints[0]) & mask_done) == 0) {
++        usleep(1000);
++    }
++    rpi->ints[0] = ival & mask_reset;  // phase 1 waits on ACTIVE1_INT and clears ACTIVE2_INT in the written value; other phases the reverse
++}
++
++#if TRACE_DEV && 0  // "&& 0" keeps these dump helpers compiled out even when TRACE_DEV is set
++static void apb_dump_regs(const RPI_T * const rpi, uint16_t addr, int num) {
++    int i;
++    // Print num entries of rpi->regs starting at byte offset addr, four per line, each line led by 0x7eb00000 + offset
++    for (i=0; i<num; i++)
++    {
++        if ((i%4)==0)
++          printf("%08x: ", 0x7eb00000 + addr + 4*i);
++
++        printf("%08x", rpi->regs[(addr>>2)+i]);
++
++        if ((i%4)==3 || i+1 == num)
++            printf("\n");
++        else
++            printf(" ");
++    }
++}
++
++static void axi_dump(const dec_env_t * const de, uint64_t addr, uint32_t size) {
++    int i;
++    // Print size/4 32-bit words read from de->gbuf.arm at byte offset addr, four per line, each line led by MANGLE(de->gbuf.vc) + offset
++    for (i=0; i<size>>2; i++)
++    {
++        if ((i%4)==0)
++            printf("%08x: ", MANGLE(de->gbuf.vc) + (uint32_t)addr + 4*i);
++
++        printf("%08x", ((uint32_t*)de->gbuf.arm)[(addr>>2)+i]);
++
++        if ((i%4)==3 || i+1 == size>>2)
++            printf("\n");
++        else
++            printf(" ");
++    }
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////////
++
++static inline size_t round_up_size(const size_t x)
++{
++    /* Admit no size < 256: such inputs all use n = 8; otherwise n = av_log2(x) - 1 (presumably floor(log2(x)) - 1 - confirm) */
++    const unsigned int n = x < 256 ? 8 : av_log2(x) - 1;
++    // Yields 3 << n, or 4 << n once x has reached 3 << n. NOTE(review): x < 256 gives 768 but x == 256 would give 384 - confirm intended
++    return x >= (3 << n) ? 4 << n : (3 << n);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Scaling factors
++
++static void expand_scaling_list(  // Expand one scaling list from src0 into dst0; the output size depends on sizeID
++    const unsigned int sizeID,
++    const unsigned int matrixID,  // not referenced in this function
++    uint8_t * const dst0,
++    const uint8_t * const src0,
++    uint8_t dc)  // overwrites dst0[0] for sizeID >= 2 only
++{
++    switch (sizeID) {
++        case 0:  // straight copy of 16 bytes
++            memcpy(dst0, src0, 16);
++            break;
++        case 1:  // straight copy of 64 bytes
++            memcpy(dst0, src0, 64);
++            break;
++        case 2:  // 8x8 source -> 16x16 output: every source value covers a 2x2 block
++        {
++            uint8_t * d = dst0;
++            for (unsigned int y=0; y != 16; y++) {
++                const uint8_t * s = src0 + (y >> 1) * 8;  // each source row feeds two output rows
++                for (unsigned int x = 0; x != 8; ++x) {
++                    *d++ = *s;
++                    *d++ = *s++;
++                }
++            }
++            dst0[0] = dc;
++            break;
++        }
++        default:  // any other sizeID: 8x8 source -> 32x32 output, every source value covers a 4x4 block
++        {
++            uint8_t * d = dst0;
++            for (unsigned int y=0; y != 32; y++) {
++                const uint8_t * s = src0 + (y >> 2) * 8;  // each source row feeds four output rows
++                for (unsigned int x = 0; x != 8; ++x) {
++                    *d++ = *s;
++                    *d++ = *s;
++                    *d++ = *s;
++                    *d++ = *s++;
++                }
++            }
++            dst0[0] = dc;
++            break;
++        }
++    }
++}
++
++static void populate_scaling_factors(dec_env_t * const de, const HEVCContext * const s) {
++    // Offsets into de->scaling_factors, indexed [sizeID][matrixID]; neighbours are one expanded matrix apart (0x10, 0x40, 0x100, 0x400)
++    static const uint32_t scaling_factor_offsets[4][6] = {
++        // MID0    MID1    MID2    MID3    MID4    MID5
++        {0x0000, 0x0010, 0x0020, 0x0030, 0x0040, 0x0050},   // SID0 (4x4)
++        {0x0060, 0x00A0, 0x00E0, 0x0120, 0x0160, 0x01A0},   // SID1 (8x8)
++        {0x01E0, 0x02E0, 0x03E0, 0x04E0, 0x05E0, 0x06E0},   // SID2 (16x16)
++        {0x07E0,      0,      0, 0x0BE0,      0,      0}};  // SID3 (32x32)
++    // Use the PPS scaling list when the PPS carries one, otherwise the SPS list
++    // ffmpeg places SID3,MID1 where matrixID 3 normally is
++    const ScalingList * const sl =
++        s->ps.pps->scaling_list_data_present_flag ? &s->ps.pps->scaling_list
++                                                  : &s->ps.sps->scaling_list;
++    unsigned int mid;
++
++    for (mid=0; mid<6; mid++)  // sizeID 0: six matrices, no DC value
++        expand_scaling_list(0, mid,
++            de->scaling_factors + scaling_factor_offsets[0][mid],
++            sl->sl[0][mid], 0);
++    for (mid=0; mid<6; mid++)  // sizeID 1: six matrices, no DC value
++        expand_scaling_list(1, mid,
++            de->scaling_factors + scaling_factor_offsets[1][mid],
++            sl->sl[1][mid], 0);
++    for (mid=0; mid<6; mid++)  // sizeID 2: six matrices, DC taken from sl_dc[0]
++        expand_scaling_list(2, mid,
++            de->scaling_factors + scaling_factor_offsets[2][mid],
++            sl->sl[2][mid],
++            sl->sl_dc[0][mid]);
++    // second scaling matrix for 32x32 is at matrixID 3 not 1 in ffmpeg
++    for (mid=0; mid<6; mid += 3)  // sizeID 3: only matrixID 0 and 3, DC taken from sl_dc[1]
++        expand_scaling_list(3, mid,
++            de->scaling_factors + scaling_factor_offsets[3][mid],
++            sl->sl[3][mid],
++            sl->sl_dc[1][mid]);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Probabilities: prob_init holds 3 rows of 156 initialisation bytes; WriteProb picks one row per slice
++
++static const uint8_t prob_init[3][156] = {
++	{
++		 153, 200, 139, 141, 157, 154, 154, 154,
++		 154, 154, 184, 154, 154, 154, 184,  63,
++		 154, 154, 154, 154, 154, 154, 154, 154,
++		 154, 154, 154, 154, 154, 153, 138, 138,
++		 111, 141,  94, 138, 182, 154, 154, 154,
++		 140,  92, 137, 138, 140, 152, 138, 139,
++		 153,  74, 149,  92, 139, 107, 122, 152,
++		 140, 179, 166, 182, 140, 227, 122, 197,
++		 110, 110, 124, 125, 140, 153, 125, 127,
++		 140, 109, 111, 143, 127, 111,  79, 108,
++		 123,  63, 110, 110, 124, 125, 140, 153,
++		 125, 127, 140, 109, 111, 143, 127, 111,
++		  79, 108, 123,  63,  91, 171, 134, 141,
++		 138, 153, 136, 167, 152, 152, 139, 139,
++		 111, 111, 125, 110, 110,  94, 124, 108,
++		 124, 107, 125, 141, 179, 153, 125, 107,
++		 125, 141, 179, 153, 125, 107, 125, 141,
++		 179, 153, 125, 140, 139, 182, 182, 152,
++		 136, 152, 136, 153, 136, 139, 111, 136,
++		 139, 111,   0,   0,	},
++	{
++		 153, 185, 107, 139, 126, 197, 185, 201,
++		 154, 149, 154, 139, 154, 154, 154, 152,
++		 110, 122,  95,  79,  63,  31,  31, 153,
++		 153, 168, 140, 198,  79, 124, 138,  94,
++		 153, 111, 149, 107, 167, 154, 154, 154,
++		 154, 196, 196, 167, 154, 152, 167, 182,
++		 182, 134, 149, 136, 153, 121, 136, 137,
++		 169, 194, 166, 167, 154, 167, 137, 182,
++		 125, 110,  94, 110,  95,  79, 125, 111,
++		 110,  78, 110, 111, 111,  95,  94, 108,
++		 123, 108, 125, 110,  94, 110,  95,  79,
++		 125, 111, 110,  78, 110, 111, 111,  95,
++		  94, 108, 123, 108, 121, 140,  61, 154,
++		 107, 167,  91, 122, 107, 167, 139, 139,
++		 155, 154, 139, 153, 139, 123, 123,  63,
++		 153, 166, 183, 140, 136, 153, 154, 166,
++		 183, 140, 136, 153, 154, 166, 183, 140,
++		 136, 153, 154, 170, 153, 123, 123, 107,
++		 121, 107, 121, 167, 151, 183, 140, 151,
++		 183, 140,   0,   0,	},
++	{
++		 153, 160, 107, 139, 126, 197, 185, 201,
++		 154, 134, 154, 139, 154, 154, 183, 152,
++		 154, 137,  95,  79,  63,  31,  31, 153,
++		 153, 168, 169, 198,  79, 224, 167, 122,
++		 153, 111, 149,  92, 167, 154, 154, 154,
++		 154, 196, 167, 167, 154, 152, 167, 182,
++		 182, 134, 149, 136, 153, 121, 136, 122,
++		 169, 208, 166, 167, 154, 152, 167, 182,
++		 125, 110, 124, 110,  95,  94, 125, 111,
++		 111,  79, 125, 126, 111, 111,  79, 108,
++		 123,  93, 125, 110, 124, 110,  95,  94,
++		 125, 111, 111,  79, 125, 126, 111, 111,
++		  79, 108, 123,  93, 121, 140,  61, 154,
++		 107, 167,  91, 107, 107, 167, 139, 139,
++		 170, 154, 139, 153, 139, 123, 123,  63,
++		 124, 166, 183, 140, 136, 153, 154, 166,
++		 183, 140, 136, 153, 154, 166, 183, 140,
++		 136, 153, 154, 170, 153, 138, 138, 122,
++		 121, 122, 121, 167, 151, 183, 140, 151,
++		 183, 140,   0,   0,	},
++};
++
++
++//////////////////////////////////////////////////////////////////////////////
++// Phase 1 command and bit FIFOs
++
++// ???? uint16_t addr - put in uint32_t
++static int p1_apb_write(dec_env_t * const de, const uint16_t addr, const uint32_t data) {
++    if (de->cmd_len==de->cmd_max)  // FIFO full: double cmd_max and grow the buffer; the realloc result is asserted non-NULL
++        av_assert0(de->cmd_fifo = realloc(de->cmd_fifo, (de->cmd_max*=2)*sizeof(struct RPI_CMD)));
++
++#if TRACE_DEV
++    printf("[%02x] %x %x\n", de->cmd_len, addr, data);
++#endif
++    // Append the (addr, data) pair and return the index it was stored at
++    de->cmd_fifo[de->cmd_len].addr = addr;
++    de->cmd_fifo[de->cmd_len].data = data;
++    return de->cmd_len++;
++}
++
++static void p1_axi_write(dec_env_t * const de, const uint32_t len, const void * const ptr, const int cmd_idx) {
++    if (de->bit_len==de->bit_max)  // FIFO full: double bit_max and grow the buffer; the realloc result is asserted non-NULL
++        av_assert0(de->bit_fifo = realloc(de->bit_fifo, (de->bit_max*=2)*sizeof(struct RPI_BIT)));
++    de->bit_fifo[de->bit_len].cmd = cmd_idx;  // presumably an index returned by p1_apb_write - confirm against callers
++    de->bit_fifo[de->bit_len].ptr = ptr;      // the pointer is stored as-is; the data it refers to is not copied here
++    de->bit_fifo[de->bit_len].len = len;
++    de->bit_len++;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write probability and scaling factor memories
++
++#if 0  // Compiled-out variant: queues the bytes of de->probabilities, four per write, for addresses 0x1000 onwards
++static void WriteProb(dec_env_t * const de) {
++    int i;
++    const uint8_t *p = (uint8_t *) &de->probabilities;
++    for (i=0; i<sizeof(struct RPI_PROB); i+=4, p+=4)
++        p1_apb_write(de, 0x1000+i, p[0] + (p[1]<<8) + (p[2]<<16) + (p[3]<<24));
++}
++#endif
++
++static void WriteProb(dec_env_t * const de, const HEVCContext * const s) {
++    uint8_t dst[RPI_PROB_ARRAY_SIZE];
++    // Choose the prob_init row from cabac_init_flag and slice_type: slice_type + 1 if the flag is set on a non-I slice, else 2 - slice_type
++    const unsigned int init_type = (s->sh.cabac_init_flag && s->sh.slice_type != HEVC_SLICE_I) ?
++        s->sh.slice_type + 1 : 2 - s->sh.slice_type;
++    const uint8_t * p = prob_init[init_type];
++    const int q = av_clip(s->sh.slice_qp, 0, 51);
++    unsigned int i;
++    // Derive one byte per value from its init byte (high nibble -> slope m, low nibble -> offset n) and the clipped slice QP
++    for (i = 0; i < RPI_PROB_VALS; i++) {
++        int init_value = p[i];
++        int m = (init_value >> 4) * 5 - 45;
++        int n = ((init_value & 15) << 3) - 16;
++        int pre = 2 * (((m * q) >> 4) + n) - 127;
++        // XOR with the sign mask (flips all bits of a negative value, assuming >> on int is arithmetic), then cap at 124 + bit 0
++        pre ^= pre >> 31;
++        if (pre > 124)
++            pre = 124 + (pre & 1);
++        dst[i] = pre;
++    }
++    for (i = RPI_PROB_VALS; i != RPI_PROB_ARRAY_SIZE; ++i) {  // zero the remainder of the array
++        dst[i] = 0;
++    }
++    // Queue the array four bytes per write (lowest index in the low byte) for addresses 0x1000 onwards
++    for (i=0; i < RPI_PROB_ARRAY_SIZE; i+=4)
++        p1_apb_write(de, 0x1000+i, dst[i] + (dst[i+1]<<8) + (dst[i+2]<<16) + (dst[i+3]<<24));
++
++}
++
++
++static void WriteScalingFactors(dec_env_t * const de) {
++    int i;  // Queue de->scaling_factors four bytes per write (lowest index in the low byte) for addresses 0x2000 onwards
++    const uint8_t *p = (uint8_t *) de->scaling_factors;
++    for (i=0; i<NUM_SCALING_FACTORS; i+=4, p+=4)  // p[3] is widened first: a promoted int >= 128 shifted by 24 overflows (undefined behaviour)
++        p1_apb_write(de, 0x2000+i, p[0] + (p[1]<<8) + (p[2]<<16) + ((uint32_t)p[3]<<24));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static int ctb_to_tile (unsigned int ctb, unsigned int *bd, int num) {
++    int i;  // Returns the index of the tile whose [bd[i], bd[i+1]) range holds ctb; the scan stops at num so bd[] is never read past bd[num-1]
++    for (i=1; i<num && ctb >= bd[i]; i++); // bd[] has num+1 elements; bd[0]=0; see hevc_ps.c
++    return i-1;
++}
++
++static int ctb_to_slice_w_h (unsigned int ctb, int ctb_size, int width, unsigned int *bd, int num) {
++    // Full ctb_size unless ctb is at or beyond bd[num-1] and width is not a multiple of ctb_size; then the remainder
++    const int before_last = ctb < bd[num-1];
++    return (before_last || width % ctb_size == 0) ? ctb_size : width % ctb_size;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Handle PU and COEFF stream overflow
++
++
++// Returns:
++// -2 Other error
++// -1 Out of coeff space
++//  0  OK
++//  1  Out of PU space
++
++static int check_status(const RPI_T * const rpi, dec_env_t * const de) {
++    uint32_t status;  // de is not referenced in this function
++
++    // this is the definition of successful completion of phase 1
++    // it assures that status register is zero and all blocks in each tile have completed
++    if (apb_read(rpi, RPI_CFSTATUS) == apb_read(rpi, RPI_CFNUM))
++        return 0;
++    // Not complete: classify the failure from the RPI_STATUS register
++    status = apb_read(rpi, RPI_STATUS);
++
++    if ((status & 8) != 0)  // bit 3 set -> -1 (out of coeff space, checked first)
++        return -1;
++
++    if ((status & 0x10) != 0)  // bit 4 set -> 1 (out of PU space)
++        return 1;
++
++    return -2;  // neither bit set -> other error
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Write STATUS register with expected end CTU address of previous slice
++
++static void end_previous_slice(dec_env_t * const de, const HEVCContext * const s, const int ctb_addr_ts) {
++    const HEVCPPS * const pps = s->ps.pps;  // NOTE(review): index ctb_addr_ts-1 is read below, so ctb_addr_ts is presumably >= 1 - confirm at call sites
++    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;  // column of the CTB just before ctb_addr_ts (ts order)
++    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;  // row of that CTB
++    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));  // queue: bit 0 set, column from bit 5, row from bit 18
++}
++
++static void wpp_pause(dec_env_t * const de, int ctb_row) {  // Queue the four register writes below for CTB row ctb_row
++    p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + 0x25);
++    p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1 ? 0x70000 : 0x30000);  // 0x70000 only on the last CTB row
++    p1_apb_write(de, RPI_CONTROL, (ctb_row<<16) + 2);
++}
++
++static void wpp_end_previous_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
++    const HEVCPPS *pps = s->ps.pps;
++    int new_x = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;  // column / row of the new slice's first CTB
++    int new_y = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++    int last_x = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] % de->PicWidthInCtbsY;  // column / row of the CTB just before ctb_addr_ts
++    int last_y = pps->ctb_addr_ts_to_rs[ctb_addr_ts-1] / de->PicWidthInCtbsY;
++    if (de->wpp_entry_x<2 && (de->wpp_entry_y<new_y || new_x>2) && de->PicWidthInCtbsY>2)  // uses the entry point saved by wpp_entry_point
++        wpp_pause(de, last_y);
++    p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));  // same STATUS layout as end_previous_slice
++    if (new_x==2 || de->PicWidthInCtbsY==2 && de->wpp_entry_y<new_y)  // parses as: new_x==2 || (width==2 && entry_y<new_y)
++        p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static void new_slice_segment(dec_env_t * const de, const HEVCContext * const s)
++{
++    const HEVCSPS *sps = s->ps.sps;
++    const HEVCPPS *pps = s->ps.pps;
++    // RPI_SPS0: SPS size/depth fields packed four bits apart (bit_depth is written at both bit 16 and bit 20)
++    p1_apb_write(de, RPI_SPS0,
++        (sps->log2_min_cb_size                    <<  0) +
++        (sps->log2_ctb_size                       <<  4) +
++        (sps->log2_min_tb_size                    <<  8) +
++        (sps->log2_max_trafo_size                 << 12) +
++        (sps->bit_depth                           << 16) +
++        (sps->bit_depth                           << 20) +
++        (sps->max_transform_hierarchy_depth_intra << 24) +
++        (sps->max_transform_hierarchy_depth_inter << 28));
++    // RPI_SPS1: PCM parameters, chroma format (0 when separate_colour_plane_flag is set, since ?: binds looser than <<) and SPS flags
++    p1_apb_write(de, RPI_SPS1,
++        (sps->pcm.bit_depth                                        <<  0) +
++        (sps->pcm.bit_depth_chroma                                 <<  4) +
++        (sps->pcm.log2_min_pcm_cb_size                             <<  8) +
++        (sps->pcm.log2_max_pcm_cb_size                             << 12) +
++        (sps->separate_colour_plane_flag? 0:sps->chroma_format_idc << 16) +
++        (sps->amp_enabled_flag                                     << 18) +
++        (sps->pcm_enabled_flag                                     << 19) +
++        (sps->scaling_list_enable_flag                             << 20) +
++        (sps->sps_strong_intra_smoothing_enable_flag               << 21));
++    // RPI_PPS: log2_ctb_size minus diff_cu_qp_delta_depth, PPS flags, and PPS + slice chroma QP offsets each masked to 8 bits
++    p1_apb_write(de, RPI_PPS,
++        (sps->log2_ctb_size - pps->diff_cu_qp_delta_depth   <<  0) +
++        (pps->cu_qp_delta_enabled_flag                      <<  4) +
++        (pps->transquant_bypass_enable_flag                 <<  5) +
++        (pps->transform_skip_enabled_flag                   <<  6) +
++        (pps->sign_data_hiding_flag                         <<  7) +
++      (((pps->cb_qp_offset + s->sh.slice_cb_qp_offset)&255) <<  8) +
++      (((pps->cr_qp_offset + s->sh.slice_cr_qp_offset)&255) << 16) +
++        (pps->constrained_intra_pred_flag                   << 24));
++
++    if (s->ps.sps->scaling_list_enable_flag) WriteScalingFactors(de);
++    // reg_slicestart is recomputed only for independent slice segments; dependent ones reuse the stored value
++    if (!s->sh.dependent_slice_segment_flag) {
++        int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;
++        int ctb_row = s->sh.slice_ctb_addr_rs / de->PicWidthInCtbsY;
++        de->reg_slicestart = (ctb_col<<0) + (ctb_row<<16);
++    }
++
++    p1_apb_write(de, RPI_SLICESTART, de->reg_slicestart);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++static void write_slice(dec_env_t * const de, const HEVCContext * const s,
++                        const unsigned int slice_w, const unsigned int slice_h) {  // Assemble and queue the RPI_SLICE word
++    uint32_t u32 =
++          (s->sh.slice_type                           << 12)
++        + (s->sh.slice_sample_adaptive_offset_flag[0] << 14)
++        + (s->sh.slice_sample_adaptive_offset_flag[1] << 15)
++        + (slice_w                                    << 17)
++        + (slice_h                                    << 24);
++    // Merge candidate count and reference counts are only added for P and B slices
++    if (s->sh.slice_type==HEVC_SLICE_B || s->sh.slice_type==HEVC_SLICE_P) u32 |=
++          (s->sh.max_num_merge_cand << 0)
++        + (s->sh.nb_refs[L0]        << 4)
++        + (s->sh.nb_refs[L1]        << 8);
++    // mvd_l1_zero_flag (bit 16) is only added for B slices
++    if (s->sh.slice_type==HEVC_SLICE_B)
++        u32 |= s->sh.mvd_l1_zero_flag<<16;
++    p1_apb_write(de, RPI_SLICE, u32);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Wavefront mode
++
++static void wpp_entry_point(dec_env_t * const de, const HEVCContext * const s,
++                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
++    const HEVCSPS * const sps = s->ps.sps;
++    const HEVCPPS * const pps = s->ps.pps;
++    // Queue the register writes that start decoding at ctb_addr_ts, with the tile range set to the entry CTB's whole row
++    int ctb_size = 1<<sps->log2_ctb_size;
++    int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++    // The entry position is also saved in de for later use by wpp_end_previous_slice
++    int ctb_col = de->wpp_entry_x = ctb_addr_rs % de->PicWidthInCtbsY;
++    int ctb_row = de->wpp_entry_y = ctb_addr_rs / de->PicWidthInCtbsY;
++
++    int endx = de->PicWidthInCtbsY-1;  // last CTB column of the picture
++    int endy = ctb_row;                // the range ends on the entry row itself
++
++    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, ctb_size, sps->width,  pps->col_bd, pps->num_tile_columns);
++    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++    p1_apb_write(de, RPI_TILESTART, 0);
++    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++
++    if (do_bte)
++        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++    // slice_h is only passed on for the last CTB row; every other row passes the full ctb_size
++    write_slice(de, s, slice_w, ctb_row==de->PicHeightInCtbsY-1? slice_h : ctb_size);
++
++    if (resetQPY) p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++
++    p1_apb_write(de, RPI_MODE, ctb_row==de->PicHeightInCtbsY-1? 0x60001 : 0x20001);  // 0x60001 only on the last CTB row
++    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Tiles mode
++
++static void new_entry_point(dec_env_t * const de, const HEVCContext * const s,
++                            const int do_bte, const int resetQPY, const int ctb_addr_ts) {
++    const HEVCSPS * const sps = s->ps.sps;
++    const HEVCPPS * const pps = s->ps.pps;
++    // Queue the register writes that start decoding at ctb_addr_ts, bounded by the tile containing that CTB
++    int ctb_col = pps->ctb_addr_ts_to_rs[ctb_addr_ts] % de->PicWidthInCtbsY;
++    int ctb_row = pps->ctb_addr_ts_to_rs[ctb_addr_ts] / de->PicWidthInCtbsY;
++
++    int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
++    int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
++    // Last CTB column / row of the tile: one before the next tile boundary
++    int endx = pps->col_bd[tile_x+1] - 1;
++    int endy = pps->row_bd[tile_y+1] - 1;
++
++    uint8_t slice_w = ctb_to_slice_w_h(ctb_col, 1<<sps->log2_ctb_size, sps->width,  pps->col_bd, pps->num_tile_columns);
++    uint8_t slice_h = ctb_to_slice_w_h(ctb_row, 1<<sps->log2_ctb_size, sps->height, pps->row_bd, pps->num_tile_rows);
++
++    p1_apb_write(de, RPI_TILESTART, pps->col_bd[tile_x] + (pps->row_bd[tile_y]<<16));
++    p1_apb_write(de, RPI_TILEEND, endx + (endy<<16));
++
++    if (do_bte)
++        p1_apb_write(de, RPI_BEGINTILEEND, endx + (endy<<16));
++
++    write_slice(de, s, slice_w, slice_h);
++
++    if (resetQPY)
++        p1_apb_write(de, RPI_QP, sps->qp_bd_offset + s->sh.slice_qp);
++    // RPI_MODE: low 16 bits all set, bit 16 clear, bit 17 = last tile column, bit 18 = last tile row
++    p1_apb_write(de, RPI_MODE, (0xFFFF                            <<  0)
++                              + (0x0                               << 16)
++                              + ((tile_x==pps->num_tile_columns-1) << 17)
++                              + ((tile_y==pps->num_tile_rows-1)    << 18));
++
++    p1_apb_write(de, RPI_CONTROL, (ctb_col<<0) + (ctb_row<<16));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++// Doesn't attempt to remove from context as we should only do this at the end
++// of time or on create error
++static void
++dec_env_delete(dec_env_t * const de)
++{
++//    gpu_free(&de->gbuf);
++    // NOTE(review): the FIFOs are allocated with malloc/realloc in this file but released via av_freep - confirm the allocators are compatible
++    av_freep(&de->cmd_fifo);
++    av_freep(&de->bit_fifo);
++
++    sem_destroy(&de->phase_wait);
++    av_free(de);  // de itself comes from av_mallocz in dec_env_new
++}
++
++static dec_env_t *
++dec_env_new(AVCodecContext * const avctx, RPI_T * const rpi)
++{
++    dec_env_t * const de = av_mallocz(sizeof(*de));
++    int i;
++    // Creates a zeroed context with two 1024-entry FIFOs and stores it in the first free rpi->dec_envs slot; NULL on any failure
++    if (de == NULL)
++        return NULL;
++
++    de->avctx = avctx;
++    de->phase_no = RPIVID_PHASE_NEW;
++    // The semaphore is set up before anything jumps to fail, where dec_env_delete destroys it
++    sem_init(&de->phase_wait, 0, 0);
++
++    if ((de->cmd_fifo = malloc((de->cmd_max=1024)*sizeof(struct RPI_CMD))) == NULL)
++        goto fail;
++
++    if ((de->bit_fifo = malloc((de->bit_max=1024)*sizeof(struct RPI_BIT))) == NULL)
++        goto fail;
++
++    pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
++    for (i = 0; i != avctx->thread_count; ++i) {  // only the first thread_count slots are considered
++        if (rpi->dec_envs[i] == NULL)
++        {
++            rpi->dec_envs[i] = de;
++            break;
++        }
++    }
++    pthread_mutex_unlock(&rpi->phase_lock);
++
++    if (i == avctx->thread_count) {  // loop ran to completion: no free slot was found
++        av_log(avctx, AV_LOG_ERROR, "Failed to find a slot for hw thread context\n");
++        goto fail;
++    }
++
++    return de;
++
++fail:
++    dec_env_delete(de);
++    return NULL;
++}
++
++
++// Take a reference on rpi and return the dec_env for avctx (creating one if needed); on NULL return no reference is held
++static dec_env_t *
++dec_env_get(AVCodecContext * const avctx, RPI_T * const rpi)
++{
++    dec_env_t * de = NULL;
++    const int ref_count = atomic_fetch_add(&rpi->ref_count, 1);
++    if (ref_count <= 0) {
++        // Already dead - undo our increment so that later calls still see the dead state
++        atomic_fetch_sub(&rpi->ref_count, 1);
++        av_log(avctx, AV_LOG_ERROR, "RPIVID called whilst dead\n");
++        return NULL;
++    }
++
++    for (int i = 0; i != avctx->thread_count && de == NULL; ++i) {
++        if (rpi->dec_envs[i] == NULL) {
++            de = dec_env_new(avctx, rpi);  // first empty slot reached without a match: create a new env
++            break;
++        }
++        if (rpi->dec_envs[i]->avctx == avctx)
++            de = rpi->dec_envs[i];  // existing env for this avctx; the loop condition ends the scan
++    }
++    // Callers only call dec_env_release() after a successful get, so drop the reference here on failure
++    if (de == NULL && atomic_fetch_sub(&rpi->ref_count, 1) == 1)
++        sem_post(&rpi->ref_zero);
++    return de;
++}
++
++// Call at end of fn
++// Used to ensure we aren't in a worker thread when killed
++static void
++dec_env_release(RPI_T * const rpi, dec_env_t * const de)
++{
++    const int n = atomic_fetch_sub(&rpi->ref_count, 1);  // n is the count before the decrement; de is not referenced
++    if (n == 1) {  // this call took the count to zero: post ref_zero
++        sem_post(&rpi->ref_zero);
++    }
++}
++
++//----------------------------------------------------------------------------
++
++// Wait for a slot in the given phase
++// Any error return is probably fatal
++static int
++wait_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{
++    int needs_wait = 0;
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++    // Unless the phase's last posted order is our immediate predecessor, push de onto the head of the phase's wait queue
++    pthread_mutex_lock(&rpi->phase_lock);
++    if (p->last_order + 1 != de->decode_order) {
++        de->phase_wait_q_next = p->q;
++        p->q = de;
++        needs_wait = 1;
++    }
++    pthread_mutex_unlock(&rpi->phase_lock);
++    // Block on our own semaphore (posted by post_phase); EINTR restarts the wait, any other error is returned as AVERROR
++    if (needs_wait) {
++        while (sem_wait(&de->phase_wait) == -1)
++        {
++            int err;
++            if ((err = errno) != EINTR)
++                return AVERROR(err);
++        }
++    }
++
++    de->phase_no = phase_no;  // only reached on success; records the phase de now occupies
++    return 0;
++}
++
++static void
++post_phase(RPI_T * const rpi, dec_env_t * const de, const int phase_no)
++{
++    dec_env_t * next_de = NULL;
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++    dec_env_t ** q = &p->q;
++    // Record de as the phase's latest finisher, then unlink and wake the queued waiter (if any) that is next in decode order
++    pthread_mutex_lock(&rpi->phase_lock);
++
++    p->last_order = de->decode_order;
++    while (*q != NULL) {
++        dec_env_t * const t_de = *q;
++
++        if (t_de->decode_order == p->last_order + 1) {
++            // This is the waiter that is next in decode order - remove from Q
++            *q = t_de->phase_wait_q_next;
++            t_de->phase_wait_q_next = NULL; // Tidy
++            next_de = t_de;
++            break;
++        }
++        q = &t_de->phase_wait_q_next;
++    }
++
++    pthread_mutex_unlock(&rpi->phase_lock);
++    // The semaphore is posted after the lock has been dropped
++    if (next_de != NULL)
++        sem_post(&next_de->phase_wait);
++}
++
++// Wait & signal stuff s.t. threads in other phases can continue
++static void
++abort_phases(RPI_T * const rpi, dec_env_t * const de)
++{
++    for (int i = de->phase_no + 1; i < RPIVID_PHASE_NEW; ++i) {  // every phase after the current one, below RPIVID_PHASE_NEW
++        wait_phase(rpi, de, i);  // return value is ignored here
++        post_phase(rpi, de, i);
++    }
++    de->phase_no = RPIVID_PHASE_NEW;
++}
++
++// Start timing for phase
++// Stats only - no actual effect
++static inline void tstart_phase(RPI_T * const rpi, const int phase_no)
++{
++#if OPT_PHASE_TIMING
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++    const int64_t now = tus64();  // presumably a microsecond timestamp - confirm against tus64
++    if (p->phase_time != 0)  // skipped while phase_time is still 0, i.e. before any timestamp was stored
++        p->time_out_phase += now - p->phase_time;  // time since the previously stored timestamp counts as "out of phase"
++    p->phase_time = now;
++#endif
++}
++
++#if OPT_PHASE_TIMING
++static unsigned int tavg_bin_phase(phase_wait_env_t *const p, const unsigned int avg_n)
++{
++    uint64_t tsum = 0;
++    unsigned int i;
++    for (i = 0; i != avg_n; ++i)  // sum avg_n entries of the 16-slot ring, walking backwards from index i3
++        tsum += p->time_stash[(p->i3 - i) & 15];
++    for (i = 0; i != 9; ++i) {  // first of nine thresholds whose scaled value exceeds the sum; 9 if none does
++        if (time_thresholds[i] * 1000 * avg_n > tsum)
++            break;
++    }
++    return i;
++}
++#endif
++
++// End timing for phase
++// Stats only - no actual effect
++static inline void tend_phase(RPI_T * const rpi, const int phase_no)
++{
++#if OPT_PHASE_TIMING
++    phase_wait_env_t *const p = rpi->phase_reqs + phase_no;
++    const uint64_t now = tus64();
++    const uint64_t in_time = now - p->phase_time;  // elapsed since the timestamp stored by tstart_phase/tend_phase
++    // Accumulate the in-phase time, stash it in the ring at i3 and track the maximum seen
++    p->time_in_phase += in_time;
++    p->phase_time = now;
++    p->time_stash[p->i3] = in_time;
++    if (in_time > p->max_phase_time) {
++        p->max_phase_time = in_time;
++        p->max_time_decode_order = p->last_order;
++    }
++    ++p->time_bins[tavg_bin_phase(p, 1)];
++    ++p->time_bins3[tavg_bin_phase(p, 3)];
++    ++p->time_bins5[tavg_bin_phase(p, 5)];
++    // Advance the ring index only after binning, so the bins include the sample just stored
++    p->i3 = (p->i3 + 1) & 15;
++#endif
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Start frame
++
++static int rpi_hevc_start_frame(
++    AVCodecContext * avctx,
++    const uint8_t *buffer,
++    uint32_t size) {
++    // Per-frame entry: fetch this thread's dec_env, stamp its decode order and reset its FIFOs. Returns 0, or -1 on error
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    dec_env_t * const de = dec_env_get(avctx, rpi);
++    const HEVCContext * const s = avctx->priv_data;
++    const HEVCSPS * const sps = s->ps.sps;
++    const unsigned int CtbSizeY = 1U << sps->log2_ctb_size;
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++    if (de == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find context for thread\n", __func__);
++        return -1;
++    }
++
++    de->phase_no = RPIVID_PHASE_START;
++    de->decode_order = ++rpi->decode_order;  // *** atomic?
++
++    ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
++
++    if (de->state != RPIVID_DECODE_NEW && de->state != RPIVID_DECODE_END) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
++        dec_env_release(rpi, de);  // Balance dec_env_get(): every exit taken with a valid de must release
++        return -1;
++    }
++    de->state = RPIVID_DECODE_START;
++
++    de->PicWidthInCtbsY  = (sps->width + CtbSizeY - 1) / CtbSizeY;  //7-15
++    de->PicHeightInCtbsY = (sps->height + CtbSizeY - 1) / CtbSizeY;  //7-17
++    de->bit_len = 0;
++    de->cmd_len = 0;
++
++#if TRACE_ENTRY
++    printf(">>> %s[%p]\n", __func__, de);
++#endif
++    dec_env_release(rpi, de);
++    return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Slice messages
++
++static void msg_slice(dec_env_t * const de, const uint16_t msg) {  // Append msg to de->slice_msgs
++    de->slice_msgs[de->num_slice_msgs++] = msg;  // NOTE(review): no bounds check here - capacity of slice_msgs is not visible, confirm it suffices
++}
++
++static void program_slicecmds(dec_env_t * const de, const int sliceid) {
++    int i;  // Queue the message count (low bits) with sliceid from bit 8, then each message at 0x4000 + 4*i
++    p1_apb_write(de, RPI_SLICECMDS, de->num_slice_msgs+(sliceid<<8));
++    for(i=0; i < de->num_slice_msgs; i++) {
++        p1_apb_write(de, 0x4000+4*i, de->slice_msgs[i] & 0xffff);
++    }
++}
++
++static void pre_slice_decode(dec_env_t * const de, const HEVCContext * const s) {
++    const HEVCSPS * const sps = s->ps.sps;
++    const HEVCPPS * const pps = s->ps.pps;
++    const SliceHeader *sh = &s->sh;
++    // Rebuilds de->slice_msgs for this slice: a command word, per-reference descriptors (P/B only), then deblock and QP-offset words
++    int weightedPredFlag, i, rIdx;
++    uint16_t cmd_slice;
++    unsigned int collocated_from_l0_flag;
++
++    de->num_slice_msgs=0;
++    de->dpbno_col = 0;
++    cmd_slice = 0;
++    if (sh->slice_type==HEVC_SLICE_I) cmd_slice = 1;
++    if (sh->slice_type==HEVC_SLICE_P) cmd_slice = 2;
++    if (sh->slice_type==HEVC_SLICE_B) cmd_slice = 3;
++    // Reference counts are added from bit 2 (L0) and bit 6 (L1) for non-I slices
++    if (sh->slice_type!=HEVC_SLICE_I) {
++        cmd_slice += sh->nb_refs[L0]<<2;
++        cmd_slice += sh->nb_refs[L1]<<6;
++    }
++
++    if (sh->slice_type==HEVC_SLICE_P ||  sh->slice_type==HEVC_SLICE_B)
++        cmd_slice |= sh->max_num_merge_cand<<11;
++    // Bit 14: 0 when slice temporal MVP is off; for B slices (collocated_list == L0); otherwise 1 only for P slices
++    collocated_from_l0_flag =
++        !sh->slice_temporal_mvp_enabled_flag ?
++            0 :
++        sh->slice_type == HEVC_SLICE_B ?
++            (sh->collocated_list == L0) :
++            (sh->slice_type==HEVC_SLICE_P);
++    cmd_slice |= collocated_from_l0_flag<<14;
++
++    if (sh->slice_type==HEVC_SLICE_P || sh->slice_type==HEVC_SLICE_B) {
++
++        int NoBackwardPredFlag = 1; // Flag to say all reference pictures are from the past
++        for(i=L0; i<=L1; i++) {
++            for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
++                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++                HEVCFrame *c = s->ref; // CurrentPicture
++                if (c->poc < f->poc) NoBackwardPredFlag = 0;
++            }
++        }
++
++        if (sps->sps_temporal_mvp_enabled_flag)
++        {
++            const RefPicList *rpl = (sh->slice_type != HEVC_SLICE_B || collocated_from_l0_flag) ?
++                s->ref->refPicList + 0 :
++                s->ref->refPicList + 1;
++            de->dpbno_col = rpl->ref[sh->collocated_ref_idx] - s->DPB;  // index of the collocated frame within s->DPB
++        }
++
++        cmd_slice += NoBackwardPredFlag<<10;
++        msg_slice(de, cmd_slice);
++
++        // Write reference picture descriptions
++        weightedPredFlag = sh->slice_type==HEVC_SLICE_P? pps->weighted_pred_flag : pps->weighted_bipred_flag;
++
++        for(i=L0; i<=L1; i++)
++            for(rIdx=0; rIdx <sh->nb_refs[i]; rIdx++) {
++                HEVCFrame *f = s->ref->refPicList[i].ref[rIdx];
++                HEVCFrame *c = s->ref; // CurrentPicture
++                int pic = f - s->DPB;
++                // Make sure pictures are in range 0 to 15
++                int adjusted_pic = f<c? pic : pic-1;
++                int lt = s->ref->refPicList[i].isLongTerm[rIdx];
++                msg_slice(de, adjusted_pic+(lt<<4)+(weightedPredFlag<<5)+(weightedPredFlag<<6));
++                msg_slice(de, f->poc);
++                if (weightedPredFlag) {  // six extra words: luma weight/offset, then weight/offset for each chroma component
++                    msg_slice(de,   s->sh.luma_log2_weight_denom+(((i?s->  sh.luma_weight_l1:  s->sh.luma_weight_l0)[rIdx]   &0x1ff)<<3));
++                    msg_slice(de,                                  (i?s->  sh.luma_offset_l1:  s->sh.luma_offset_l0)[rIdx]   & 0xff);
++                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][0]&0x1ff)<<3));
++                    msg_slice(de,                                  (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][0]& 0xff);
++                    msg_slice(de, s->sh.chroma_log2_weight_denom+(((i?s->sh.chroma_weight_l1:s->sh.chroma_weight_l0)[rIdx][1]&0x1ff)<<3));
++                    msg_slice(de,                                  (i?s->sh.chroma_offset_l1:s->sh.chroma_offset_l0)[rIdx][1]& 0xff);
++                }
++            }
++    }
++    else
++        msg_slice(de, cmd_slice);
++    // The two words below are appended for every slice type
++    msg_slice(de, ((sh->beta_offset/2)&15)
++        + (((sh->tc_offset/2)&15)                           <<  4)
++        + (sh->disable_deblocking_filter_flag               <<  8)
++        + (sh->slice_loop_filter_across_slices_enabled_flag <<  9)
++        + (pps->loop_filter_across_tiles_enabled_flag       << 10)); // CMD_DEBLOCK
++
++    msg_slice(de, ((sh->slice_cr_qp_offset&31)<<5) + (sh->slice_cb_qp_offset&31)); // CMD_QPOFF
++}
++
++
++//////////////////////////////////////////////////////////////////////////////
++// Abort frame: runs abort_phases() for this thread's env and resets its state to NEW
++static void rpi_hevc_abort_frame(AVCodecContext * const avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    dec_env_t * const de = dec_env_get(avctx,  rpi);
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++    if (de == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++        return;
++    }
++
++    switch (de->state) {
++        case RPIVID_DECODE_NEW:
++        case RPIVID_DECODE_END:
++            // Expected transition
++            break;
++
++        case RPIVID_DECODE_SLICE:
++            // Error transition
++            av_log(avctx, AV_LOG_INFO, "Error in decode - aborting\n");
++            break;
++
++        case RPIVID_DECODE_START:
++        default:
++            av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state transition: %d\n", __func__, de->state);
++            break;
++    }
++
++    abort_phases(rpi, de);
++    de->state = RPIVID_DECODE_NEW;
++
++    dec_env_release(rpi, de);
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// End frame: phase 0 copies cmds + bits into a GPU buffer, phase 1 triggers the
++// cmd FIFO, phase 2 programs output/ref frame addresses.  Returns 0 or AVERROR.
++static int rpi_hevc_end_frame(AVCodecContext * const avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    const HEVCContext * const s = avctx->priv_data;
++    const HEVCPPS * const pps = s->ps.pps;
++    const HEVCSPS * const sps = s->ps.sps;
++    dec_env_t * const de = dec_env_get(avctx,  rpi);
++    AVFrame * const f = s->ref->frame;
++    const unsigned int dpbno_cur = s->ref - s->DPB;
++    vid_vc_addr_t cmds_vc;
++    vid_vc_addr_t pu_base_vc;
++    unsigned int pu_stride;
++    vid_vc_addr_t coeff_base_vc;
++    unsigned int coeff_stride;
++    unsigned int i;
++    int rv = 0;
++    int status = 0;
++    int coeffbuf_sem_claimed = 0;
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++
++    if (de == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++        return AVERROR_BUG;  // Should never happen
++    }
++
++    if (de->state != RPIVID_DECODE_SLICE) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
++        rv = AVERROR_UNKNOWN;
++        goto fail;
++    }
++    de->state = RPIVID_DECODE_END;
++
++    // End of command compilation
++    {
++        const unsigned int last_x = pps->col_bd[pps->num_tile_columns]-1;
++        const unsigned int last_y = pps->row_bd[pps->num_tile_rows]-1;
++        if (pps->entropy_coding_sync_enabled_flag) {
++            if (de->wpp_entry_x<2 && de->PicWidthInCtbsY>2)
++                wpp_pause(de, last_y);
++        }
++        p1_apb_write(de, RPI_STATUS, 1 + (last_x<<5) + (last_y<<18));
++    }
++
++    // Phase 0 ---------------------------------------------------------------
++
++    wait_phase(rpi, de, 0);
++    rpi_sem_wait(&rpi->bitbuf_sem);
++    tstart_phase(rpi, 0);
++
++    // Copy cmds & bits into gpu side buffer
++    // Layout: CMDS, BITS
++    {
++        uint8_t * const armbase = rpi->gbitbufs[rpi->bitbuf_no].arm;
++        vid_vc_addr_t vcbase = rpi->gbitbufs[rpi->bitbuf_no].vc;
++        unsigned int cmd_bytes = de->cmd_len * sizeof(struct RPI_CMD);
++
++        uint8_t * p = armbase + rnd64(cmd_bytes);
++        uint8_t * const eobits = armbase + rpi->gbitbufs[rpi->bitbuf_no].numbytes;
++
++        cmds_vc = vcbase;
++
++        // Copy all the bits & update bitstream cmds to point at the right bits
++        for (i = 0; i < de->bit_len; ++i)
++        {
++            const unsigned int seg_len = de->bit_fifo[i].len;
++
++            if (p + seg_len > eobits) {
++                status = -1;
++                break;
++            }
++
++            memcpy(p, de->bit_fifo[i].ptr, seg_len);
++            de->cmd_fifo[de->bit_fifo[i].cmd].data = MANGLE64((p - armbase) + vcbase);
++
++            p += rnd64(seg_len);
++        }
++
++        memcpy(armbase, de->cmd_fifo, cmd_bytes);
++    }
++
++    if (status == 0)
++    {
++        if (++rpi->bitbuf_no >= RPIVID_BITBUFS)
++            rpi->bitbuf_no = 0;
++    }
++    else
++    {
++        sem_post(&rpi->bitbuf_sem);
++        av_log(avctx, AV_LOG_ERROR, "Out of HEVC bit/cmd memory\n");
++        rv = AVERROR_BUFFER_TOO_SMALL;
++    }
++
++    tend_phase(rpi, 0);
++    post_phase(rpi, de, 0);
++
++    if (status < 0)
++        goto fail;
++
++    // Phase 1 ---------------------------------------------------------------
++
++    wait_phase(rpi, de, 1);
++    rpi_sem_wait(&rpi->coeffbuf_sem);
++    coeffbuf_sem_claimed = 1;
++    tstart_phase(rpi, 1);
++
++    status = 0;
++    for (;;)
++    {
++        // (Re-)allocate PU/COEFF stream space
++        const unsigned int total_size = rpi->gcoeffbufs[rpi->coeffbuf_no].numbytes;
++        unsigned int pu_size;
++
++        pu_base_vc = rpi->gcoeffbufs[rpi->coeffbuf_no].vc;
++        pu_stride = rnd64(rpi->max_pu_msgs * 2 * de->PicWidthInCtbsY);
++        pu_size = pu_stride * de->PicHeightInCtbsY;
++
++        if (pu_size >= total_size || status == -1) {
++            GPU_MEM_PTR_T newbuf;
++
++            if (gpu_malloc_uncached(round_up_size(total_size + 1), &newbuf) != 0)
++            {
++                av_log(avctx, AV_LOG_ERROR, "Failed to reallocate coeffbuf\n");
++                status = -1;
++                break;
++            }
++            gpu_free(rpi->gcoeffbufs + rpi->coeffbuf_no);
++            rpi->gcoeffbufs[rpi->coeffbuf_no] = newbuf;
++            status = 0;
++            continue;
++        }
++
++        // Allocate all remaining space to coeff
++        coeff_base_vc = pu_base_vc + pu_size;
++        coeff_stride = ((total_size - pu_size) / de->PicHeightInCtbsY) & ~63;  // Round down to multiple of 64
++
++        apb_write_vc_addr(rpi, RPI_PUWBASE, pu_base_vc);
++        apb_write_vc_len(rpi, RPI_PUWSTRIDE, pu_stride);
++        apb_write_vc_addr(rpi, RPI_COEFFWBASE, coeff_base_vc);
++        apb_write_vc_len(rpi, RPI_COEFFWSTRIDE, coeff_stride);
++
++        // Trigger command FIFO
++        apb_write(rpi, RPI_CFNUM, de->cmd_len);
++#if TRACE_DEV && 0
++        apb_dump_regs(rpi, 0x0, 32);
++        apb_dump_regs(rpi, 0x8000, 24);
++        axi_dump(de, ((uint64_t)a64)<<6, de->cmd_len * sizeof(struct RPI_CMD));
++#endif
++        apb_write_vc_addr(rpi, RPI_CFBASE, cmds_vc);
++
++        int_wait(rpi, 1);
++
++        status = check_status(rpi, de);
++
++        if (status == -1)
++            continue;
++        else if (status != 1)
++            break;
++
++        // Status 1 means out of PU space so try again with more
++        // If we ran out of Coeff space then we are out of memory - we could possibly realloc?
++        rpi->max_pu_msgs += rpi->max_pu_msgs / 2;
++    }
++
++    // Inc inside the phase 1 lock, but only inc if we succeeded otherwise we
++    // may reuse a live buffer when we kick the coeff sem
++    if (status == 0)
++    {
++        if (++rpi->coeffbuf_no >= RPIVID_COEFFBUFS)
++            rpi->coeffbuf_no = 0;
++    }
++    else
++    {
++        if (status == -1)
++        {
++            av_log(avctx, AV_LOG_ERROR, "Out of pu + coeff intermediate memory: pus=%d\n", rpi->max_pu_msgs);
++            rv = AVERROR_BUFFER_TOO_SMALL;
++        }
++        else
++        {
++            av_log(avctx, AV_LOG_WARNING, "Phase 1 decode error\n");
++            rv = AVERROR_INVALIDDATA;
++        }
++    }
++
++    tend_phase(rpi, 1);
++    sem_post(&rpi->bitbuf_sem);
++    post_phase(rpi, de, 1);
++
++    if (status != 0)
++        goto fail;
++
++    // Phase 2 ---------------------------------------------------------------
++
++    wait_phase(rpi, de, 2);
++
++    if ((rv = av_rpi_zc_resolve_frame(f, ZC_RESOLVE_ALLOC)) != 0)
++    {
++        // As we are in phase 2 already here we don't need to worry about
++        // coeffbuf_no despite the early exit
++        post_phase(rpi, de, 2);
++        av_log(avctx, AV_LOG_ERROR, "Failed to allocate output frame\n");
++        goto fail;
++    }
++
++    tstart_phase(rpi, 2);
++
++    apb_write_vc_addr(rpi, RPI_PURBASE, pu_base_vc);
++    apb_write_vc_len(rpi, RPI_PURSTRIDE, pu_stride);
++    apb_write_vc_addr(rpi, RPI_COEFFRBASE, coeff_base_vc);
++    apb_write_vc_len(rpi, RPI_COEFFRSTRIDE, coeff_stride);
++
++    apb_write_vc_addr(rpi, RPI_OUTYBASE, get_vc_address_y(f));
++    apb_write_vc_addr(rpi, RPI_OUTCBASE, get_vc_address_u(f));
++    apb_write_vc_len(rpi, RPI_OUTYSTRIDE, f->linesize[3] * 128);
++    apb_write_vc_len(rpi, RPI_OUTCSTRIDE, f->linesize[3] * 128);
++
++    // Keep the last thing we resolved as fallback for any ref we fail to
++    // resolve.  As a final fallback use our current frame.  The pels might
++    // not be there yet but at least the memory is valid.
++    //
++    // Attempt to resolve the entire DPB - we could note what we have used
++    // in ref lists but probably simpler and more reliable to set the whole thing
++    {
++        AVFrame * fallback_frame = f;
++        for (i = 0; i != 16; ++i) {
++            // Avoid current frame
++            const HEVCFrame * hevc_fr = (s->DPB + i >= s->ref) ? s->DPB + i + 1 : s->DPB + i;
++            AVFrame * fr = hevc_fr->frame;
++
++            if (fr != NULL &&
++                av_rpi_zc_resolve_frame(fr, ZC_RESOLVE_FAIL) == 0)
++            {
++                fallback_frame = fr;
++            }
++            else
++            {
++                fr = fallback_frame;
++            }
++
++            apb_write_vc_addr(rpi, 0x9000+16*i, get_vc_address_y(fr));
++            apb_write(rpi, 0x9004+16*i, 0);
++            apb_write_vc_addr(rpi, 0x9008+16*i, get_vc_address_u(fr));
++            apb_write(rpi, 0x900C+16*i, 0);
++        }
++    }
++
++    apb_write(rpi, RPI_CONFIG2,
++          (sps->bit_depth                             << 0) // BitDepthY
++        + (sps->bit_depth                             << 4) // BitDepthC
++       + ((sps->bit_depth>8)                          << 8) // BitDepthY
++       + ((sps->bit_depth>8)                          << 9) // BitDepthC
++        + (sps->log2_ctb_size                         <<10)
++        + (pps->constrained_intra_pred_flag           <<13)
++        + (sps->sps_strong_intra_smoothing_enable_flag<<14)
++        + (sps->sps_temporal_mvp_enabled_flag         <<15)
++        + (pps->log2_parallel_merge_level             <<16)
++        + (s->sh.slice_temporal_mvp_enabled_flag      <<19)
++        + (sps->pcm.loop_filter_disable_flag          <<20)
++       + ((pps->cb_qp_offset&31)                      <<21)
++       + ((pps->cr_qp_offset&31)                      <<26));
++
++    apb_write(rpi, RPI_FRAMESIZE, (sps->height<<16) + sps->width);
++    apb_write(rpi, RPI_CURRPOC, s->poc);
++
++    // collocated reads/writes
++    if (sps->sps_temporal_mvp_enabled_flag) {
++        av_assert0(de->dpbno_col < RPIVID_COL_PICS);
++        av_assert0(dpbno_cur < RPIVID_COL_PICS);
++
++        apb_write_vc_len(rpi, RPI_COLSTRIDE, rpi->col_stride);
++        apb_write_vc_len(rpi, RPI_MVSTRIDE,  rpi->col_stride);
++        apb_write_vc_addr(rpi, RPI_MVBASE,  rpi->gcolbuf.vc + dpbno_cur * rpi->col_picsize);
++        apb_write_vc_addr(rpi, RPI_COLBASE, rpi->gcolbuf.vc + de->dpbno_col * rpi->col_picsize);
++    }
++
++#if TRACE_DEV && 0
++    apb_dump_regs(rpi, 0x0, 32);
++    apb_dump_regs(rpi, 0x8000, 24);
++#endif
++
++    apb_write(rpi, RPI_NUMROWS, de->PicHeightInCtbsY);
++    apb_read(rpi, RPI_NUMROWS); // Read back to confirm write has reached block
++
++    int_wait(rpi, 2);
++
++    tend_phase(rpi, 2);
++    coeffbuf_sem_claimed = 0;
++    sem_post(&rpi->coeffbuf_sem);
++    // Set valid here to avoid race in resolving in any pending phase 2
++    av_rpi_zc_set_valid_frame(f);
++
++    post_phase(rpi, de, 2);
++
++    // Flush frame for CPU access
++    // Arguably the best place would be at the start of phase 2 but here
++    // will overlap with the wait
++    //
++    // * Even better would be to have better lock/unlock control in ZC for external access
++    if (rpi->gpu_init_type == GPU_INIT_GPU)  // * CMA is currently always uncached
++    {
++        rpi_cache_buf_t cbuf;
++        rpi_cache_flush_env_t * const fe = rpi_cache_flush_init(&cbuf);
++        rpi_cache_flush_add_frame(fe, f, RPI_CACHE_FLUSH_MODE_INVALIDATE);
++        rpi_cache_flush_finish(fe);
++    }
++
++#if TRACE_ENTRY
++    printf(">>> %s[%p] OK\n", __func__, de);
++#endif
++
++    dec_env_release(rpi, de);
++    return 0;
++
++fail:
++    av_rpi_zc_set_broken_frame(f);
++    if (coeffbuf_sem_claimed)
++        sem_post(&rpi->coeffbuf_sem);
++    abort_phases(rpi, de);  // Dummy any unresolved phases
++
++#if TRACE_ENTRY
++    printf(">>> %s[%p] FAIL\n", __func__, de);
++#endif
++
++    dec_env_release(rpi, de);
++    return rv;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++
++
++#if TRACE_DEV
++static void dump_data(const uint8_t * p, size_t len)  // Debug hex dump of len bytes at p, 16 per row
++{
++    size_t i;
++    for (i = 0; i < len; i += 16) {
++        size_t j;
++        printf("%04zx", i);  // Row offset; i is size_t so it needs the z length modifier
++        for (j = 0; j != 16 && i + j < len; ++j) {  // Stop at len so a short last row is not over-read
++            printf("%c%02x", j == 8 ? '-' : ' ', p[i+j]);  // '-' separates the two 8-byte halves of a row
++        }
++        printf("\n");
++    }
++}
++#endif
++
++#if OPT_EMU
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)  // Returns b advanced by idx counted bytes
++{
++    unsigned int z = 0;  // Length of the current run of zero bytes
++    while (idx--) {
++        if (*b++ == 0) {
++            ++z;
++            if (z >= 2 && *b == 3) {  // A 0x03 after >= 2 zeros is stepped over without using up idx
++                ++b;                  // (presumably an emulation prevention byte - TODO confirm)
++                z = 0;
++            }
++        }
++        else {
++            z = 0;
++        }
++    }
++    return b;  // No length is passed in, so the caller must guarantee the buffer is long enough
++}
++#endif
++
++static void WriteBitstream(dec_env_t * const de, const HEVCContext * const s) {  // Queue slice data + RPI_BF* writes
++    const int rpi_use_emu = OPT_EMU; // FFmpeg removes emulation prevention bytes
++    const int offset = 0; // Always 64-byte aligned in sim, need not be on real hardware
++    const GetBitContext *gb = &s->HEVClc->gb;
++    // ptr/len describe the data starting at (OPT_EMU: just after) the byte gb currently points into
++#if OPT_EMU
++    const uint8_t *ptr = ptr_from_index(de->nal_buffer, gb->index/8 + 1);
++    const int len = de->nal_size - (ptr - de->nal_buffer);
++#else
++    const int len = 1 + gb->size_in_bits/8 - gb->index/8;
++    const void *ptr = &gb->buffer[gb->index/8];
++#endif
++
++#if TRACE_DEV
++    printf("Index=%d, /8=%#x\n", gb->index, gb->index/8);
++    dump_data(de->nal_buffer, 128);
++#endif
++    // The value returned by the RPI_BFBASE write is handed to p1_axi_write along with the data
++    p1_axi_write(de, len, ptr, p1_apb_write(de, RPI_BFBASE, 0)); // BFBASE set later
++    p1_apb_write(de, RPI_BFNUM, len);
++    p1_apb_write(de, RPI_BFCONTROL, offset + (1<<7)); // Stop
++    p1_apb_write(de, RPI_BFCONTROL, offset + (rpi_use_emu<<6));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Wavefront mode
++// Queues the commands for one slice segment; used when entropy_coding_sync_enabled_flag is set
++static void wpp_decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts)
++{
++    const HEVCPPS * const pps = s->ps.pps;
++
++    int i, resetQPY=1;
++    int indep = !s->sh.dependent_slice_segment_flag;  // True for an independent slice segment
++    int ctb_col = s->sh.slice_ctb_addr_rs % de->PicWidthInCtbsY;  // CTB column the segment starts in
++
++    if (ctb_addr_ts)
++        wpp_end_previous_slice(de, s, ctb_addr_ts);
++    pre_slice_decode(de, s);
++    WriteBitstream(de, s);
++    if (ctb_addr_ts==0 || indep || de->PicWidthInCtbsY==1)
++        WriteProb(de, s);
++    else if (ctb_col==0)
++        p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
++    else
++        resetQPY=0;  // Dependent segment starting mid-row: the only case that keeps resetQPY clear
++    program_slicecmds(de, s->slice_idx);
++    new_slice_segment(de, s);
++    wpp_entry_point(de, s, indep, resetQPY, ctb_addr_ts);
++    for (i=0; i<s->sh.num_entry_point_offsets; i++) {  // One iteration per further entry point
++        int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++        int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
++        int last_x = de->PicWidthInCtbsY-1;
++        if (de->PicWidthInCtbsY>2)
++            wpp_pause(de, ctb_row);
++        p1_apb_write(de, RPI_STATUS, (ctb_row<<18) + (last_x<<5) + 2);
++        if (de->PicWidthInCtbsY==2)
++            p1_apb_write(de, RPI_TRANSFER, PROB_BACKUP);
++        if (de->PicWidthInCtbsY==1)
++            WriteProb(de, s);
++        else
++            p1_apb_write(de, RPI_TRANSFER, PROB_RELOAD);
++        ctb_addr_ts += pps->column_width[0];  // Presumably one full CTB row (single tile column) - TODO confirm
++        wpp_entry_point(de, s, 0, 1, ctb_addr_ts);
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Tiles mode
++// Queues the commands for one slice segment; used when entropy_coding_sync_enabled_flag is clear
++static void decode_slice(dec_env_t * const de, const HEVCContext * const s, int ctb_addr_ts) {
++    const HEVCPPS * const pps = s->ps.pps;
++    int i, resetQPY;
++
++    if (ctb_addr_ts) end_previous_slice(de, s, ctb_addr_ts);
++    pre_slice_decode(de, s);
++    WriteBitstream(de, s);
++    resetQPY = ctb_addr_ts==0  // Set at picture start, on a change of tile, or for an independent segment
++            || pps->tile_id[ctb_addr_ts]!=pps->tile_id[ctb_addr_ts-1]
++            || !s->sh.dependent_slice_segment_flag;
++    if (resetQPY) WriteProb(de, s);
++    program_slicecmds(de, s->slice_idx);
++    new_slice_segment(de, s);
++    new_entry_point(de, s, !s->sh.dependent_slice_segment_flag, resetQPY, ctb_addr_ts);
++    for (i=0; i<s->sh.num_entry_point_offsets; i++) {  // One iteration per further entry point
++        int ctb_addr_rs = pps->ctb_addr_ts_to_rs[ctb_addr_ts];
++        int ctb_col = ctb_addr_rs % de->PicWidthInCtbsY;
++        int ctb_row = ctb_addr_rs / de->PicWidthInCtbsY;
++        int tile_x = ctb_to_tile (ctb_col, pps->col_bd, pps->num_tile_columns);
++        int tile_y = ctb_to_tile (ctb_row, pps->row_bd, pps->num_tile_rows);
++        int last_x = pps->col_bd[tile_x+1]-1;  // Last CTB column of the current tile
++        int last_y = pps->row_bd[tile_y+1]-1;  // Last CTB row of the current tile
++        p1_apb_write(de, RPI_STATUS, 2 + (last_x<<5) + (last_y<<18));
++        WriteProb(de, s);
++        ctb_addr_ts += pps->column_width[tile_x] * pps->row_height[tile_y];  // Step over every CTB of this tile
++        new_entry_point(de, s, 0, 1, ctb_addr_ts);
++    }
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Skip 1 bit, byte-align gb, init the CABAC decoder on what is left; returns ff_init_cabac_decoder()'s result
++static int cabac_start_align(HEVCContext *s)
++{
++    GetBitContext *gb = &s->HEVClc->gb;
++    skip_bits(gb, 1);
++    align_get_bits(gb);
++    // Should look at getting rid of this
++    return ff_init_cabac_decoder(&s->HEVClc->cc,
++                          gb->buffer + get_bits_count(gb) / 8,
++                          (get_bits_left(gb) + 7) / 8);
++}
++// decode_slice hwaccel callback: queues the commands for one slice of the NAL in buffer.  Returns 0 or -1
++static int rpi_hevc_decode_slice(
++    AVCodecContext *avctx,
++    const uint8_t *buffer,
++    uint32_t size)
++{
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    HEVCContext * const s = avctx->priv_data;
++    dec_env_t * const de = dec_env_get(avctx, rpi);
++    const HEVCPPS *pps = s->ps.pps;
++    int ctb_addr_ts = pps->ctb_addr_rs_to_ts[s->sh.slice_ctb_addr_rs];
++
++#if TRACE_ENTRY
++    printf("<<< %s[%p]\n", __func__, de);
++#endif
++    if (de == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Cannot find find context for thread\n", __func__);
++        return -1;
++    }
++
++    if (de->state != RPIVID_DECODE_START && de->state != RPIVID_DECODE_SLICE) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Unexpected state: %d\n", __func__, de->state);
++        dec_env_release(rpi, de);  // Balance dec_env_get(), as every other exit holding a valid de does
++        return -1;
++    }
++    de->state = RPIVID_DECODE_SLICE;
++    de->nal_buffer = buffer;
++    de->nal_size   = size;
++
++#if !OPT_EMU
++//    ff_hevc_cabac_init(s, ctb_addr_ts);
++    cabac_start_align(s);
++#endif
++    if (s->ps.sps->scaling_list_enable_flag)
++        populate_scaling_factors(de, s);
++    pps->entropy_coding_sync_enabled_flag? wpp_decode_slice(de, s, ctb_addr_ts)
++                                             : decode_slice(de, s, ctb_addr_ts);
++#if TRACE_ENTRY
++    printf(">>> %s[%p]\n", __func__, de);
++#endif
++    dec_env_release(rpi, de);
++    return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Resolves frame with ZC_RESOLVE_WAIT_VALID, logging on failure; returns the resolve result (0 = OK)
++static int rpivid_retrieve_data(void *logctx, AVFrame *frame)
++{
++    int rv;
++    if ((rv = av_rpi_zc_resolve_frame(frame, ZC_RESOLVE_WAIT_VALID)) != 0)
++        av_log(logctx, AV_LOG_ERROR, "Unable to resolve output frame\n");
++    return rv;
++}
++// alloc_frame hwaccel callback: gets a ZC buffer for frame and sets rpivid_retrieve_data as its post_process
++static int rpivid_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++    HEVCContext * const s = avctx->priv_data;
++    // Frame buffering + 1 output.  Would need thread_count extra but we now
++    // alloc at the start of phase 2 so that is the only thread we need the
++    // extra buffer for.
++    const unsigned int pool_req = s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering + 1;
++    int rv;
++
++    if (av_rpi_zc_in_use(avctx))
++    {
++        const AVZcEnvPtr zc = avctx->opaque;  // ZC env is taken from avctx->opaque in this case
++        av_rpi_zc_set_decoder_pool_size(zc, pool_req);
++        rv = av_rpi_zc_get_buffer(zc, frame);   // get_buffer2 would alloc
++    }
++    else
++    {
++        if (rpi->zc == NULL) {  // Unlocked first check; re-checked under the lock below
++            pthread_mutex_lock(&rpi->phase_lock); // Abuse - not worth creating a lock just for this
++            // Alloc inside lock to make sure we only ever alloc one
++            if (rpi->zc == NULL) {
++                rpi->zc = av_rpi_zc_int_env_alloc(s);
++            }
++            pthread_mutex_unlock(&rpi->phase_lock);
++        }
++        // NOTE(review): the next call runs before the rpi->zc NULL check - confirm it tolerates NULL
++        av_rpi_zc_set_decoder_pool_size(rpi->zc, pool_req); // Ignored by local allocator, but set anyway :-)
++        rv = (rpi->zc == NULL) ? AVERROR(ENOMEM) :
++            av_rpi_zc_get_buffer(rpi->zc, frame);
++    }
++
++    if (rv == 0 &&
++        (rv = ff_attach_decode_data(frame)) < 0)
++    {
++        av_frame_unref(frame);  // Attach failed: drop the buffer we just obtained
++    }
++
++    if (rv == 0)
++    {
++        FrameDecodeData *fdd = (FrameDecodeData*)frame->private_ref->data;
++        fdd->post_process = rpivid_retrieve_data;
++    }
++
++    return rv;
++}
++
++#if OPT_PHASE_TIMING
++static void log_bin_phase(AVCodecContext * const avctx, const unsigned int * const bins)  // Logs bins[0..8] on one INFO line
++{
++    av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d %7d\n",
++           bins[0],  bins[1], bins[2], bins[3],
++           bins[4],  bins[5], bins[6], bins[7], bins[8]);
++}
++#endif
++
++//////////////////////////////////////////////////////////////////////////////
++// Uninit: waits up to 2s on ref_zero, then frees everything.  Returns -1 (nothing freed) on timeout, else 0
++static int rpi_hevc_free(AVCodecContext *avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++
++#if TRACE_ENTRY
++    printf("<<< %s\n", __func__);
++#endif
++
++    dec_env_release(rpi, NULL);
++
++    // Wait for everything else to stop
++    {
++        struct timespec tt;
++        clock_gettime(CLOCK_REALTIME, &tt);
++        tt.tv_sec += 2;
++        while (sem_timedwait(&rpi->ref_zero, &tt) == -1) {
++            const int err = errno;
++            if (err == ETIMEDOUT) {
++                av_log(avctx, AV_LOG_FATAL, "Rpivid worker threads still running\n");
++                return -1;
++            }
++            if (err != EINTR) {
++                av_log(avctx, AV_LOG_ERROR, "Unexpected error %d waiting for work thread to stop\n", err);
++                break;
++            }
++        }
++    }
++
++#if OPT_PHASE_TIMING
++    {
++        unsigned int i;
++        for (i = 0; i != RPIVID_PHASES; ++i) {
++            const phase_wait_env_t * const p = rpi->phase_reqs + i;
++            av_log(avctx, AV_LOG_INFO, "Phase %u: In %3u.%06u, Out %3u.%06u\n", i,
++                   (unsigned int)(p->time_in_phase / 1000000), (unsigned int)(p->time_in_phase % 1000000),
++                   (unsigned int)(p->time_out_phase / 1000000), (unsigned int)(p->time_out_phase % 1000000));
++            av_log(avctx, AV_LOG_INFO, "%7d %7d %7d %7d %7d %7d %7d %7d        >\n",
++                   time_thresholds[0], time_thresholds[1], time_thresholds[2], time_thresholds[3],
++                   time_thresholds[4], time_thresholds[5], time_thresholds[6], time_thresholds[7]);
++            log_bin_phase(avctx, p->time_bins);
++            log_bin_phase(avctx, p->time_bins3);
++            log_bin_phase(avctx, p->time_bins5);
++            av_log(avctx, AV_LOG_INFO, "Longest duraction: %ums @ frame %u\n",
++                   (unsigned int)(p->max_phase_time / 1000),
++                   p->max_time_decode_order);
++        }
++        av_log(avctx, AV_LOG_INFO, "PU max=%d\n", rpi->max_pu_msgs);
++    }
++#endif
++
++    if (rpi->dec_envs != NULL)
++    {
++        for (int i = 0; i < avctx->thread_count && rpi->dec_envs[i] != NULL; ++i) {  // Stops at the first unused slot
++            dec_env_delete(rpi->dec_envs[i]);
++        }
++        av_freep(&rpi->dec_envs);
++    }
++
++    av_rpi_zc_int_env_freep(&rpi->zc);
++
++    gpu_free(&rpi->gcolbuf);
++
++    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
++        gpu_free(rpi->gbitbufs + i);
++    }
++    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
++        gpu_free(rpi->gcoeffbufs + i);
++    }
++
++    unmap_devp(&rpi->regs, REGS_SIZE);
++    unmap_devp(&rpi->ints, INTS_SIZE);
++
++    if (rpi->gpu_init_type > 0)
++        rpi_mem_gpu_uninit();
++
++    if (rpi->mbox_fd >= 0) {
++        mbox_release_clock(rpi->mbox_fd);
++        mbox_close(rpi->mbox_fd);
++    }
++
++    sem_destroy(&rpi->ref_zero);
++    sem_destroy(&rpi->coeffbuf_sem);
++    sem_destroy(&rpi->bitbuf_sem);
++
++#if TRACE_ENTRY
++    printf(">>> %s\n", __func__);
++#endif
++    return 0;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// Init: returns 0, AVERROR(ENOTSUP) if > 4096x4096, or AVERROR_EXTERNAL (after rpi_hevc_free) on any failure
++static int rpi_hevc_init(AVCodecContext *avctx) {
++    RPI_T * const rpi = avctx->internal->hwaccel_priv_data;
++//    const char *err;
++
++#if TRACE_ENTRY
++    printf("<<< %s\n", __func__);
++#endif
++
++    if (avctx->width>4096 || avctx->height>4096) {
++        av_log(avctx, AV_LOG_FATAL, "Picture size %dx%d exceeds 4096x4096 maximum for HWAccel\n", avctx->width, avctx->height);
++        return AVERROR(ENOTSUP);
++    }
++
++    memset(rpi, 0, sizeof(*rpi));
++
++    rpi->mbox_fd = -1;
++    rpi->decode_order = 0;
++
++    // Initial PU/COEFF stream buffer split chosen as worst case seen so far
++    rpi->max_pu_msgs = 768; // 7.2 says at most 1611 messages per CTU
++
++
++    atomic_store(&rpi->ref_count, 1);
++    sem_init(&rpi->ref_zero, 0, 0);
++
++    sem_init(&rpi->bitbuf_sem,   0, RPIVID_BITBUFS);
++    sem_init(&rpi->coeffbuf_sem, 0, RPIVID_COEFFBUFS);
++
++    pthread_mutex_init(&rpi->phase_lock, NULL);
++
++    if ((rpi->mbox_fd = mbox_open()) < 0)
++    {
++        av_log(avctx, AV_LOG_ERROR, "Failed to open mailbox\n");
++        goto fail;
++    }
++    mbox_request_clock(rpi->mbox_fd);
++
++    if ((rpi->regs = map_dev(avctx, REGS_NAME, REGS_SIZE)) == NULL ||
++        (rpi->ints = map_dev(avctx, INTS_NAME, INTS_SIZE)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to open rpivid devices\n");
++        goto fail;
++    }
++
++    if ((rpi->gpu_init_type = rpi_mem_gpu_init(0)) < 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to init GPU\n");
++        goto fail;
++    }
++
++    if ((rpi->dec_envs = av_mallocz(sizeof(dec_env_t *) * avctx->thread_count)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to alloc %d dec envs\n", avctx->thread_count);
++        goto fail;
++    }
++
++    rpi->col_stride = rnd64(avctx->width);
++    rpi->col_picsize = rpi->col_stride * (((avctx->height + 63) & ~63) >> 4);
++    if (gpu_malloc_uncached(rpi->col_picsize * RPIVID_COL_PICS, &rpi->gcolbuf) != 0)
++    {
++        av_log(avctx, AV_LOG_ERROR, "Failed to allocate col mv buffer\n");
++        goto fail;
++    }
++
++    for (unsigned int i = 0; i != RPIVID_BITBUFS; ++i) {
++        if (gpu_malloc_uncached(RPIVID_BITBUF_SIZE, rpi->gbitbufs + i) != 0)
++        {
++            av_log(avctx, AV_LOG_ERROR, "Failed to allocate bitbuf %d\n", i);
++            goto fail;
++        }
++    }
++
++    for (unsigned int i = 0; i != RPIVID_COEFFBUFS; ++i) {
++        if (gpu_malloc_uncached(RPIVID_COEFFBUF_SIZE, rpi->gcoeffbufs + i) != 0)
++        {
++            av_log(avctx, AV_LOG_ERROR, "Failed to allocate coeffbuf %d\n", i);
++            goto fail;
++        }
++    }
++
++    av_log(avctx, AV_LOG_INFO, "RPI HEVC h/w accel init OK\n");
++
++    return 0;
++
++fail:
++    rpi_hevc_free(avctx);
++    return AVERROR_EXTERNAL;
++}
++
++//////////////////////////////////////////////////////////////////////////////
++// HEVC hwaccel for AV_PIX_FMT_RPI4_8; uses the same callbacks as ff_hevc_rpi4_10_hwaccel
++const AVHWAccel ff_hevc_rpi4_8_hwaccel = {
++    .name           = "hevc_rpi4_8",
++    .type           = AVMEDIA_TYPE_VIDEO,
++    .id             = AV_CODEC_ID_HEVC,
++    .pix_fmt        = AV_PIX_FMT_RPI4_8,
++    .alloc_frame    = rpivid_hevc_alloc_frame,
++    .start_frame    = rpi_hevc_start_frame,
++    .end_frame      = rpi_hevc_end_frame,
++    .abort_frame    = rpi_hevc_abort_frame,
++    .decode_slice   = rpi_hevc_decode_slice,
++    .init           = rpi_hevc_init,
++    .uninit         = rpi_hevc_free,
++    .priv_data_size = sizeof(RPI_T),
++    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++// HEVC hwaccel for AV_PIX_FMT_RPI4_10; uses the same callbacks as ff_hevc_rpi4_8_hwaccel
++const AVHWAccel ff_hevc_rpi4_10_hwaccel = {
++    .name           = "hevc_rpi4_10",
++    .type           = AVMEDIA_TYPE_VIDEO,
++    .id             = AV_CODEC_ID_HEVC,
++    .pix_fmt        = AV_PIX_FMT_RPI4_10,
++    .alloc_frame    = rpivid_hevc_alloc_frame,
++    .start_frame    = rpi_hevc_start_frame,
++    .end_frame      = rpi_hevc_end_frame,
++    .abort_frame    = rpi_hevc_abort_frame,
++    .decode_slice   = rpi_hevc_decode_slice,
++    .init           = rpi_hevc_init,
++    .uninit         = rpi_hevc_free,
++    .priv_data_size = sizeof(RPI_T),
++    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
++
+diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
+index 4b2679eb38..8d80d19788 100644
+--- a/libavcodec/v4l2_buffers.c
++++ b/libavcodec/v4l2_buffers.c
+@@ -21,6 +21,7 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#include <drm_fourcc.h>
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <sys/mman.h>
+@@ -29,57 +30,88 @@
+ #include <poll.h>
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
++#include "libavutil/avassert.h"
+ #include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext.h"
+ #include "v4l2_context.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_m2m.h"
++#include "v4l2_req_dmabufs.h"
++#include "weak_link.h"
+ 
+ #define USEC_PER_SEC 1000000
+-static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
++static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
+ 
+-static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
+ {
+-    return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
+-        container_of(buf->context, V4L2m2mContext, output) :
+-        container_of(buf->context, V4L2m2mContext, capture);
++    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
++        container_of(ctx, V4L2m2mContext, output) :
++        container_of(ctx, V4L2m2mContext, capture);
+ }
+ 
+-static inline AVCodecContext *logger(V4L2Buffer *buf)
++static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
+ {
+-    return buf_to_m2mctx(buf)->avctx;
++    return ctx_to_m2mctx(buf->context);
+ }
+ 
+-static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
++static inline AVCodecContext *logger(const V4L2Buffer * const buf)
+ {
+-    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++    return buf_to_m2mctx(buf)->avctx;
++}
+ 
+-    if (s->avctx->pkt_timebase.num)
+-        return s->avctx->pkt_timebase;
+-    return s->avctx->time_base;
++static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
++{
++    const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++    const AVRational tb = s->avctx->pkt_timebase.num ?
++        s->avctx->pkt_timebase :
++        s->avctx->time_base;
++    return tb.num && tb.den ? tb : v4l2_timebase;
+ }
+ 
+-static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
++static inline struct timeval tv_from_int(const int64_t t)
+ {
+-    int64_t v4l2_pts;
++    return (struct timeval){
++        .tv_usec = t % USEC_PER_SEC,
++        .tv_sec  = t / USEC_PER_SEC
++    };
++}
+ 
+-    if (pts == AV_NOPTS_VALUE)
+-        pts = 0;
++static inline int64_t int_from_tv(const struct timeval t)
++{   /* collapse a timeval into one 64-bit microsecond count */
++    return t.tv_usec + USEC_PER_SEC * (int64_t)t.tv_sec;
++}
+ 
++static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
++{
+     /* convert pts to v4l2 timebase */
+-    v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
+-    out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
+-    out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
++    const int64_t v4l2_pts =
++        pts == AV_NOPTS_VALUE ? 0 :
++            av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
++    out->buf.timestamp = tv_from_int(v4l2_pts);
+ }
+ 
+-static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
++static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
+ {
+-    int64_t v4l2_pts;
+-
++    const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
++    return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
++#if 0
+     /* convert pts back to encoder timebase */
+-    v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
+-                        avbuf->buf.timestamp.tv_usec;
++    return
++        avbuf->context->no_pts_rescale ? v4l2_pts :
++        v4l2_pts == 0 ? AV_NOPTS_VALUE :
++            av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++#endif
++}
+ 
+-    return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
++static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
++{   /* single-planar sizes live in buf itself, multiplanar ones in planes[plane] */
++    if (!V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++        out->buf.length = length;
++        out->buf.bytesused = bytesused;
++    } else {
++        out->planes[plane].length = length;
++        out->planes[plane].bytesused = bytesused;
++    }
+ }
+ 
+ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
+@@ -116,6 +148,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
+     return AVCOL_PRI_UNSPECIFIED;
+ }
+ 
++/* Translate libav colour primaries/space/transfer into the V4L2 fields of buf's context format */
++static void v4l2_set_color(V4L2Buffer *buf, const enum AVColorPrimaries avcp,
++                           const enum AVColorSpace avcs,
++                           const enum AVColorTransferCharacteristic avxc)
++{
++    enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
++    enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
++    enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
++
++    switch (avcp) {     /* primaries give a first guess at cs (and sometimes ycbcr) */
++    case AVCOL_PRI_BT709:
++        cs = V4L2_COLORSPACE_REC709;
++        ycbcr = V4L2_YCBCR_ENC_709;
++        break;
++    case AVCOL_PRI_BT470M:
++        cs = V4L2_COLORSPACE_470_SYSTEM_M;
++        ycbcr = V4L2_YCBCR_ENC_601;
++        break;
++    case AVCOL_PRI_BT470BG:
++        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++        break;
++    case AVCOL_PRI_SMPTE170M:
++        cs = V4L2_COLORSPACE_SMPTE170M;
++        break;
++    case AVCOL_PRI_SMPTE240M:
++        cs = V4L2_COLORSPACE_SMPTE240M;
++        break;
++    case AVCOL_PRI_BT2020:
++        cs = V4L2_COLORSPACE_BT2020;
++        break;
++    case AVCOL_PRI_SMPTE428:
++    case AVCOL_PRI_SMPTE431:
++    case AVCOL_PRI_SMPTE432:
++    case AVCOL_PRI_EBU3213:
++    case AVCOL_PRI_RESERVED:
++    case AVCOL_PRI_FILM:
++    case AVCOL_PRI_UNSPECIFIED:
++    default:
++        break;
++    }
++
++    switch (avcs) {     /* a recognised colour space then overrides that guess */
++    case AVCOL_SPC_RGB:
++        cs = V4L2_COLORSPACE_SRGB;
++        break;
++    case AVCOL_SPC_BT709:
++        cs = V4L2_COLORSPACE_REC709;
++        break;
++    case AVCOL_SPC_FCC:
++        cs = V4L2_COLORSPACE_470_SYSTEM_M;
++        break;
++    case AVCOL_SPC_BT470BG:
++        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++        break;
++    case AVCOL_SPC_SMPTE170M:
++        cs = V4L2_COLORSPACE_SMPTE170M;
++        break;
++    case AVCOL_SPC_SMPTE240M:
++        cs = V4L2_COLORSPACE_SMPTE240M;
++        break;
++    case AVCOL_SPC_BT2020_CL:
++        cs = V4L2_COLORSPACE_BT2020;
++        ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
++        break;
++    case AVCOL_SPC_BT2020_NCL:
++        cs = V4L2_COLORSPACE_BT2020;
++        break;
++    default:
++        break;
++    }
++
++    switch (avxc) {     /* must test the caller's transfer characteristic, not the V4L2 default held in xfer */
++    case AVCOL_TRC_BT709:
++        xfer = V4L2_XFER_FUNC_709;
++        break;
++    case AVCOL_TRC_IEC61966_2_1:
++        xfer = V4L2_XFER_FUNC_SRGB;
++        break;
++    case AVCOL_TRC_SMPTE240M:
++        xfer = V4L2_XFER_FUNC_SMPTE240M;
++        break;
++    case AVCOL_TRC_SMPTE2084:
++        xfer = V4L2_XFER_FUNC_SMPTE2084;
++        break;
++    default:
++        break;
++    }
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
++        buf->context->format.fmt.pix_mp.colorspace = cs;
++        buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
++        buf->context->format.fmt.pix_mp.xfer_func = xfer;
++    } else {
++        buf->context->format.fmt.pix.colorspace = cs;
++        buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
++        buf->context->format.fmt.pix.xfer_func = xfer;
++    }
++}
++
+ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
+ {
+     enum v4l2_quantization qt;
+@@ -134,6 +265,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
+      return AVCOL_RANGE_UNSPECIFIED;
+ }
+ 
++static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
++{   /* quantization is written to the context's format; anything but MPEG/JPEG range maps to DEFAULT */
++    enum v4l2_quantization quant = V4L2_QUANTIZATION_DEFAULT;
++
++    if (avcr == AVCOL_RANGE_MPEG)
++        quant = V4L2_QUANTIZATION_LIM_RANGE;
++    else if (avcr == AVCOL_RANGE_JPEG)
++        quant = V4L2_QUANTIZATION_FULL_RANGE;
++    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type))
++        buf->context->format.fmt.pix_mp.quantization = quant;
++    else
++        buf->context->format.fmt.pix.quantization = quant;
++}
++
+ static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
+ {
+     enum v4l2_ycbcr_encoding ycbcr;
+@@ -210,73 +355,178 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
+     return AVCOL_TRC_UNSPECIFIED;
+ }
+ 
+-static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
+ {
+-    V4L2Buffer* avbuf = opaque;
+-    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
++    return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
++}
+ 
+-    if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
+-        atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
++static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
++{   /* non-zero only when the field is exactly V4L2_FIELD_INTERLACED_TB */
++    return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
++}
+ 
+-        if (s->reinit) {
+-            if (!atomic_load(&s->refcount))
+-                sem_post(&s->refsync);
+-        } else {
+-            if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
+-                /* no need to queue more buffers to the driver */
+-                avbuf->status = V4L2BUF_AVAILABLE;
+-            }
+-            else if (avbuf->context->streamon)
+-                ff_v4l2_buffer_enqueue(avbuf);
+-        }
++static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
++{   /* progressive -> FIELD_NONE; interlaced -> INTERLACED_TB or _BT according to is_tff */
++    const int tb_or_bt = is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
++    buf->buf.field = is_interlaced ? tb_or_bt : V4L2_FIELD_NONE;
++}
+ 
+-        av_buffer_unref(&avbuf->context_ref);
++static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
++{   /* (re)build the layer part of avbuf->drm_frame from the plane layout; returns it as a byte pointer */
++    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
++    AVDRMLayerDescriptor *layer;
++
++    /* fill the DRM frame descriptor */
++    drm_desc->nb_objects = avbuf->num_planes;
++    drm_desc->nb_layers = 1;
++
++    layer = &drm_desc->layers[0];
++    layer->nb_planes = avbuf->num_planes;
++
++    for (int i = 0; i < avbuf->num_planes; i++) {   /* default: DRM plane i maps to object i */
++        layer->planes[i].object_index = i;
++        layer->planes[i].offset = avbuf->plane_info[i].offset;
++        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
+     }
++
++    switch (avbuf->context->av_pix_fmt) {
++    case AV_PIX_FMT_YUYV422:
++        /* packed format: always reported as a single plane */
++        layer->format = DRM_FORMAT_YUYV;
++        layer->nb_planes = 1;
++
++        break;
++
++    case AV_PIX_FMT_NV12:
++    case AV_PIX_FMT_NV21:
++
++        layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
++            DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
++        /* more than one V4L2 plane: the loop above already described them */
++        if (avbuf->num_planes > 1)
++            break;
++        /* one V4L2 plane: the chroma plane follows the luma rows inside object 0 */
++        layer->nb_planes = 2;
++        /* NOTE(review): reads fmt.pix.height without checking for a multiplanar type - confirm intended */
++        layer->planes[1].object_index = 0;
++        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++            avbuf->context->format.fmt.pix.height;
++        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
++        break;
++
++    case AV_PIX_FMT_YUV420P:
++
++        layer->format = DRM_FORMAT_YUV420;
++        /* more than one V4L2 plane: the loop above already described them */
++        if (avbuf->num_planes > 1)
++            break;
++        /* one V4L2 plane: both chroma planes follow luma in object 0 at half the luma pitch */
++        layer->nb_planes = 3;
++
++        layer->planes[1].object_index = 0;
++        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
++            avbuf->context->format.fmt.pix.height;
++        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
++        /* second chroma plane starts a quarter of the luma size after the first */
++        layer->planes[2].object_index = 0;
++        layer->planes[2].offset = layer->planes[1].offset +
++            ((avbuf->plane_info[0].bytesperline *
++              avbuf->context->format.fmt.pix.height) >> 2);
++        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
++        break;
++
++    default:    /* any other pixel format: report a descriptor with no layers */
++        drm_desc->nb_layers = 0;
++        break;
++    }
++
++    return (uint8_t *) drm_desc;
+ }
+ 
+-static int v4l2_buf_increase_ref(V4L2Buffer *in)
++static void v4l2_free_bufref(void *opaque, uint8_t *data)
+ {
+-    V4L2m2mContext *s = buf_to_m2mctx(in);
++    AVBufferRef * bufref = (AVBufferRef *)data;
++    V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
++    struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
+ 
+-    if (in->context_ref)
+-        atomic_fetch_add(&in->context_refcount, 1);
+-    else {
+-        in->context_ref = av_buffer_ref(s->self_ref);
+-        if (!in->context_ref)
+-            return AVERROR(ENOMEM);
++    if (ctx != NULL) {
++        // Buffer still attached to context
++        V4L2m2mContext *s = buf_to_m2mctx(avbuf);
+ 
+-        in->context_refcount = 1;
+-    }
++        ff_mutex_lock(&ctx->lock);
+ 
+-    in->status = V4L2BUF_RET_USER;
+-    atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
++        ff_v4l2_buffer_set_avail(avbuf);
+ 
+-    return 0;
++        if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
++            /* no need to queue more buffers to the driver */
++        }
++        else if (ctx->streamon) {
++            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
++            avbuf->buf.timestamp.tv_sec = 0;
++            avbuf->buf.timestamp.tv_usec = 0;
++            ff_v4l2_buffer_enqueue(avbuf);  // will set to IN_DRIVER
++        }
++        else {
++            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
++        }
++
++        ff_mutex_unlock(&ctx->lock);
++    }
++
++    ff_weak_link_unlock(avbuf->context_wl);
++    av_buffer_unref(&bufref);
+ }
+ 
+-static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
++static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i)
+ {
+-    int ret;
++    return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? b->m.planes[i].length : b->length;
++}
+ 
+-    if (plane >= in->num_planes)
+-        return AVERROR(EINVAL);
++static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
++{
++    int i, ret;
++    const V4L2m2mContext * const s = buf_to_m2mctx(avbuf);
+ 
+-    /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
+-    *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
+-                            in->plane_info[plane].length, v4l2_free_buffer, in, 0);
+-    if (!*buf)
+-        return AVERROR(ENOMEM);
++    for (i = 0; i < avbuf->num_planes; i++) {
++        int dma_fd = -1;
++        const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i);
++
++        if (s->db_ctl != NULL) {
++            if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL)
++                return AVERROR(ENOMEM);
++            dma_fd = dmabuf_fd(avbuf->dmabuf[i]);
++            if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type))
++                avbuf->buf.m.planes[i].m.fd = dma_fd;
++            else
++                avbuf->buf.m.fd = dma_fd;
++        }
++        else {
++            struct v4l2_exportbuffer expbuf;
++            memset(&expbuf, 0, sizeof(expbuf));
++
++            expbuf.index = avbuf->buf.index;
++            expbuf.type = avbuf->buf.type;
++            expbuf.plane = i;
++
++            ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf);
++            if (ret < 0)
++                return AVERROR(errno);
++            dma_fd = expbuf.fd;
++        }
+ 
+-    ret = v4l2_buf_increase_ref(in);
+-    if (ret)
+-        av_buffer_unref(buf);
++        avbuf->drm_frame.objects[i].size = blen;
++        avbuf->drm_frame.objects[i].fd = dma_fd;
++        avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
++    }
+ 
+-    return ret;
++    return 0;
+ }
+ 
+ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
+ {
+     unsigned int bytesused, length;
++    int rv = 0;
+ 
+     if (plane >= out->num_planes)
+         return AVERROR(EINVAL);
+@@ -284,32 +534,57 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i
+     length = out->plane_info[plane].length;
+     bytesused = FFMIN(size+offset, length);
+ 
+-    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
+-
+-    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
+-        out->planes[plane].bytesused = bytesused;
+-        out->planes[plane].length = length;
+-    } else {
+-        out->buf.bytesused = bytesused;
+-        out->buf.length = length;
++    if (size > length - offset) {
++        size = length - offset;
++        rv = AVERROR(ENOMEM);
+     }
+ 
+-    return 0;
++    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
++
++    set_buf_length(out, plane, bytesused, length);
++
++    return rv;
++}
++
++/* Wrap avbuf's context-held bufref in a new AVBufferRef released via v4l2_free_bufref; NULL on failure */
++static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
++{
++    AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
++    AVBufferRef * newbuf;
++
++    if (!bufref)
++        return NULL;
++    newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
++    if (newbuf == NULL)
++        av_buffer_unref(&bufref);           // creation failed: drop the ref taken above
++    else
++        avbuf->status = V4L2BUF_RET_USER;   // only mark as with the user once a wrapper exists
++    return newbuf;
+ }
+ 
+ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+-    int i, ret;
++    int i;
+ 
+     frame->format = avbuf->context->av_pix_fmt;
+ 
+-    for (i = 0; i < avbuf->num_planes; i++) {
+-        ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
+-        if (ret)
+-            return ret;
++    frame->buf[0] = wrap_avbuf(avbuf);
++    if (frame->buf[0] == NULL)
++        return AVERROR(ENOMEM);
+ 
++    if (buf_to_m2mctx(avbuf)->output_drm) {
++        /* 1. get references to the actual data */
++        frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
++        frame->format = AV_PIX_FMT_DRM_PRIME;
++        frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
++        return 0;
++    }
++
++
++    /* 1. get references to the actual data */
++    for (i = 0; i < avbuf->num_planes; i++) {
++        frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
+         frame->linesize[i] = avbuf->plane_info[i].bytesperline;
+-        frame->data[i] = frame->buf[i]->data;
+     }
+ 
+     /* fixup special cases */
+@@ -318,17 +593,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+     case AV_PIX_FMT_NV21:
+         if (avbuf->num_planes > 1)
+             break;
+-        frame->linesize[1] = avbuf->plane_info[0].bytesperline;
+-        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
++        frame->linesize[1] = frame->linesize[0];
++        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
+         break;
+ 
+     case AV_PIX_FMT_YUV420P:
+         if (avbuf->num_planes > 1)
+             break;
+-        frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
+-        frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
+-        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
+-        frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
++        frame->linesize[1] = frame->linesize[0] / 2;
++        frame->linesize[2] = frame->linesize[1];
++        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
++        frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
+         break;
+ 
+     default:
+@@ -338,68 +613,127 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
+     return 0;
+ }
+ 
++/* Copy h rows of w bytes between two strided 2D byte arrays */
++static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
++{
++    int row;
++
++    // Equal strides with at most 32 bytes of row padding: one bulk copy, padding included
++    if (dst_stride == src_stride && w + 32 >= dst_stride) {
++        memcpy(dst, src, dst_stride * h);
++        return;
++    }
++    for (row = 0; row < h; ++row, dst += dst_stride, src += src_stride)
++        memcpy(dst, src, w);
++}
++
++static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
++{   /* chroma = neither plane 0 nor the last plane of a format flagged as having alpha */
++    return i != 0 && (i != num_planes - 1 || (desc->flags & AV_PIX_FMT_FLAG_ALPHA) == 0);
++}
++
++/* Attach the single dmabuf of a DRM_PRIME frame to out (V4L2_MEMORY_DMABUF); returns 0 or an AVERROR */
++static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++{
++    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++
++    if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
++        return AVERROR(EINVAL);
++
++    av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
++        // Only currently cope with single buffer types
++        if (out->buf.length != 1)
++            return AVERROR_PATCHWELCOME;
++        if (src->nb_objects != 1)
++            return AVERROR(EINVAL);
++        out->planes[0].m.fd = src->objects[0].fd;
++    }
++    else {
++        if (src->nb_objects != 1)
++            return AVERROR(EINVAL);
++
++        out->buf.m.fd      = src->objects[0].fd;
++    }
++
++    // No need to copy src AVDescriptor and if we did then we may confuse
++    // fd close on free
++    out->ref_buf = av_buffer_ref(frame->buf[0]);
++    // Fail rather than report success while holding no reference on the source frame
++    return out->ref_buf ? 0 : AVERROR(ENOMEM);
++}
++
+ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+ {
+-    int i, ret;
+-    struct v4l2_format fmt = out->context->format;
+-    int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+-                       fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
+-    int height       = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
+-                       fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
+-    int is_planar_format = 0;
+-
+-    switch (pixel_format) {
+-    case V4L2_PIX_FMT_YUV420M:
+-    case V4L2_PIX_FMT_YVU420M:
+-#ifdef V4L2_PIX_FMT_YUV422M
+-    case V4L2_PIX_FMT_YUV422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU422M
+-    case V4L2_PIX_FMT_YVU422M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YUV444M
+-    case V4L2_PIX_FMT_YUV444M:
+-#endif
+-#ifdef V4L2_PIX_FMT_YVU444M
+-    case V4L2_PIX_FMT_YVU444M:
+-#endif
+-    case V4L2_PIX_FMT_NV12M:
+-    case V4L2_PIX_FMT_NV21M:
+-    case V4L2_PIX_FMT_NV12MT_16X16:
+-    case V4L2_PIX_FMT_NV12MT:
+-    case V4L2_PIX_FMT_NV16M:
+-    case V4L2_PIX_FMT_NV61M:
+-        is_planar_format = 1;
+-    }
+-
+-    if (!is_planar_format) {
+-        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
+-        int planes_nb = 0;
+-        int offset = 0;
+-
+-        for (i = 0; i < desc->nb_components; i++)
+-            planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
+-
+-        for (i = 0; i < planes_nb; i++) {
+-            int size, h = height;
+-            if (i == 1 || i == 2) {
++    int i;
++    int num_planes = 0;
++    int pel_strides[4] = {0};
++
++    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
++
++    if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
++        av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
++        return -1;
++    }
++
++    for (i = 0; i != desc->nb_components; ++i) {
++        if (desc->comp[i].plane >= num_planes)
++            num_planes = desc->comp[i].plane + 1;
++        pel_strides[desc->comp[i].plane] = desc->comp[i].step;
++    }
++
++    if (out->num_planes > 1) {
++        if (num_planes != out->num_planes) {
++            av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
++            return -1;
++        }
++        for (i = 0; i != num_planes; ++i) {
++            int w = frame->width;
++            int h = frame->height;
++            if (is_chroma(desc, i, num_planes)) {
++                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
+                 h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
+             }
+-            size = frame->linesize[i] * h;
+-            ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset);
+-            if (ret)
+-                return ret;
+-            offset += size;
++
++            cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
++                   frame->data[i], frame->linesize[i],
++                   w * pel_strides[i], h);
++            set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
+         }
+-        return 0;
+     }
++    else
++    {
++        unsigned int offset = 0;
++
++        for (i = 0; i != num_planes; ++i) {
++            int w = frame->width;
++            int h = frame->height;
++            int dst_stride = out->plane_info[0].bytesperline;
++            uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
++
++            if (is_chroma(desc, i, num_planes)) {
++                // Is chroma
++                dst_stride >>= desc->log2_chroma_w;
++                offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
++                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
++                h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
++            }
++            else {
++                // Is luma or alpha
++                offset += dst_stride * out->context->height;
++            }
++            if (offset > out->plane_info[0].length) {
++                av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
++                return -1;
++            }
+ 
+-    for (i = 0; i < out->num_planes; i++) {
+-        ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0);
+-        if (ret)
+-            return ret;
++            cpy_2d(dst, dst_stride,
++                   frame->data[i], frame->linesize[i],
++                   w * pel_strides[i], h);
++        }
++        set_buf_length(out, 0, offset, out->plane_info[0].length);
+     }
+-
+     return 0;
+ }
+ 
+@@ -409,16 +743,31 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
+  *
+  ******************************************************************************/
+ 
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
+ {
+-    v4l2_set_pts(out, frame->pts);
+-
+-    return v4l2_buffer_swframe_to_buf(frame, out);
++    out->buf.flags = frame->key_frame ?
++        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
++    // Beware that colour info is held in format rather than the actual
++    // v4l2 buffer struct so this may not be as useful as you might hope
++    v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
++    v4l2_set_color_range(out, frame->color_range);
++    // PTS & interlace are buffer vars
++    if (track_ts)
++        out->buf.timestamp = tv_from_int(track_ts);
++    else
++        v4l2_set_pts(out, frame->pts);
++    v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
++
++    return frame->format == AV_PIX_FMT_DRM_PRIME ?
++        v4l2_buffer_primeframe_to_buf(frame, out) :
++        v4l2_buffer_swframe_to_buf(frame, out);
+ }
+ 
+ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+ {
+     int ret;
++    V4L2Context * const ctx = avbuf->context;
+ 
+     av_frame_unref(frame);
+ 
+@@ -429,17 +778,32 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+ 
+     /* 2. get frame information */
+     frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
++    frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
++        (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
++        (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
++            AV_PICTURE_TYPE_NONE;
+     frame->color_primaries = v4l2_get_color_primaries(avbuf);
+     frame->colorspace = v4l2_get_color_space(avbuf);
+     frame->color_range = v4l2_get_color_range(avbuf);
+     frame->color_trc = v4l2_get_color_trc(avbuf);
+     frame->pts = v4l2_get_pts(avbuf);
+     frame->pkt_dts = AV_NOPTS_VALUE;
++    frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
++    frame->top_field_first = v4l2_buf_is_top_first(avbuf);
+ 
+     /* these values are updated also during re-init in v4l2_process_driver_event */
+-    frame->height = avbuf->context->height;
+-    frame->width = avbuf->context->width;
+-    frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
++    frame->height = ctx->height;
++    frame->width = ctx->width;
++    frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
++
++    if (ctx->selection.height && ctx->selection.width) {
++        frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
++        frame->crop_top  = ctx->selection.top < frame->height ? ctx->selection.top  : 0;
++        frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
++            frame->width - (ctx->selection.left + ctx->selection.width) : 0;
++        frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
++            frame->height - (ctx->selection.top + ctx->selection.height) : 0;
++    }
+ 
+     /* 3. report errors upstream */
+     if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
+@@ -452,15 +816,15 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
+ 
+ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
+ {
+-    int ret;
+-
+     av_packet_unref(pkt);
+-    ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
+-    if (ret)
+-        return ret;
++
++    pkt->buf = wrap_avbuf(avbuf);
++    if (pkt->buf == NULL)
++        return AVERROR(ENOMEM);
+ 
+     pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
+-    pkt->data = pkt->buf->data;
++    pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
++    pkt->flags = 0;
+ 
+     if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
+         pkt->flags |= AV_PKT_FLAG_KEY;
+@@ -475,39 +839,107 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
+     return 0;
+ }
+ 
+-int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++                                    const void *extdata, size_t extlen,
++                                    const int64_t timestamp)
+ {
+     int ret;
+ 
+-    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0);
+-    if (ret)
++    if (extlen) {
++        ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
++        if (ret)
++            return ret;
++    }
++
++    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
++    if (ret && ret != AVERROR(ENOMEM))
+         return ret;
+ 
+-    v4l2_set_pts(out, pkt->pts);
++    if (timestamp)
++        out->buf.timestamp = tv_from_int(timestamp);
++    else
++        v4l2_set_pts(out, pkt->pts);
+ 
+-    if (pkt->flags & AV_PKT_FLAG_KEY)
+-        out->flags = V4L2_BUF_FLAG_KEYFRAME;
++    out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
++        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
++        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
+ 
+-    return 0;
++    return ret;
++}
++
++int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
++{   /* plain packet copy: no extra data prepended, no explicit timestamp passed */
++    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
+ }
+ 
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
++
++/* AVBuffer free callback: tears down the V4L2Buffer passed as data (opaque is unused) */
++static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
++{
++    V4L2Buffer * const avbuf = (V4L2Buffer *)data;
++    int i;
++
++    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
++        struct V4L2Plane_info * const pi = &avbuf->plane_info[i];
++        if (pi->mm_addr != NULL)
++            munmap(pi->mm_addr, pi->length);
++    }
++
++    if (avbuf->dmabuf[0] != NULL) {
++        // dmabuf-backed buffer: release every dmabuf slot
++        for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i)
++            dmabuf_free(avbuf->dmabuf[i]);
++    }
++    else {
++        // no dmabufs: close whichever object fds were set (-1 means unused)
++        for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
++            if (avbuf->drm_frame.objects[i].fd != -1)
++                close(avbuf->drm_frame.objects[i].fd);
++        }
++    }
++
++    av_buffer_unref(&avbuf->ref_buf);
++    ff_weak_link_unref(&avbuf->context_wl);
++    av_free(avbuf);
++}
++
++// Create the V4L2Buffer for slot 'index' of ctx, returned wrapped in *pbufref; 0 or -ve AVERROR
++int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
+ {
+-    V4L2Context *ctx = avbuf->context;
+     int ret, i;
++    V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
++    AVBufferRef * bufref;
++    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+ 
+-    avbuf->buf.memory = V4L2_MEMORY_MMAP;
++    *pbufref = NULL;  // Caller never sees a half-built buffer
++    if (avbuf == NULL)
++        return AVERROR(ENOMEM);
++
++    bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
++    if (bufref == NULL) {
++        av_free(avbuf);
++        return AVERROR(ENOMEM);
++    }
++    // bufref now owns avbuf: every later failure must unref bufref, not av_free(avbuf)
++    avbuf->context = ctx;
++    avbuf->buf.memory = mem;
+     avbuf->buf.type = ctx->type;
+     avbuf->buf.index = index;
+ 
++    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
++        avbuf->drm_frame.objects[i].fd = -1;
++    }
++
++    avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
++
+     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+         avbuf->buf.length = VIDEO_MAX_PLANES;
+         avbuf->buf.m.planes = avbuf->planes;
+     }
+ 
+-    ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
++    ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf) < 0 ? AVERROR(errno) : 0;
+     if (ret < 0)
+-        return AVERROR(errno);
++        goto fail;  // ret already holds AVERROR(errno), not the raw -1 from ioctl
+ 
+     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+         avbuf->num_planes = 0;
+@@ -520,6 +952,8 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+         avbuf->num_planes = 1;
+ 
+     for (i = 0; i < avbuf->num_planes; i++) {
++        const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
++            (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
+ 
+         avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+             ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
+@@ -527,25 +961,31 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+ 
+         if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+             avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
+-            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
+-                                           PROT_READ | PROT_WRITE, MAP_SHARED,
+-                                           buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
++            avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset;
++
++            if (want_mmap)
++                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
++                                               PROT_READ | PROT_WRITE, MAP_SHARED,
++                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
+         } else {
+             avbuf->plane_info[i].length = avbuf->buf.length;
+-            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
+-                                          PROT_READ | PROT_WRITE, MAP_SHARED,
+-                                          buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
++            avbuf->plane_info[i].offset = 0;
++
++            if (want_mmap)
++                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
++                                               PROT_READ | PROT_WRITE, MAP_SHARED,
++                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
+         }
+ 
+-        if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
+-            return AVERROR(ENOMEM);
++        if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
++            avbuf->plane_info[i].mm_addr = NULL;
++            ret = AVERROR(ENOMEM);
++            goto fail;
++        }
+     }
+ 
+     avbuf->status = V4L2BUF_AVAILABLE;
+ 
+-    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+-        return 0;
+-
+     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+         avbuf->buf.m.planes = avbuf->planes;
+         avbuf->buf.length   = avbuf->num_planes;
+@@ -555,20 +995,53 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
+         avbuf->buf.length    = avbuf->planes[0].length;
+     }
+ 
+-    return ff_v4l2_buffer_enqueue(avbuf);
++    if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
++        if (s->output_drm) {
++            ret = v4l2_buffer_export_drm(avbuf);
++            if (ret) {
++                av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n");
++                goto fail;
++            }
++        }
++    }
++
++    *pbufref = bufref;
++    return 0;
++
++fail:
++    av_buffer_unref(&bufref);
++    return ret;
+ }
+ 
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
+ {
+     int ret;
++    int qc;  // q_count value after this buffer has been counted (logging only)
+ 
+-    avbuf->buf.flags = avbuf->flags;
++    if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
++        av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
++               avbuf->context->name, avbuf->buf.index,
++               (long)avbuf->buf.timestamp.tv_sec, (long)avbuf->buf.timestamp.tv_usec,
++               atomic_load(&avbuf->context->q_count));
++    }
+ 
+     ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
+-    if (ret < 0)
+-        return AVERROR(errno);
++    if (ret < 0) {
++        int err = errno;  // Save before av_log() has a chance to change it
++        av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
++               avbuf->context->name, avbuf->buf.index,
++               err, strerror(err));
++        return AVERROR(err);
++    }
+ 
++    // Lock not wanted - if called from buffer free then lock already obtained
++    qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
+     avbuf->status = V4L2BUF_IN_DRIVER;
++    pthread_cond_broadcast(&avbuf->context->cond);
++
++    av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
++           avbuf->context->name, avbuf->buf.index,  // timeval fields cast to match %ld on every ABI
++           (long)avbuf->buf.timestamp.tv_sec, (long)avbuf->buf.timestamp.tv_usec, qc);
+ 
+     return 0;
+ }
+diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
+index 8dbc7fc104..0bda4dd06b 100644
+--- a/libavcodec/v4l2_buffers.h
++++ b/libavcodec/v4l2_buffers.h
+@@ -27,29 +27,44 @@
+ #include <stdatomic.h>
+ #include <linux/videodev2.h>
+ 
++#include "libavutil/hwcontext_drm.h"
+ #include "avcodec.h"
+ 
+ enum V4L2Buffer_status {
+     V4L2BUF_AVAILABLE,
+     V4L2BUF_IN_DRIVER,
++    V4L2BUF_IN_USE,
+     V4L2BUF_RET_USER,
+ };
+ 
+ /**
+  * V4L2Buffer (wrapper for v4l2_buffer management)
+  */
++struct V4L2Context;
++struct ff_weak_link_client;
++struct dmabuf_h;
++
+ typedef struct V4L2Buffer {
+-    /* each buffer needs to have a reference to its context */
++    /* each buffer needs to have a reference to its context
++     * The pointer is good enough for most operations but once the buffer has
++     * been passed to the user the buffer may become orphaned so for free ops
++     * the weak link must be used to ensure that the context is actually
++     * there
++     */
+     struct V4L2Context *context;
++    struct ff_weak_link_client *context_wl;
+ 
+-    /* This object is refcounted per-plane, so we need to keep track
+-     * of how many context-refs we are holding. */
+-    AVBufferRef *context_ref;
+-    atomic_uint context_refcount;
++    /* DRM descriptor */
++    AVDRMFrameDescriptor drm_frame;
++    /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
++     * are done
++     */
++    AVBufferRef * ref_buf;
+ 
+     /* keep track of the mmap address and mmap length */
+     struct V4L2Plane_info {
+-        int bytesperline;
++        size_t bytesperline;
++        size_t offset;
+         void * mm_addr;
+         size_t length;
+     } plane_info[VIDEO_MAX_PLANES];
+@@ -60,9 +75,9 @@ typedef struct V4L2Buffer {
+     struct v4l2_buffer buf;
+     struct v4l2_plane planes[VIDEO_MAX_PLANES];
+ 
+-    int flags;
+     enum V4L2Buffer_status status;
+ 
++    struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here
+ } V4L2Buffer;
+ 
+ /**
+@@ -98,6 +113,10 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
+  */
+ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+ 
++int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
++                                    const void *extdata, size_t extlen,
++                                    const int64_t timestamp);
++
+ /**
+  * Extracts the data from an AVFrame to a V4L2Buffer
+  *
+@@ -106,7 +125,7 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
+  *
+  * @returns 0 in case of success, a negative AVERROR code otherwise
+  */
+-int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
++int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
+ 
+ /**
+  * Initializes a V4L2Buffer
+@@ -116,7 +135,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
+  *
+  * @returns 0 in case of success, a negative AVERROR code otherwise
+  */
+-int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
++int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
+ 
+ /**
+  * Enqueues a V4L2Buffer
+@@ -127,5 +146,12 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
+  */
+ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
+ 
++// Flag avbuf as V4L2BUF_AVAILABLE and release any source buffer ref (ref_buf) it holds
++static inline void ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
++{
++    avbuf->status = V4L2BUF_AVAILABLE;
++    av_buffer_unref(&avbuf->ref_buf);
++}
++
+ 
+ #endif // AVCODEC_V4L2_BUFFERS_H
+diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
+index ff1ea8e57b..65b2648557 100644
+--- a/libavcodec/v4l2_context.c
++++ b/libavcodec/v4l2_context.c
+@@ -27,11 +27,13 @@
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <poll.h>
++#include "libavutil/avassert.h"
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
+ #include "v4l2_buffers.h"
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
++#include "weak_link.h"
+ 
+ struct v4l2_format_update {
+     uint32_t v4l2_fmt;
+@@ -41,26 +43,168 @@ struct v4l2_format_update {
+     int update_avfmt;
+ };
+ 
+-static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
++
++static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
+ {
+-    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
+-        container_of(ctx, V4L2m2mContext, output) :
+-        container_of(ctx, V4L2m2mContext, capture);
++    return (int64_t)n;
+ }
+ 
+-static inline AVCodecContext *logger(V4L2Context *ctx)
++static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
+ {
+-    return ctx_to_m2mctx(ctx)->avctx;
++    return (unsigned int)pts;
+ }
+ 
+-static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
++// FFmpeg requires us to propagate a number of vars from the coded pkt into
++// the decoded frame. The only thing that tracks like that in V4L2 stateful
++// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
++// guarantees about PTS being unique or specified for every frame so replace
++// the supplied PTS with a simple incrementing number and keep a circular
++// buffer of all the things we want preserved (including the original PTS)
++// indexed by the tracking no.
++static int64_t
++xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
+ {
+-    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++    int64_t track_pts;
++
++    // Avoid 0
++    if (++x->track_no == 0)
++        x->track_no = 1;
++
++    track_pts = track_to_pts(avctx, x->track_no);
++
++    av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
++    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++        .discard          = 0,
++        .pending          = 1,
++        .pkt_size         = avpkt->size,
++        .pts              = avpkt->pts,
++        .dts              = avpkt->dts,
++        .reordered_opaque = avctx->reordered_opaque,
++        .pkt_pos          = avpkt->pos,
++        .pkt_duration     = avpkt->duration,
++        .track_pts        = track_pts
++    };
++    return track_pts;
++}
++
++static int64_t
++xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
++{
++    int64_t track_pts;
++
++    // Avoid 0
++    if (++x->track_no == 0)
++        x->track_no = 1;
++
++    track_pts = track_to_pts(avctx, x->track_no);
++
++    av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
++    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
++        .discard          = 0,
++        .pending          = 1,
++        .pkt_size         = 0,
++        .pts              = frame->pts,
++        .dts              = AV_NOPTS_VALUE,
++        .reordered_opaque = frame->reordered_opaque,
++        .pkt_pos          = frame->pkt_pos,
++        .pkt_duration     = frame->pkt_duration,
++        .track_pts        = track_pts
++    };
++    return track_pts;
+ }
+ 
+-static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
++
++// Returns -1 if we should discard the frame
++static int
++xlat_pts_frame_out(AVCodecContext *const avctx,
++             xlat_track_t * const x,
++             AVFrame *const frame)
+ {
+-    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
++    V4L2m2mTrackEl *const t = x->track_els + n;
++    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
++    {
++        av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
++               "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++        frame->pts              = AV_NOPTS_VALUE;
++        frame->pkt_dts          = AV_NOPTS_VALUE;
++        frame->reordered_opaque = x->last_opaque;
++        frame->pkt_pos          = -1;
++        frame->pkt_duration     = 0;
++        frame->pkt_size         = -1;
++    }
++    else if (!t->discard)
++    {
++        frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
++        frame->pkt_dts          = t->dts;
++        frame->reordered_opaque = t->reordered_opaque;
++        frame->pkt_pos          = t->pkt_pos;
++        frame->pkt_duration     = t->pkt_duration;
++        frame->pkt_size         = t->pkt_size;
++
++        x->last_opaque = x->track_els[n].reordered_opaque;
++        if (frame->pts != AV_NOPTS_VALUE)
++            x->last_pts = frame->pts;
++        t->pending = 0;
++    }
++    else
++    {
++        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
++        return -1;
++    }
++
++    av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++           frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
++    return 0;
++}
++
++// Map an output packet's tracking pts back to the caller's pts; -1 => discard the packet
++static int
++xlat_pts_pkt_out(AVCodecContext *const avctx, xlat_track_t * const x, AVPacket *const pkt)
++{
++    const unsigned int slot = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
++    V4L2m2mTrackEl *const el = &x->track_els[slot];
++    const int no_pts = (pkt->pts == AV_NOPTS_VALUE);
++
++    if (no_pts || pkt->pts != el->track_pts) {
++        // Slot does not hold this packet's tracking pts - nothing to restore
++        av_log(avctx, no_pts ? AV_LOG_DEBUG : AV_LOG_WARNING,
++               "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, slot, el->track_pts);
++        pkt->pts = AV_NOPTS_VALUE;
++    }
++    else if (el->discard) {
++        // Tracked entry is marked discard: tell the caller to drop the packet
++        av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, slot, el->track_pts);
++        return -1;
++    }
++    else {
++        // Original pts is handed back only while the entry is still pending
++        pkt->pts = el->pending ? el->pts : AV_NOPTS_VALUE;
++        el->pending = 0;
++
++        x->last_opaque = el->reordered_opaque;
++        if (pkt->pts != AV_NOPTS_VALUE)
++            x->last_pts = pkt->pts;
++    }
++
++    // * Would like something much better than this...xlat(offset + out_count)?
++    pkt->dts = pkt->pts;
++    av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
++           pkt->pts, el->track_pts, slot);
++    return 0;
++}
++
++
++static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
++{
++    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
++        container_of(ctx, V4L2m2mContext, output) :
++        container_of(ctx, V4L2m2mContext, capture);
++}
++
++static inline AVCodecContext *logger(const V4L2Context *ctx)
++{
++    return ctx_to_m2mctx(ctx)->avctx;
+ }
+ 
+ static AVRational v4l2_get_sar(V4L2Context *ctx)
+@@ -81,21 +225,29 @@ static AVRational v4l2_get_sar(V4L2Context *ctx)
+     return sar;
+ }
+ 
+-static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
++static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
++{
++    return ctx->bufrefs ? 1 : 0;  // 1 once ctx has a bufrefs array, else 0
++}
++
++// Width/Height changed or we don't have an alloc in the first place?
++static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
+ {
+-    struct v4l2_format *fmt1 = &ctx->format;
+-    int ret =  V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
+-        fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
+-        fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
+-        :
+-        fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
+-        fmt1->fmt.pix.height != fmt2->fmt.pix.height;
++    const struct v4l2_format *fmt1 = &ctx->format;
++    int ret = !ctx_buffers_alloced(ctx) ||
++        (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
++            fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
++            fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
++            :
++            fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
++            fmt1->fmt.pix.height != fmt2->fmt.pix.height);
+ 
+     if (ret)
+-        av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
++        av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
+             ctx->name,
+-            v4l2_get_width(fmt1), v4l2_get_height(fmt1),
+-            v4l2_get_width(fmt2), v4l2_get_height(fmt2));
++            ctx_buffers_alloced(ctx),
++            ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
++            ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
+ 
+     return ret;
+ }
+@@ -153,90 +305,110 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd
+     }
+ }
+ 
+-/**
+- * handle resolution change event and end of stream event
+- * returns 1 if reinit was successful, negative if it failed
+- * returns 0 if reinit was not executed
+- */
+-static int v4l2_handle_event(V4L2Context *ctx)
++static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
+ {
+-    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
+-    struct v4l2_format cap_fmt = s->capture.format;
+-    struct v4l2_format out_fmt = s->output.format;
+-    struct v4l2_event evt = { 0 };
+-    int full_reinit, reinit, ret;
++    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++    struct v4l2_selection selection = {
++        .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
++        .target = V4L2_SEL_TGT_COMPOSE
++    };
+ 
+-    ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
+-    if (ret < 0) {
+-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
+-        return 0;
+-    }
++    memset(r, 0, sizeof(*r));
++    if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
++        return AVERROR(errno);
+ 
+-    if (evt.type == V4L2_EVENT_EOS) {
+-        ctx->done = 1;
+-        return 0;
+-    }
++    *r = selection.r;
++    return 0;
++}
+ 
+-    if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
+-        return 0;
++static int do_source_change(V4L2m2mContext * const s)
++{
++    AVCodecContext *const avctx = s->avctx;
+ 
+-    ret = ioctl(s->fd, VIDIOC_G_FMT, &out_fmt);
+-    if (ret) {
+-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->output.name);
+-        return 0;
+-    }
++    int ret;
++    int reinit;
++    struct v4l2_format cap_fmt = s->capture.format;
++
++    s->capture.done = 0;
+ 
+     ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
+     if (ret) {
+-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
++        av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
+         return 0;
+     }
+ 
+-    full_reinit = v4l2_resolution_changed(&s->output, &out_fmt);
+-    if (full_reinit) {
+-        s->output.height = v4l2_get_height(&out_fmt);
+-        s->output.width = v4l2_get_width(&out_fmt);
+-        s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
+-    }
++    get_default_selection(&s->capture, &s->capture.selection);
++
++    reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
++    if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
++        reinit = 1;
+ 
+-    reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
++    s->capture.format = cap_fmt;
+     if (reinit) {
+-        s->capture.height = v4l2_get_height(&cap_fmt);
+-        s->capture.width = v4l2_get_width(&cap_fmt);
+-        s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
++        s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
++        s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
+     }
+ 
+-    if (full_reinit || reinit)
+-        s->reinit = 1;
+-
+-    if (full_reinit) {
+-        ret = ff_v4l2_m2m_codec_full_reinit(s);
+-        if (ret) {
+-            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_full_reinit\n");
+-            return AVERROR(EINVAL);
+-        }
+-        goto reinit_run;
++    // If we don't support selection (or it is bust) and we obviously have HD then kludge
++    if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
++        (s->capture.height == 1088 && s->capture.width == 1920)) {
++        s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
+     }
+ 
++    s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
++
++    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
++           s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
++           s->capture.width, s->capture.height,
++           s->capture.selection.width, s->capture.selection.height,
++           s->capture.selection.left, s->capture.selection.top, reinit);
++
+     if (reinit) {
+-        if (s->avctx)
+-            ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
++        if (avctx)
++            ret = ff_set_dimensions(s->avctx,
++                                    s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
++                                    s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
+         if (ret < 0)
+-            av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
++            av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
+ 
+         ret = ff_v4l2_m2m_codec_reinit(s);
+         if (ret) {
+-            av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
++            av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
+             return AVERROR(EINVAL);
+         }
++
++        if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
++            s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
++            av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
++                   s->capture.width, s->capture.height,
++                   ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
++            return AVERROR(EINVAL);
++        }
++
++        // Update pixel format - should only actually do something on initial change
++        s->capture.av_pix_fmt =
++            ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
++        if (s->output_drm) {
++            avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++            avctx->sw_pix_fmt = s->capture.av_pix_fmt;
++        }
++        else
++            avctx->pix_fmt = s->capture.av_pix_fmt;
++
+         goto reinit_run;
+     }
+ 
+-    /* dummy event received */
+-    return 0;
++    /* Buffers are OK so just stream off to ack */
++    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
++
++    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++    if (ret)
++        av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
++    s->draining = 0;
+ 
+     /* reinit executed */
+ reinit_run:
++    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
+     return 1;
+ }
+ 
+@@ -280,171 +452,293 @@ static int v4l2_stop_encode(V4L2Context *ctx)
+     return 0;
+ }
+ 
+-static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
++// DQ a buffer
++// Amalgamates all the various ways there are of signalling EOS/Event to
++// generate a consistent EPIPE.
++//
++// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
++//
++// Returns:
++//  0               Success
++//  AVERROR(EPIPE)  Nothing more to read
++//  AVERROR(ENOSPC) No buffers in Q to put result in
++//  *               AVERROR(..)
++
++ static int
++dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
+ {
+-    struct v4l2_plane planes[VIDEO_MAX_PLANES];
+-    struct v4l2_buffer buf = { 0 };
+-    V4L2Buffer *avbuf;
+-    struct pollfd pfd = {
+-        .events =  POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
+-        .fd = ctx_to_m2mctx(ctx)->fd,
++    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++    AVCodecContext * const avctx = m->avctx;
++    V4L2Buffer * avbuf;
++    const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
++
++    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++
++    struct v4l2_buffer buf = {
++        .type = ctx->type,
++        .memory = V4L2_MEMORY_MMAP,
+     };
+-    int i, ret;
+ 
+-    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
+-        for (i = 0; i < ctx->num_buffers; i++) {
+-            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+-                break;
+-        }
+-        if (i == ctx->num_buffers)
+-            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
+-                                                "userspace. Increase num_capture_buffers "
+-                                                "to prevent device deadlock or dropped "
+-                                                "packets/frames.\n");
+-    }
+-
+-    /* if we are draining and there are no more capture buffers queued in the driver we are done */
+-    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
+-        for (i = 0; i < ctx->num_buffers; i++) {
+-            /* capture buffer initialization happens during decode hence
+-             * detection happens at runtime
+-             */
+-            if (!ctx->buffers)
+-                break;
+-
+-            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
+-                goto start;
++    *ppavbuf = NULL;
++
++    if (ctx->flag_last)
++        return AVERROR(EPIPE);
++
++    if (is_mp) {
++        buf.length = VIDEO_MAX_PLANES;
++        buf.m.planes = planes;
++    }
++
++    while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
++        const int err = errno;
++        av_assert0(AVERROR(err) < 0);
++        if (err != EINTR) {
++            av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
++                ctx->name, av_err2str(AVERROR(err)));
++
++            if (err == EPIPE)
++                ctx->flag_last = 1;
++
++            return AVERROR(err);
+         }
+-        ctx->done = 1;
+-        return NULL;
+     }
++    atomic_fetch_sub(&ctx->q_count, 1);
++
++    avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
++    ff_v4l2_buffer_set_avail(avbuf);
++    avbuf->buf = buf;
++    if (is_mp) {
++        memcpy(avbuf->planes, planes, sizeof(planes));
++        avbuf->buf.m.planes = avbuf->planes;
++    }
++    // Done with any attached buffer
++    av_buffer_unref(&avbuf->ref_buf);
+ 
+-start:
+-    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
+-        pfd.events =  POLLOUT | POLLWRNORM;
+-    else {
+-        /* no need to listen to requests for more input while draining */
+-        if (ctx_to_m2mctx(ctx)->draining)
+-            pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
++    if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
++        // Zero length cap buffer return == EOS
++        if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
++            av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
++
++            // Must reQ so we don't leak
++            // May not matter if the next thing we do is release all the
++            // buffers but better to be tidy.
++            ff_v4l2_buffer_enqueue(avbuf);
++
++            ctx->flag_last = 1;
++            return AVERROR(EPIPE);
++        }
++
++#ifdef V4L2_BUF_FLAG_LAST
++        // If flag_last set then this contains data but is the last frame
++        // so remember that but return OK
++        if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
++            ctx->flag_last = 1;
++#endif
+     }
+ 
+-    for (;;) {
+-        ret = poll(&pfd, 1, timeout);
+-        if (ret > 0)
+-            break;
+-        if (errno == EINTR)
++    *ppavbuf = avbuf;
++    return 0;
++}
++
++/**
++ * handle resolution change event and end of stream event
++ * Expects to be called after the stream has stopped
++ *
++ * returns 1 if reinit was successful, negative if it failed
++ * returns 0 if reinit was not executed
++ */
++static int
++get_event(V4L2m2mContext * const m)
++{
++    AVCodecContext * const avctx = m->avctx;
++    struct v4l2_event evt = { 0 };
++
++    while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
++        const int rv = AVERROR(errno);
++        if (rv == AVERROR(EINTR))
+             continue;
+-        return NULL;
++        if (rv == AVERROR(EAGAIN)) {
++            av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
++            return AVERROR_EOF;
++        }
++        av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
++        return rv;
+     }
+ 
+-    /* 0. handle errors */
+-    if (pfd.revents & POLLERR) {
+-        /* if we are trying to get free buffers but none have been queued yet
+-           no need to raise a warning */
+-        if (timeout == 0) {
+-            for (i = 0; i < ctx->num_buffers; i++) {
+-                if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
+-                    av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
+-            }
+-        }
+-        else
+-            av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
++    av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
+ 
+-        return NULL;
++    if (evt.type == V4L2_EVENT_EOS) {
++        av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
++        return AVERROR_EOF;
+     }
+ 
+-    /* 1. handle resolution changes */
+-    if (pfd.revents & POLLPRI) {
+-        ret = v4l2_handle_event(ctx);
+-        if (ret < 0) {
+-            /* if re-init failed, abort */
+-            ctx->done = 1;
+-            return NULL;
+-        }
+-        if (ret) {
+-            /* if re-init was successful drop the buffer (if there was one)
+-             * since we had to reconfigure capture (unmap all buffers)
+-             */
+-            return NULL;
++    if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
++        return do_source_change(m);
++
++    return 0;
++}
++
++// Non-zero iff c is streaming (streamon) and its queued-buffer count (q_count) is not 0
++static inline int dq_ok(const V4L2Context * const c)
++{
++    return c->streamon ? atomic_load(&c->q_count) != 0 : 0;
++}
++
++// Get a buffer
++// If output then just gets the buffer in the expected way
++// If capture then runs the capture state m/c to deal with res change etc.
++// If return value == 0 then *ppavbuf != NULL
++
++static int
++get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
++{
++    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
++    AVCodecContext * const avctx = m->avctx;
++    const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
++
++    const unsigned int poll_cap = (POLLIN | POLLRDNORM);
++    const unsigned int poll_out = (POLLOUT | POLLWRNORM);
++    const unsigned int poll_event = POLLPRI;
++
++    *ppavbuf = NULL;
++
++    for (;;) {
++        struct pollfd pfd = {
++            .fd = m->fd,
++            // If capture && stream not started then assume we are waiting for the initial event
++            .events = !is_cap ? poll_out :
++                !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
++                    poll_event,
++        };
++        int ret;
++
++        if (ctx->done) {
++            av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
++            return AVERROR_EOF;
+         }
+-    }
+ 
+-    /* 2. dequeue the buffer */
+-    if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
++        // If capture && timeout == -1 then also wait for rx buffer free
++        if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining)
++            pfd.events |= poll_out;
+ 
+-        if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+-            /* there is a capture buffer ready */
+-            if (pfd.revents & (POLLIN | POLLRDNORM))
+-                goto dequeue;
++        // If nothing Qed all we will get is POLLERR - avoid that
++        if ((pfd.events == poll_out && !dq_ok(&m->output)) ||
++            (pfd.events == poll_cap && !dq_ok(&m->capture)) ||
++            (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) {
++            av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
++            return AVERROR(ENOSPC);
++        }
+ 
+-            /* the driver is ready to accept more input; instead of waiting for the capture
+-             * buffer to complete we return NULL so input can proceed (we are single threaded)
+-             */
+-            if (pfd.revents & (POLLOUT | POLLWRNORM))
+-                return NULL;
++        // Timeout kludged s.t. "forever" eventually gives up & produces logging
++        // If waiting for an event when we have seen a last_frame then we expect
++        //   it to be ready already so force a short timeout
++        ret = poll(&pfd, 1,
++                   ff_v4l2_ctx_eos(ctx) ? 10 :
++                   timeout == -1 ? 3000 : timeout);
++        if (ret < 0) {
++            ret = AVERROR(errno);  // Remember errno before logging etc.
++            av_assert0(ret < 0);
+         }
+ 
+-dequeue:
+-        memset(&buf, 0, sizeof(buf));
+-        buf.memory = V4L2_MEMORY_MMAP;
+-        buf.type = ctx->type;
+-        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+-            memset(planes, 0, sizeof(planes));
+-            buf.length = VIDEO_MAX_PLANES;
+-            buf.m.planes = planes;
++        av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
++               ctx->name, ret, timeout, pfd.events, pfd.revents);
++
++        if (ret < 0) {
++            if (ret == AVERROR(EINTR))
++                continue;
++            av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
++            return ret;
+         }
+ 
+-        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
+-        if (ret) {
+-            if (errno != EAGAIN) {
+-                ctx->done = 1;
+-                if (errno != EPIPE)
+-                    av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
+-                        ctx->name, av_err2str(AVERROR(errno)));
++        if (ret == 0) {
++            if (timeout == -1)
++                av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
++            if (ff_v4l2_ctx_eos(ctx)) {
++                av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
++                ret = get_event(m);
++                if (ret < 0) {
++                    ctx->done = 1;
++                    return ret;
++                }
+             }
+-            return NULL;
++            return AVERROR(EAGAIN);
+         }
+ 
+-        if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+-            int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
+-                            buf.m.planes[0].bytesused : buf.bytesused;
+-            if (bytesused == 0) {
++        if ((pfd.revents & POLLERR) != 0) {
++            av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
++            return AVERROR_UNKNOWN;
++        }
++
++        if ((pfd.revents & poll_event) != 0) {
++            ret = get_event(m);
++            if (ret < 0) {
+                 ctx->done = 1;
+-                return NULL;
++                return ret;
+             }
+-#ifdef V4L2_BUF_FLAG_LAST
+-            if (buf.flags & V4L2_BUF_FLAG_LAST)
+-                ctx->done = 1;
+-#endif
++            continue;
+         }
+ 
+-        avbuf = &ctx->buffers[buf.index];
+-        avbuf->status = V4L2BUF_AVAILABLE;
+-        avbuf->buf = buf;
+-        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
+-            memcpy(avbuf->planes, planes, sizeof(planes));
+-            avbuf->buf.m.planes = avbuf->planes;
++        if ((pfd.revents & poll_cap) != 0) {
++            ret = dq_buf(ctx, ppavbuf);
++            if (ret == AVERROR(EPIPE))
++                continue;
++            return ret;
+         }
+-        return avbuf;
++
++        if ((pfd.revents & poll_out) != 0) {
++            if (is_cap)
++                return AVERROR(EAGAIN);
++            return dq_buf(ctx, ppavbuf);
++        }
++
++        av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
++        return AVERROR_UNKNOWN;
+     }
++}
+ 
+-    return NULL;
++// Clear out flags and timestamps that should be set by the user
++// Returns the passed avbuf
++static V4L2Buffer *
++clean_v4l2_buffer(V4L2Buffer * const avbuf)
++{
++    struct v4l2_buffer *const buf = &avbuf->buf;
++
++    buf->flags = 0;
++    buf->field = V4L2_FIELD_ANY;
++    buf->timestamp = (struct timeval){0};
++    buf->timecode = (struct v4l2_timecode){0};
++    buf->sequence = 0;
++
++    return avbuf;
++}
++
++int
++ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1)
++{
++    V4L2Buffer * avbuf;
++    if (timeout1 != 0) {
++        int rv = get_qbuf(ctx, &avbuf, timeout1);
++        if (rv != 0)
++            return rv;
++    }
++    do {
++        get_qbuf(ctx, &avbuf, 0);
++    } while (avbuf);
++    return 0;
+ }
+ 
+ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+ {
+-    int timeout = 0; /* return when no more buffers to dequeue */
+     int i;
+ 
+     /* get back as many output buffers as possible */
+-    if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
+-          do {
+-          } while (v4l2_dequeue_v4l2buf(ctx, timeout));
+-    }
++    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
++        ff_v4l2_dq_all(ctx, 0);
+ 
+     for (i = 0; i < ctx->num_buffers; i++) {
+-        if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
+-            return &ctx->buffers[i];
++        V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
++        if (avbuf->status == V4L2BUF_AVAILABLE)
++            return clean_v4l2_buffer(avbuf);
+     }
+ 
+     return NULL;
+@@ -452,25 +746,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
+ 
+ static int v4l2_release_buffers(V4L2Context* ctx)
+ {
+-    struct v4l2_requestbuffers req = {
+-        .memory = V4L2_MEMORY_MMAP,
+-        .type = ctx->type,
+-        .count = 0, /* 0 -> unmaps buffers from the driver */
+-    };
+-    int i, j;
++    int i;
++    int ret = 0;
++    const int fd = ctx_to_m2mctx(ctx)->fd;
+ 
+-    for (i = 0; i < ctx->num_buffers; i++) {
+-        V4L2Buffer *buffer = &ctx->buffers[i];
++    // Orphan any buffers in the wild
++    ff_weak_link_break(&ctx->wl_master);
++
++    if (ctx->bufrefs) {
++        for (i = 0; i < ctx->num_buffers; i++)
++            av_buffer_unref(ctx->bufrefs + i);
++    }
++
++    if (fd != -1) {
++        struct v4l2_requestbuffers req = {
++            .memory = V4L2_MEMORY_MMAP,
++            .type = ctx->type,
++            .count = 0, /* 0 -> unmap all buffers from the driver */
++        };
+ 
+-        for (j = 0; j < buffer->num_planes; j++) {
+-            struct V4L2Plane_info *p = &buffer->plane_info[j];
+-            if (p->mm_addr && p->length)
+-                if (munmap(p->mm_addr, p->length) < 0)
+-                    av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
++        while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
++            if (errno == EINTR)
++                continue;
++
++            ret = AVERROR(errno);
++
++            av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
++                ctx->name, av_err2str(AVERROR(errno)));
++
++            if (ctx_to_m2mctx(ctx)->output_drm)
++                av_log(logger(ctx), AV_LOG_ERROR,
++                    "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
++                    "for all buffers: \n"
++                    "  1. drmModeRmFB(..)\n"
++                    "  2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
+         }
+     }
++    atomic_store(&ctx->q_count, 0);
+ 
+-    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
++    return ret;
+ }
+ 
+ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
+@@ -499,6 +813,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm
+ 
+ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
+ {
++    V4L2m2mContext* s = ctx_to_m2mctx(ctx);
++    V4L2m2mPriv *priv = s->avctx->priv_data;
+     enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
+     struct v4l2_fmtdesc fdesc;
+     int ret;
+@@ -512,21 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
+             return 0;
+     }
+ 
+-    for (;;) {
++    for (;; ++fdesc.index) {
+         ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc);
+         if (ret)
+             return AVERROR(EINVAL);
+ 
++        if (priv->pix_fmt != AV_PIX_FMT_NONE) {
++            if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt))
++                continue;
++        }
++
+         pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
+         ret = v4l2_try_raw_format(ctx, pixfmt);
+-        if (ret){
+-            fdesc.index++;
+-            continue;
++        if (ret == 0) {
++            *p = pixfmt;
++            return 0;
+         }
+-
+-        *p = pixfmt;
+-
+-        return 0;
+     }
+ 
+     return AVERROR(EINVAL);
+@@ -569,30 +886,99 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
+   *
+   *****************************************************************************/
+ 
++
++static void flush_all_buffers_status(V4L2Context* const ctx)
++{
++    int i;
++
++    if (!ctx->bufrefs)
++        return;
++
++    for (i = 0; i < ctx->num_buffers; ++i) {
++        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++        if (buf->status == V4L2BUF_IN_DRIVER)
++            ff_v4l2_buffer_set_avail(buf);
++    }
++    atomic_store(&ctx->q_count, 0);
++}
++
++static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
++{
++    int i;
++    int rv;
++
++    if (!ctx->bufrefs) {
++        rv = ff_v4l2_context_init(ctx);
++        if (rv) {
++            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
++            return rv;
++        }
++    }
++
++    for (i = 0; i < ctx->num_buffers; ++i) {
++        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
++        if (buf->status == V4L2BUF_AVAILABLE) {
++            rv = ff_v4l2_buffer_enqueue(buf);
++            if (rv < 0)
++                return rv;
++        }
++    }
++    return 0;
++}
++
+ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
+ {
+     int type = ctx->type;
+-    int ret;
++    int ret = 0;
++    AVCodecContext * const avctx = logger(ctx);
+ 
+-    ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
+-    if (ret < 0)
+-        return AVERROR(errno);
++    // Avoid doing anything if there is nothing we can do
++    if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
++        return 0;
+ 
+-    ctx->streamon = (cmd == VIDIOC_STREAMON);
++    ff_mutex_lock(&ctx->lock);
+ 
+-    return 0;
++    if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
++        stuff_all_buffers(avctx, ctx);
++
++    if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
++        const int err = errno;
++        av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
++               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
++        ret = AVERROR(err);
++    }
++    else
++    {
++        if (cmd == VIDIOC_STREAMOFF)
++            flush_all_buffers_status(ctx);
++        else
++            ctx->first_buf = 1;
++
++        ctx->streamon = (cmd == VIDIOC_STREAMON);
++        av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
++               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
++    }
++
++    // Both stream off & on effectively clear flag_last
++    ctx->flag_last = 0;
++
++    ff_mutex_unlock(&ctx->lock);
++
++    return ret;
+ }
+ 
+ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
+ {
+-    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
++    int64_t track_ts;
+     V4L2Buffer* avbuf;
+     int ret;
+ 
+     if (!frame) {
+         ret = v4l2_stop_encode(ctx);
+         if (ret)
+-            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
++            av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
+         s->draining= 1;
+         return 0;
+     }
+@@ -601,23 +987,29 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
+     if (!avbuf)
+         return AVERROR(EAGAIN);
+ 
+-    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
++    track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
++
++    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
+     if (ret)
+         return ret;
+ 
+     return ff_v4l2_buffer_enqueue(avbuf);
+ }
+ 
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
++                                   const void * extdata, size_t extlen)
+ {
+     V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
+     V4L2Buffer* avbuf;
+     int ret;
++    int64_t track_ts;
+ 
+     if (!pkt->size) {
+         ret = v4l2_stop_decode(ctx);
++        // Log but otherwise ignore stop failure
+         if (ret)
+-            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
++            av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
+         s->draining = 1;
+         return 0;
+     }
+@@ -626,8 +1018,13 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
+     if (!avbuf)
+         return AVERROR(EAGAIN);
+ 
+-    ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
+-    if (ret)
++    track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
++
++    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
++    if (ret == AVERROR(ENOMEM))
++        av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
++               __func__, pkt->size, avbuf->planes[0].length);
++    else if (ret)
+         return ret;
+ 
+     return ff_v4l2_buffer_enqueue(avbuf);
+@@ -635,42 +1032,77 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
+ 
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
+ {
++    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
+     V4L2Buffer *avbuf;
++    int rv;
+ 
+-    /*
+-     * timeout=-1 blocks until:
+-     *  1. decoded frame available
+-     *  2. an input buffer is ready to be dequeued
+-     */
+-    avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
+-    if (!avbuf) {
+-        if (ctx->done)
+-            return AVERROR_EOF;
++    do {
++        if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
++            return rv;
++        if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
++            return rv;
++    } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
+ 
+-        return AVERROR(EAGAIN);
+-    }
+-
+-    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
++   return 0;
+ }
+ 
+-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout)
+ {
++    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    AVCodecContext *const avctx = s->avctx;
+     V4L2Buffer *avbuf;
++    int rv;
+ 
+-    /*
+-     * blocks until:
+-     *  1. encoded packet available
+-     *  2. an input buffer ready to be dequeued
+-     */
+-    avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
+-    if (!avbuf) {
+-        if (ctx->done)
+-            return AVERROR_EOF;
++    do {
++        if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
++            return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
++        if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
++            return rv;
++    } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
+ 
+-        return AVERROR(EAGAIN);
++    return 0;
++}
++
++// Return 0 terminated list of drm fourcc video formats for this context
++// NULL if none found or error; the count is stored in *pN when pN != NULL
++// Returned list is malloced (av_realloc) so must be freed by the caller
++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN)
++{
++    unsigned int i;
++    unsigned int n = 0;
++    unsigned int size = 0;
++    uint32_t * e = NULL;
++    if (pN) *pN = 0;  // pN is optional - never dereference it unchecked
++
++    for (i = 0; i < 1024; ++i) {
++        struct v4l2_fmtdesc fdesc = {
++            .index = i,
++            .type = ctx->type
++        };
++
++        if (ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc))
++            return e;
++
++        if (n + 1 >= size) {
++            unsigned int newsize = (size == 0) ? 16 : size * 2;
++            uint32_t * t = av_realloc(e, newsize * sizeof(*t));
++            if (!t)
++                return e;
++            e = t;
++            size = newsize;
++        }
++
++        e[n] = fdesc.pixelformat;
++        e[++n] = 0;
++        if (pN)
++            *pN = n;
+     }
+ 
+-    return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
++    // If we've looped 1024 times we are clearly confused
++    if (pN) *pN = 0;
++    av_free(e);
++    return NULL;
+ }
+ 
+ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
+@@ -702,78 +1134,179 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
+ 
+ int ff_v4l2_context_set_format(V4L2Context* ctx)
+ {
+-    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++    int ret;
++
++    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++    if (ret != 0)
++        return ret;
++
++    // Check returned size against min size and if smaller have another go
++    // Only worry about plane[0] as this is meant to enforce limits for
++    // encoded streams where we might know a bit more about the shape
++    // than the driver
++    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
++        if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
++            return 0;
++        ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
++    }
++    else {
++        if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
++            return 0;
++        ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
++    }
++
++    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
++    return ret;
+ }
+ 
+ void ff_v4l2_context_release(V4L2Context* ctx)
+ {
+     int ret;
+ 
+-    if (!ctx->buffers)
++    if (!ctx->bufrefs)
+         return;
+ 
+     ret = v4l2_release_buffers(ctx);
+     if (ret)
+         av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
+ 
+-    av_freep(&ctx->buffers);
++    av_freep(&ctx->bufrefs);
++    av_buffer_unref(&ctx->frames_ref);
++
++    ff_mutex_destroy(&ctx->lock);
++    pthread_cond_destroy(&ctx->cond);
+ }
+ 
+-int ff_v4l2_context_init(V4L2Context* ctx)
++
++static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
+ {
+-    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
++    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
+     struct v4l2_requestbuffers req;
+-    int ret, i;
+-
+-    if (!v4l2_type_supported(ctx)) {
+-        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
+-        return AVERROR_PATCHWELCOME;
+-    }
++    int ret, i;
+ 
+-    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
+-    if (ret)
+-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
++    av_assert0(ctx->bufrefs == NULL);
+ 
+     memset(&req, 0, sizeof(req));
+-    req.count = ctx->num_buffers;
+-    req.memory = V4L2_MEMORY_MMAP;
++    req.count = req_buffers;
++    req.memory = mem;
+     req.type = ctx->type;
+-    ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
+-    if (ret < 0) {
+-        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
+-        return AVERROR(errno);
++    while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
++        if (errno != EINTR) {
++            ret = AVERROR(errno);
++            av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
++            return ret;
++        }
+     }
+ 
+     ctx->num_buffers = req.count;
+-    ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
+-    if (!ctx->buffers) {
++    ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
++    if (!ctx->bufrefs) {
+         av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
+-        return AVERROR(ENOMEM);
++        ret = AVERROR(ENOMEM);  // ret is 0 here after the successful REQBUFS
++        goto fail_release;
+     }
+ 
+-    for (i = 0; i < req.count; i++) {
+-        ctx->buffers[i].context = ctx;
+-        ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
+-        if (ret < 0) {
++    ctx->wl_master = ff_weak_link_new(ctx);
++    if (!ctx->wl_master) {
++        ret = AVERROR(ENOMEM);
++        goto fail_release;
++    }
++
++    for (i = 0; i < ctx->num_buffers; i++) {
++        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
++        if (ret) {
+             av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
+-            goto error;
++            goto fail_release;
+         }
+     }
+ 
+     av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
+         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
+         req.count,
+-        v4l2_get_width(&ctx->format),
+-        v4l2_get_height(&ctx->format),
++        ff_v4l2_get_format_width(&ctx->format),
++        ff_v4l2_get_format_height(&ctx->format),
+         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
+         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
+ 
+     return 0;
+ 
+-error:
++fail_release:
+     v4l2_release_buffers(ctx);
++    av_freep(&ctx->bufrefs);
++    return ret;
++}
++
++int ff_v4l2_context_init(V4L2Context* ctx)
++{
++    struct v4l2_queryctrl qctrl;
++    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
++    int ret;
++
++    // It is not valid to reinit a context without a previous release
++    av_assert0(ctx->bufrefs == NULL);
++
++    if (!v4l2_type_supported(ctx)) {
++        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
++        return AVERROR_PATCHWELCOME;
++    }
+ 
+-    av_freep(&ctx->buffers);
++    ff_mutex_init(&ctx->lock, NULL);
++    pthread_cond_init(&ctx->cond, NULL);
++    atomic_init(&ctx->q_count, 0);
++
++    if (s->output_drm) {
++        AVHWFramesContext *hwframes;
++
++        ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
++        if (!ctx->frames_ref) {
++            ret = AVERROR(ENOMEM);
++            goto fail_unlock;
++        }
++
++        hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
++        hwframes->format = AV_PIX_FMT_DRM_PRIME;
++        hwframes->sw_format = ctx->av_pix_fmt;
++        hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
++        hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
++        ret = av_hwframe_ctx_init(ctx->frames_ref);
++        if (ret < 0)
++            goto fail_unref_hwframes;
++    }
++
++    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
++        goto fail_unref_hwframes;
++    }
++
++    memset(&qctrl, 0, sizeof(qctrl));
++    qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT;
++    if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) {
++        ret = AVERROR(errno);
++        if (ret != AVERROR(EINVAL)) {
++            av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERYCTRL failed: %s\n", ctx->name, av_err2str(ret));
++            goto fail_unref_hwframes;
++        }
++        // Control unsupported - set default if wanted
++        if (ctx->num_buffers < 2)
++            ctx->num_buffers = 4;
++    }
++    else {
++        if (ctx->num_buffers < 2)
++            ctx->num_buffers = qctrl.minimum + 2;
++        ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum);
++    }
++
++    ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
++    if (ret < 0)
++        goto fail_unref_hwframes;
++    return 0;
+ 
++fail_unref_hwframes:
++    av_buffer_unref(&ctx->frames_ref);
++fail_unlock:
++    ff_mutex_destroy(&ctx->lock);
++    pthread_cond_destroy(&ctx->cond);  // initialised together with the lock above
+     return ret;
+ }
+diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
+index 22a9532444..0c8c020be1 100644
+--- a/libavcodec/v4l2_context.h
++++ b/libavcodec/v4l2_context.h
+@@ -31,6 +31,7 @@
+ #include "libavutil/pixfmt.h"
+ #include "libavutil/frame.h"
+ #include "libavutil/buffer.h"
++#include "libavutil/thread.h"
+ #include "v4l2_buffers.h"
+ 
+ typedef struct V4L2Context {
+@@ -70,28 +71,57 @@ typedef struct V4L2Context {
+      */
+     int width, height;
+     AVRational sample_aspect_ratio;
++    struct v4l2_rect selection;
+ 
+     /**
+-     * Indexed array of V4L2Buffers
++     * If the default size of buffer is less than this then try to
++     * set to this.
+      */
+-    V4L2Buffer *buffers;
++    uint32_t min_buf_size;
++
++    /**
++     * Indexed array of pointers to V4L2Buffers
++     */
++    AVBufferRef **bufrefs;
+ 
+     /**
+      * Readonly after init.
+      */
+     int num_buffers;
+ 
++    /**
++     * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
++     */
++    enum v4l2_memory buf_mem;
++
+     /**
+      * Whether the stream has been started (VIDIOC_STREAMON has been sent).
+      */
+     int streamon;
+ 
++    /* 1st buffer after stream on */
++    int first_buf;
++
+     /**
+      *  Either no more buffers available or an unrecoverable error was notified
+      *  by the V4L2 kernel driver: once set the context has to be exited.
+      */
+     int done;
+ 
++    int flag_last;
++
++    /**
++     * If NZ then when Qing frame/pkt use this rather than the
++     * "real" PTS
++     */
++    uint64_t track_ts;
++
++    AVBufferRef *frames_ref;
++    atomic_int q_count;
++    struct ff_weak_link_master *wl_master;
++
++    AVMutex lock;
++    pthread_cond_t cond;
+ } V4L2Context;
+ 
+ /**
+@@ -119,6 +149,19 @@ int ff_v4l2_context_set_format(V4L2Context* ctx);
+  */
+ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe);
+ 
++/**
++ * Get the list of drm fourcc pixel formats for this context
++ *
++ * @param[in] ctx A pointer to a V4L2Context. See V4L2Context
++ *       description for required variables.
++ * @param[out] pN A pointer to receive the number of formats
++ *       found. May be NULL if not wanted.
++ * @return Pointer to malloced list of zero terminated formats,
++ *         NULL if none or error. As list is malloced it must be
++ *         freed.
++ */
++uint32_t * ff_v4l2_context_enum_drm_formats(V4L2Context *ctx, unsigned int *pN);
++
+ /**
+  * Releases a V4L2Context.
+  *
+@@ -147,7 +190,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd);
+  * @param[inout] pkt The AVPacket to dequeue to.
+  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
+  */
+-int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
++int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout);
+ 
+ /**
+  * Dequeues a buffer from a V4L2Context to an AVFrame.
+@@ -156,7 +199,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
+  * @param[in] ctx The V4L2Context to dequeue from.
+  * @param[inout] f The AVFrame to dequeue to.
+  * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
++ *
+  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
++ *                AVERROR(ENOSPC) if no buffer available to put
++ *                the frame in
+  */
+ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
+ 
+@@ -170,7 +216,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
+  * @param[in] pkt A pointer to an AVPacket.
+  * @return 0 in case of success, a negative error otherwise.
+  */
+-int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
++int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
+ 
+ /**
+  * Enqueues a buffer to a V4L2Context from an AVFrame
+@@ -183,4 +229,28 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
+  */
+ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
+ 
++/**
++ * Dequeue all buffers on this queue
++ *
++ * Used to recycle output buffers
++ *
++ * @param[in] ctx The V4L2Context to dequeue from.
++ * @param[in] timeout1 A timeout on dequeuing the 1st buffer,
++ *       all others have a timeout of zero
++ * @return If timeout1 is non-zero and the first dequeue fails, its
++ *         error code (e.g. AVERROR(EAGAIN)); 0 otherwise.
++ */
++int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1);
++
++/**
++ * Returns the number of buffers currently queued
++ *
++ * @param[in] ctx The V4L2Context to evaluate
++ */
++static inline int
++ff_v4l2_context_q_count(const V4L2Context* const ctx)
++{
++    return atomic_load(&ctx->q_count);
++}
++
+ #endif // AVCODEC_V4L2_CONTEXT_H
+diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
+index cdfd579810..143656e792 100644
+--- a/libavcodec/v4l2_m2m.c
++++ b/libavcodec/v4l2_m2m.c
+@@ -35,6 +35,15 @@
+ #include "v4l2_context.h"
+ #include "v4l2_fmt.h"
+ #include "v4l2_m2m.h"
++#include "v4l2_req_dmabufs.h"
++
++static void
++xlat_init(xlat_track_t * const x)
++{
++    memset(x, 0, sizeof(*x));
++    x->last_pts = AV_NOPTS_VALUE;
++}
++
+ 
+ static inline int v4l2_splane_video(struct v4l2_capability *cap)
+ {
+@@ -68,7 +77,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
+ 
+     s->capture.done = s->output.done = 0;
+     s->capture.name = "capture";
++    s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+     s->output.name = "output";
++    s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
+     atomic_init(&s->refcount, 0);
+     sem_init(&s->refsync, 0, 0);
+ 
+@@ -85,18 +96,58 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
+     if (v4l2_mplane_video(&cap)) {
+         s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
+         s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++        s->output.format.type = s->output.type;
+         return 0;
+     }
+ 
+     if (v4l2_splane_video(&cap)) {
+         s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+         s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++        s->output.format.type = s->output.type;
+         return 0;
+     }
+ 
+     return AVERROR(EINVAL);
+ }
+ 
++// Ask the driver (VIDIOC_TRY_FMT on the output queue) whether it can take
++// frames of avctx->width x avctx->height. Returns 0 if so, else -ve AVERROR
++static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++    struct v4l2_format fmt = {.type = s->output.type};
++    int rv;
++    uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt);
++    unsigned int w, h;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
++        fmt.fmt.pix_mp.pixelformat = pixfmt;
++        fmt.fmt.pix_mp.width = avctx->width;
++        fmt.fmt.pix_mp.height = avctx->height;
++    }
++    else {
++        fmt.fmt.pix.pixelformat = pixfmt;
++        fmt.fmt.pix.width = avctx->width;
++        fmt.fmt.pix.height = avctx->height;
++    }
++
++    rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt);
++    if (rv != 0) {
++        rv = AVERROR(errno);
++        av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv));
++        return rv;
++    }
++
++    // fmt now holds the size the driver would use - it must not be smaller
++    w = ff_v4l2_get_format_width(&fmt);
++    h = ff_v4l2_get_format_height(&fmt);
++    if (w < avctx->width || h < avctx->height) {
++        av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %ux%u\n", __func__, avctx->width, avctx->height, w, h);
++        return AVERROR(EINVAL);
++    }
++
++    return 0;
++}
++
+ static int v4l2_probe_driver(V4L2m2mContext *s)
+ {
+     void *log_ctx = s->avctx;
+@@ -116,6 +167,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s)
+         goto done;
+     }
+ 
++    // If being given frames (encode) check that V4L2 can cope with the size
++    if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO &&
++        (ret = check_size(s->avctx, s)) != 0)
++        goto done;
++
+     ret = ff_v4l2_context_get_format(&s->capture, 1);
+     if (ret) {
+         av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n");
+@@ -215,13 +271,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
+         av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
+ 
+     /* 2. unmap the capture buffers (v4l2 and ffmpeg):
+-     *    we must wait for all references to be released before being allowed
+-     *    to queue new buffers.
+      */
+-    av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
+-    if (atomic_load(&s->refcount))
+-        while(sem_wait(&s->refsync) == -1 && errno == EINTR);
+-
+     ff_v4l2_context_release(&s->capture);
+ 
+     /* 3. get the new capture format */
+@@ -240,7 +290,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
+ 
+     /* 5. complete reinit */
+     s->draining = 0;
+-    s->reinit = 0;
+ 
+     return 0;
+ }
+@@ -274,7 +323,6 @@ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *s)
+ 
+     /* start again now that we know the stream dimensions */
+     s->draining = 0;
+-    s->reinit = 0;
+ 
+     ret = ff_v4l2_context_get_format(&s->output, 0);
+     if (ret) {
+@@ -328,10 +376,14 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
+     ff_v4l2_context_release(&s->capture);
+     sem_destroy(&s->refsync);
+ 
+-    close(s->fd);
++    if (s->fd != -1)
++        close(s->fd);
+     av_frame_unref(s->frame);
+     av_frame_free(&s->frame);
+     av_packet_unref(&s->buf_pkt);
++    av_freep(&s->extdata_data);
++
++    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
+ 
+     av_free(s);
+ }
+@@ -344,6 +396,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
+     if (!s)
+         return 0;
+ 
++    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
++
++    if (s->avctx && av_codec_is_decoder(s->avctx->codec))
++        av_packet_unref(&s->buf_pkt);
++
+     if (s->fd >= 0) {
+         ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
+         if (ret)
+@@ -355,8 +412,20 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
+     }
+ 
+     ff_v4l2_context_release(&s->output);
++    av_buffer_unref(&s->device_ref);
++
++    dmabufs_ctl_unref(&s->db_ctl);
++
++    if (s->fd != -1) {
++        close(s->fd);
++        s->fd = -1;
++    }
+ 
+     s->self_ref = NULL;
++    // This is only called on avctx close so after this point we don't have that
++    // Crash sooner if we find we are using it (can still log with avctx = NULL)
++    s->avctx = NULL;
++    priv->context = NULL;
+     av_buffer_unref(&priv->context_ref);
+ 
+     return 0;
+@@ -400,35 +469,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv)
+     return v4l2_configure_contexts(s);
+ }
+ 
+-int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
++int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
+ {
+-    *s = av_mallocz(sizeof(V4L2m2mContext));
+-    if (!*s)
++    V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
++
++    *pps = NULL;
++    if (!s)
+         return AVERROR(ENOMEM);
+ 
+-    priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
++    priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
+                                          &v4l2_m2m_destroy_context, NULL, 0);
+     if (!priv->context_ref) {
+-        av_freep(s);
++        av_free(s);
+         return AVERROR(ENOMEM);
+     }
+ 
+     /* assign the context */
+-    priv->context = *s;
+-    (*s)->priv = priv;
++    priv->context = s;
++    s->priv = priv;
+ 
+     /* populate it */
+-    priv->context->capture.num_buffers = priv->num_capture_buffers;
+-    priv->context->output.num_buffers  = priv->num_output_buffers;
+-    priv->context->self_ref = priv->context_ref;
+-    priv->context->fd = -1;
++    s->capture.num_buffers = priv->num_capture_buffers;
++    s->output.num_buffers  = priv->num_output_buffers;
++    s->self_ref = priv->context_ref;
++    s->fd = -1;
++    xlat_init(&s->xlat);
+ 
+     priv->context->frame = av_frame_alloc();
+     if (!priv->context->frame) {
+         av_buffer_unref(&priv->context_ref);
+-        *s = NULL; /* freed when unreferencing context_ref */
+         return AVERROR(ENOMEM);
+     }
+ 
++    *pps = s;
+     return 0;
+ }
+diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
+index b67b216331..a506e69d67 100644
+--- a/libavcodec/v4l2_m2m.h
++++ b/libavcodec/v4l2_m2m.h
+@@ -30,6 +30,7 @@
+ #include <linux/videodev2.h>
+ 
+ #include "libavcodec/avcodec.h"
++#include "libavutil/pixfmt.h"
+ #include "v4l2_context.h"
+ 
+ #define container_of(ptr, type, member) ({ \
+@@ -38,7 +39,39 @@
+ 
+ #define V4L_M2M_DEFAULT_OPTS \
+     { "num_output_buffers", "Number of buffers in the output context",\
+-        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 6, INT_MAX, FLAGS }
++        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
++
++#define FF_V4L2_M2M_TRACK_SIZE 128  // Keep a power of 2: xlat_pending() wraps its index with & (SIZE - 1)
++typedef struct V4L2m2mTrackEl {     // One slot of xlat_track_t.track_els[]
++    int     discard;   // If we see this buffer it's been flushed, so discard
++    int     pending;   // Cleared by xlat_flush(); xlat_pending() stops scanning at the first slot where this is 0
++    int     pkt_size;  // NOTE(review): this and the fields below presumably mirror the source packet - confirm where they are filled
++    int64_t pts;
++    int64_t dts;       // Compared against the current PTS estimate in xlat_pending(); may be AV_NOPTS_VALUE
++    int64_t reordered_opaque;
++    int64_t pkt_pos;
++    int64_t pkt_duration;
++    int64_t track_pts;
++} V4L2m2mTrackEl;
++
++typedef struct pts_stats_s
++{
++    void * logctx;               // Context handed to av_log() by pts_stats_add()
++    const char * name;  // For debug
++    unsigned int last_count;     // Frames counted from the one that set last_pts (starts at 1, saturates at STATS_LAST_COUNT_MAX)
++    unsigned int last_interval;  // Per-frame PTS step estimate; 0 = unknown or the last interval was rejected
++    int64_t last_pts;            // Last PTS added that was valid and differed from the previous one; AV_NOPTS_VALUE initially
++    int64_t guess;               // NOTE(review): not referenced by the pts_stats_* helpers in v4l2_m2m_dec.c - confirm use
++} pts_stats_t;
++
++typedef struct xlat_track_s {
++    unsigned int track_no;  // xlat_pending() starts its scan at slot track_no % FF_V4L2_M2M_TRACK_SIZE
++    int64_t last_pts;    // Last valid PTS decoded
++    int64_t last_opaque;    // Reset to 0 by xlat_flush()
++    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];  // Tracking slots, indexed modulo the array size
++} xlat_track_t;
++
++struct dmabufs_ctl;
+ 
+ typedef struct V4L2m2mContext {
+     char devname[PATH_MAX];
+@@ -52,10 +85,10 @@ typedef struct V4L2m2mContext {
+     AVCodecContext *avctx;
+     sem_t refsync;
+     atomic_uint refcount;
+-    int reinit;
+ 
+     /* null frame/packet received */
+     int draining;
++    int running;
+     AVPacket buf_pkt;
+ 
+     /* Reference to a frame. Only used during encoding */
+@@ -66,6 +99,36 @@ typedef struct V4L2m2mContext {
+ 
+     /* reference back to V4L2m2mPriv */
+     void *priv;
++
++    AVBufferRef *device_ref;
++
++    /* generate DRM frames */
++    int output_drm;
++
++    /* input frames are drmprime */
++    int input_drm;
++
++    /* Frame tracking */
++    xlat_track_t xlat;
++
++    pts_stats_t pts_stat;
++
++    /* req pkt */
++    int req_pkt;
++    int reorder_size;
++
++    /* Ext data sent */
++    int extdata_sent;
++    /* Ext data sent in packet - overrides ctx */
++    void * extdata_data;
++    size_t extdata_size;
++
++#define FF_V4L2_QUIRK_REINIT_ALWAYS             1
++#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN    2
++    /* Quirks */
++    unsigned int quirks;
++
++    struct dmabufs_ctl * db_ctl;
+ } V4L2m2mContext;
+ 
+ typedef struct V4L2m2mPriv {
+@@ -76,6 +139,8 @@ typedef struct V4L2m2mPriv {
+ 
+     int num_output_buffers;
+     int num_capture_buffers;
++    const char * dmabuf_alloc;
++    enum AVPixelFormat pix_fmt;
+ } V4L2m2mPriv;
+ 
+ /**
+@@ -129,4 +194,26 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
+  */
+ int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
+ 
++
++static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
++{   // Width from whichever union member (pix_mp or pix) matches fmt->type
++    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
++}
++
++static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
++{   // Height from whichever union member (pix_mp or pix) matches fmt->type
++    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
++}
++
++static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
++{   // V4L2 fourcc from whichever union member (pix_mp or pix) matches fmt->type
++    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
++}
++
++static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
++{   // Returns ctx->flag_last; presumably set once the final (EOS) buffer has been seen - TODO confirm
++    return ctx->flag_last;
++}
++
++
+ #endif /* AVCODEC_V4L2_M2M_H */
+diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
+index ab07c0a24a..e7fd8980e5 100644
+--- a/libavcodec/v4l2_m2m_dec.c
++++ b/libavcodec/v4l2_m2m_dec.c
+@@ -21,8 +21,14 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#include "config.h"
++
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
++
++#include "libavutil/avassert.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
+ #include "libavutil/pixfmt.h"
+ #include "libavutil/pixdesc.h"
+ #include "libavutil/opt.h"
+@@ -30,75 +36,279 @@
+ #include "libavcodec/decode.h"
+ #include "libavcodec/internal.h"
+ 
++#include "libavcodec/hwaccels.h"
++#include "libavcodec/internal.h"
++#include "libavcodec/hwconfig.h"
++
+ #include "v4l2_context.h"
+ #include "v4l2_m2m.h"
+ #include "v4l2_fmt.h"
++#include "v4l2_req_dmabufs.h"
+ 
+-static int v4l2_try_start(AVCodecContext *avctx)
++#if CONFIG_H264_DECODER
++#include "h264_parse.h"
++#endif
++#if CONFIG_HEVC_DECODER
++#include "hevc_parse.h"
++#endif
++
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30)
++
++#ifndef FF_API_BUFFER_SIZE_T
++#define FF_API_BUFFER_SIZE_T 1
++#endif
++
++#define DUMP_FAILED_EXTRADATA 0
++
++#if DUMP_FAILED_EXTRADATA
++static inline char hex1(unsigned int x)
+ {
+-    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+-    V4L2Context *const capture = &s->capture;
+-    V4L2Context *const output = &s->output;
+-    struct v4l2_selection selection = { 0 };
+-    int ret;
++    x &= 0xf;
++    return x <= 9 ? '0' + x : 'a' + x - 10;
++}
+ 
+-    /* 1. start the output process */
+-    if (!output->streamon) {
+-        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
+-        if (ret < 0) {
+-            av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
+-            return ret;
+-        }
++static inline char * hex2(char * s, unsigned int x)
++{   // Write the low 8 bits of x at s as two hex digits, high nibble first
++    *s++ = hex1(x >> 4);
++    *s++ = hex1(x);
++    return s;  // Points just past the digits written; no NUL is added
++}
++
++static inline char * hex4(char * s, unsigned int x)
++{   // Write the low 16 bits of x at s as four hex digits, high byte first
++    s = hex2(s, x >> 8);
++    s = hex2(s, x);
++    return s;  // Points just past the digits written; no NUL is added
++}
++
++static inline char * dash2(char * s)
++{   // Write "--" at s; data16() uses this as the placeholder for bytes past the end of the data
++    *s++ = '-';
++    *s++ = '-';
++    return s;  // Points just past the two characters written
++}
++
++static void
++data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len)
++{
++    size_t i;
++    s = hex4(s, offset);
++    m += offset;
++    for (i = 0; i != 8; ++i) {
++        *s++ = ' ';
++        s = len > i + offset ? hex2(s, *m++) : dash2(s);
++    }
++    *s++ = ' ';
++    *s++ = ':';
++    for (; i != 16; ++i) {
++        *s++ = ' ';
++        s = len > i + offset ? hex2(s, *m++) : dash2(s);
+     }
++    *s++ = 0;
++}
+ 
+-    if (capture->streamon)
+-        return 0;
++static void
++log_dump(void * logctx, int lvl, const void * const data, const size_t len)  // Hex dump len bytes of data via av_log()
++{
++    size_t i;
++    for (i = 0; i < len; i += 16) {        // One log line per 16 bytes
++        char buf[80];                      // data16() formats a whole row, NUL terminated, into this
++        data16(buf, i, data, len);
++        av_log(logctx, lvl, "%s\n", buf);
++    }
++}
++#endif
+ 
+-    /* 2. get the capture format */
+-    capture->format.type = capture->type;
+-    ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
+-    if (ret) {
+-        av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
+-        return ret;
++static unsigned int pts_stats_interval(const pts_stats_t * const stats)
++{   // Current per-frame interval estimate; pts_stats_add() sets it to 0 when the last interval was rejected
++    return stats->last_interval;
++}
++
++static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess)
++{   // Estimate the PTS of the most recently added frame
++    if (stats->last_count <= 1)
++        return stats->last_pts;                      // Latest frame carried the PTS itself (may be AV_NOPTS_VALUE)
++    if (stats->last_pts == AV_NOPTS_VALUE ||
++            fail_bad_guess && (stats->last_interval == 0 ||
++                               stats->last_count >= STATS_LAST_COUNT_MAX))
++        return AV_NOPTS_VALUE;                       // Nothing to extrapolate from, or caller asked to reject unreliable guesses
++    return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;  // Extrapolate from last real PTS
++}
++
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
++{
++    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
++        if (stats->last_count < STATS_LAST_COUNT_MAX)
++            ++stats->last_count;
++        return;
+     }
+ 
+-    /* 2.1 update the AVCodecContext */
+-    avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
+-    capture->av_pix_fmt = avctx->pix_fmt;
++    if (stats->last_pts != AV_NOPTS_VALUE) {
++        const int64_t interval = pts - stats->last_pts;
+ 
+-    /* 3. set the crop parameters */
+-    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+-    selection.r.height = avctx->coded_height;
+-    selection.r.width = avctx->coded_width;
+-    ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
+-    if (!ret) {
+-        ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
+-        if (ret) {
+-            av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
+-        } else {
+-            av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
+-            /* update the size of the resulting frame */
+-            capture->height = selection.r.height;
+-            capture->width  = selection.r.width;
++        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
++            stats->last_count >= STATS_LAST_COUNT_MAX) {
++            if (stats->last_interval != 0)
++                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++                       __func__, stats->name, interval, stats->last_count);
++            stats->last_interval = 0;
++        }
++        else {
++            const int64_t frame_time = interval / (int64_t)stats->last_count;
++
++            if (frame_time != stats->last_interval)
++                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++            stats->last_interval = frame_time;
+         }
+     }
+ 
+-    /* 4. init the capture context now that we have the capture format */
+-    if (!capture->buffers) {
+-        ret = ff_v4l2_context_init(capture);
+-        if (ret) {
+-            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
+-            return AVERROR(ENOMEM);
++    stats->last_pts = pts;
++    stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
++{   // Reset *stats; fields not named below are zeroed by the compound literal
++    *stats = (pts_stats_t){
++        .logctx = logctx,             // Used for av_log() in pts_stats_add()
++        .name = name,                 // Printed in the debug messages
++        .last_count = 1,
++        .last_interval = 0,           // No interval known yet
++        .last_pts = AV_NOPTS_VALUE    // No PTS seen yet
++    };
++}
++
++// If abdata == NULL then this just counts space required
++// Unpacks avcC if detected
++static int
++h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata)
++{
++    const uint8_t * const xdend = extradata + extrasize;
++    const uint8_t * p = extradata;
++    uint8_t * d = abdata;
++    unsigned int n;
++    unsigned int len;
++    const unsigned int hdrlen = 4;
++    unsigned int need_pps = 1;
++
++    if (extrasize < 8)
++        return AVERROR(EINVAL);
++
++    if (p[0] == 0 && p[1] == 0) {
++        // Assume a couple of leading zeros are good enough to indicate NAL
++        if (abdata)
++            memcpy(d, p, extrasize);
++        return extrasize;
++    }
++
++    // avcC starts with a 1
++    if (p[0] != 1)
++        return AVERROR(EINVAL);
++
++    p += 5;
++    n = *p++ & 0x1f;
++
++doxps:
++    while (n--) {
++        if (xdend - p < 2)
++            return AVERROR(EINVAL);
++        len = (p[0] << 8) | p[1];
++        p += 2;
++        if (xdend - p < (ptrdiff_t)len)
++            return AVERROR(EINVAL);
++        if (abdata) {
++            d[0] = 0;
++            d[1] = 0;
++            d[2] = 0;
++            d[3] = 1;
++            memcpy(d + 4, p, len);
+         }
++        d += len + hdrlen;
++        p += len;
++    }
++    if (need_pps) {
++        need_pps = 0;
++        if (p >= xdend)
++            return AVERROR(EINVAL);
++        n = *p++;
++        goto doxps;
+     }
+ 
+-    /* 5. start the capture process */
+-    ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
+-    if (ret) {
+-        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
++    return d - abdata;
++}
++
++// Replace *pdst_data with a private copy of the extradata (H264 goes through
++// h264_xd_copy). Returns 0 or -ve AVERROR; *pdst_len stays 0 if nothing stored
++static int
++copy_extradata(AVCodecContext * const avctx,
++               const void * const src_data, const int src_len,
++               void ** const pdst_data, size_t * const pdst_len)
++{
++    int len;
++    *pdst_len = 0;
++    av_freep(pdst_data);
++
++    if (avctx->codec_id == AV_CODEC_ID_H264)
++        len = h264_xd_copy(src_data, src_len, NULL);
++    else
++        len = src_len < 0 ? AVERROR(EINVAL) : src_len;
++
++    // Zero length is OK but we want to stop - -ve is error val
++    if (len <= 0)
++        return len;
++
++    if ((*pdst_data = av_mallocz(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)  // zeroed so the padding bytes are defined
++        return AVERROR(ENOMEM);
++
++    if (avctx->codec_id == AV_CODEC_ID_H264)
++        h264_xd_copy(src_data, src_len, *pdst_data);
++    else
++        memcpy(*pdst_data, src_data, len);
++    *pdst_len = len;
++    return 0;
++}
++
++
++
++static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
++{
++    int ret;
++    struct v4l2_decoder_cmd cmd = {
++        .cmd = V4L2_DEC_CMD_START,
++        .flags = 0,
++    };
++
++    if (s->output.streamon)
++        return 0;
++
++    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
++    if (ret != 0) {
++        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
+         return ret;
+     }
+ 
++    // STREAMON should do implicit START so this just for those that don't.
++    // It is optional so don't worry if it fails
++    if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
++        ret = AVERROR(errno);
++        av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
++    }
++    else {
++        av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
++    }
++    return 0;
++}
++
++static int v4l2_try_start(AVCodecContext *avctx)
++{
++    V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++    int ret;
++
++    /* 1. start the output process */
++    if ((ret = check_output_streamon(avctx, s)) != 0)
++        return ret;
+     return 0;
+ }
+ 
+@@ -133,46 +343,822 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
+     return 0;
+ }
+ 
+-static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++static void
++set_best_effort_pts(AVCodecContext *const avctx,  // Used for logging only
++             pts_stats_t * const ps,              // Running PTS statistics; updated with this frame
++             AVFrame *const frame)                // Frame whose timestamp fields are filled in
++{
++    pts_stats_add(ps, frame->pts);
++
++#if FF_API_PKT_PTS
++FF_DISABLE_DEPRECATION_WARNINGS
++    frame->pkt_pts = frame->pts;
++FF_ENABLE_DEPRECATION_WARNINGS
++#endif
++    frame->best_effort_timestamp = pts_stats_guess(ps, 1);  // 1 = return AV_NOPTS_VALUE rather than an unreliable guess
++    // If we can't guess from just PTS - try DTS
++    if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
++        frame->best_effort_timestamp = frame->pkt_dts;
++
++    // We can't emulate what s/w does in a useful manner and using the
++    // "correct" answer seems to just confuse things.
++    frame->pkt_dts               = frame->pts;
++    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
++           frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
++}
++
++static void
++xlat_flush(xlat_track_t * const x)  // Mark every tracked element as flushed
++{
++    unsigned int i;
++    // Do not reset track_no - this ensures that any frames left in the decoder
++    // that turn up later get discarded.
++
++    x->last_pts = AV_NOPTS_VALUE;
++    x->last_opaque = 0;
++    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
++        x->track_els[i].pending = 0;  // No longer awaiting output
++        x->track_els[i].discard = 1;  // A frame matching this slot is to be dropped
++    }
++}
++
++static void
++xlat_init(xlat_track_t * const x)  // Zero the tracker (track_no included), then apply the flushed state
++{
++    memset(x, 0, sizeof(*x));
++    xlat_flush(x);  // Sets last_pts = AV_NOPTS_VALUE and discard = 1 on every element
++}  // NOTE(review): v4l2_m2m.c has its own static xlat_init() that does not set discard - confirm which one is meant to be used
++
++static int
++xlat_pending(const V4L2m2mContext * const s)  // Estimate how many packets are still waiting inside the decoder
++{
++    const xlat_track_t *const x = &s->xlat;
++    unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;      // Scan starts here and walks backwards through the slots
++    int i;
++    const int64_t now = pts_stats_guess(&s->pts_stat, 0);       // PTS estimate from the output stats; may be AV_NOPTS_VALUE
++    int64_t first_dts = AV_NOPTS_VALUE;                         // DTS of the first slot visited that has one
++    int no_dts_count = 0;                                       // Slots visited before that which had no DTS
++    unsigned int interval = pts_stats_interval(&s->pts_stat);
++
++    for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
++        const V4L2m2mTrackEl * const t = x->track_els + n;
++
++        if (first_dts == AV_NOPTS_VALUE)
++            if (t->dts == AV_NOPTS_VALUE)
++                ++no_dts_count;
++            else                      // Binds to the inner if
++                first_dts = t->dts;
++
++        // Discard only set on never-set or flushed entries
++        // So if we get here we've never successfully decoded a frame so allow
++        // more frames into the buffer before stalling
++        if (t->discard)
++            return i - 16;            // May be negative
++
++        // If we've got this frame out then everything before this point
++        // must have entered the decoder
++        if (!t->pending)
++            break;
++
++        // If we've never seen a pts all we can do is count frames
++        if (now == AV_NOPTS_VALUE)
++            continue;
++
++        if (t->dts != AV_NOPTS_VALUE && now >= t->dts)  // Output has already reached this packet's DTS
++            break;
++    }
++
++    if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) {
++        const int iframes = (first_dts - now) / (int)interval;            // Frame intervals between output position and first_dts
++        const int t = iframes - s->reorder_size + no_dts_count;           // Second estimate, allowing for reorder depth
++
++//        av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n",
++//               x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count);
++
++        if (iframes > 0 && iframes < 64 && t < i) {  // Use the time-based estimate only when plausible and smaller
++            return t;
++        }
++    }
++
++    return i;  // Number of slots counted by the scan
++}
++
++static inline int stream_started(const V4L2m2mContext * const s) {  // Non-zero once the output queue is streaming
++    return s->output.streamon;
++}
++
++#define NQ_OK        0
++#define NQ_Q_FULL    1
++#define NQ_SRC_EMPTY 2
++#define NQ_NONE      3
++#define NQ_DRAINING  4
++#define NQ_DEAD      5
++
++#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
++#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
++
++// do_not_get      If true then no new packet will be got but status will
++//                  be set appropriately
++
++// AVERROR_EOF     Flushing an already flushed stream
++// -ve             Error (all errors except EOF are unexpected)
++// NQ_OK (0)       OK
++// NQ_Q_FULL       Dst full (retry if we think V4L2 Q has space now)
++// NQ_SRC_EMPTY    Src empty (do not retry)
++// NQ_NONE         Enqueue not attempted
++// NQ_DRAINING     At EOS, dQ dest until EOS there too
++// NQ_DEAD         Not running (do not retry, do not attempt capture dQ)
++
++static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
+ {
+-    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+-    V4L2Context *const capture = &s->capture;
+-    V4L2Context *const output = &s->output;
+     int ret;
+ 
+-    if (!s->buf_pkt.size) {
+-        ret = ff_decode_get_packet(avctx, &s->buf_pkt);
+-        if (ret < 0 && ret != AVERROR_EOF)
++    // If we don't already have a coded packet - get a new one
++    // We will already have a coded pkt if the output Q was full last time we
++    // tried to Q it
++    if (!s->buf_pkt.size && !do_not_get) {
++        unsigned int i;
++
++        for (i = 0; i < 256; ++i) {
++            uint8_t * side_data;
++#if FF_API_BUFFER_SIZE_T
++            int side_size;
++#else
++            size_t side_size;
++#endif
++            ret = ff_decode_get_packet(avctx, &s->buf_pkt);
++            if (ret != 0)
++                break;
++
++            // New extradata is the only side-data we understand
++            side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
++            if (side_data) {
++                av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
++                if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0)
++                    av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret));
++                s->extdata_sent = 0;
++            }
++
++            if (s->buf_pkt.size != 0)
++                break;
++
++            if (s->buf_pkt.side_data_elems == 0) {
++                av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
++                ret = AVERROR_EOF;
++                break;
++            }
++
++            // Retry a side-data only pkt
++        }
++        // If i >= 256 something has gone wrong
++        if (i >= 256) {
++            av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
++            return AVERROR(EIO);
++        }
++
++        if (ret == AVERROR(EAGAIN)) {
++            if (!stream_started(s)) {
++                av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
++                return NQ_DEAD;
++            }
++            return NQ_SRC_EMPTY;
++        }
++
++        if (ret == AVERROR_EOF) {
++            // EOF - enter drain mode
++            av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
++                   ret, s->buf_pkt.size, stream_started(s), s->draining);
++            if (!stream_started(s)) {
++                av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
++                s->draining = 1;
++                s->capture.done = 1;
++                return AVERROR_EOF;
++            }
++
++            if (!s->draining) {
++                // Calling enqueue with an empty pkt starts drain
++                av_assert0(s->buf_pkt.size == 0);
++                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++                if (ret) {
++                    av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
++                    return ret;
++                }
++            }
++            return NQ_DRAINING;
++        }
++
++        if (ret < 0) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
+             return ret;
++        }
++    }
++
++    if (s->draining) {
++        if (s->buf_pkt.size) {
++            av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
++            av_packet_unref(&s->buf_pkt);
++        }
++        return NQ_DRAINING;
+     }
+ 
+-    if (s->draining)
+-        goto dequeue;
++    if (!s->buf_pkt.size)
++        return NQ_NONE;
+ 
+-    ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt);
+-    if (ret < 0 && ret != AVERROR(EAGAIN))
+-        goto fail;
++    if ((ret = check_output_streamon(avctx, s)) != 0)
++        return ret;
++
++    if (s->extdata_sent)
++        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
++    else
++        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
+ 
+-    /* if EAGAIN don't unref packet and try to enqueue in the next iteration */
+-    if (ret != AVERROR(EAGAIN))
++    if (ret == AVERROR(EAGAIN)) {
++        // Out of input buffers - keep packet
++        ret = NQ_Q_FULL;
++    }
++    else {
++        // In all other cases we are done with this packet
+         av_packet_unref(&s->buf_pkt);
++        s->extdata_sent = 1;
+ 
+-    if (!s->draining) {
+-        ret = v4l2_try_start(avctx);
+         if (ret) {
+-            /* cant recover */
+-            if (ret != AVERROR(ENOMEM))
+-                ret = 0;
+-            goto fail;
++            av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
++            return ret;
+         }
+     }
+ 
+-dequeue:
+-    return ff_v4l2_context_dequeue_frame(capture, frame, -1);
+-fail:
+-    av_packet_unref(&s->buf_pkt);
++    // Start if we haven't
++    {
++        const int ret2 = v4l2_try_start(avctx);
++        if (ret2) {
++            av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
++            ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
++        }
++    }
++
++    return ret;
++}
++
++// Block until ctx has at least one buffer queued (q_count != 0) or it stops
++// streaming. Returns 0, or a -ve AVERROR if the condition wait itself fails
++static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
++{
++    int rv = 0;
++    ff_mutex_lock(&ctx->lock);
++    while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
++        // pthread_cond_wait returns the error number; it does not set errno
++        if ((rv = pthread_cond_wait(&ctx->cond, &ctx->lock)) != 0) {
++            rv = AVERROR(rv);
++            av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
++            break;
++        }
++    }
++    ff_mutex_unlock(&ctx->lock);
++    return rv;
++}
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++    V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
++    int src_rv = -1;
++    int dst_rv = 1;  // Non-zero (done), non-negative (error) number
++    unsigned int i = 0;
++    // Alternate src enqueue attempts and capture dequeues until we have a frame, want input or fail
++    do {
++        const int pending = xlat_pending(s);
++        const int prefer_dq = (pending > 4);
++        const int last_src_rv = src_rv;
++
++        av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt);
++
++        // Enqueue another pkt for decode if
++        // (a) We don't have a lot of stuff in the buffer already OR
++        // (b) ... we (think we) do but we've failed to get a frame already OR
++        // (c) We've dequeued a lot of frames without asking for input
++        src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
++
++        // If we got a frame last time or we've already tried to get a frame and
++        // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
++        // indicating that we want more input.
++        // This should mean that once decode starts we enter a stable state where
++        // we alternately ask for input and produce output
++        if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
++            break;
++
++        if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
++            av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
++            break;
++        }
++
++        // Try to get a new frame if
++        // (a) we haven't already got one AND
++        // (b) enqueue returned a status indicating that decode should be attempted
++        if (dst_rv != 0 && TRY_DQ(src_rv)) {
++            // Pick a timeout depending on state
++            // The pending count isn't completely reliable so it is good enough
++            // hint that we want a frame but not good enough to require it in
++            // all cases; however if it has got > 31 that exceeds its margin of
++            // error so require a frame to prevent ridiculous levels of latency
++            const int t =
++                src_rv == NQ_Q_FULL ? -1 :
++                src_rv == NQ_DRAINING ? 300 :
++                prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0;
++
++            // Dequeue frame will unref any previous contents of frame
++            // if it returns success so we don't need an explicit unref
++            // when discarding
++            // This returns AVERROR(EAGAIN) on timeout or if
++            // there is room in the input Q and timeout == -1
++            dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++
++            // Failure due to no buffer in Q?
++            if (dst_rv == AVERROR(ENOSPC)) {
++                // Wait & retry
++                if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
++                    dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
++                }
++            }
++
++            if (dst_rv == 0) {
++                set_best_effort_pts(avctx, &s->pts_stat, frame);
++                if (!s->running) {
++                    s->running = 1;
++                    av_log(avctx, AV_LOG_VERBOSE, "Decode running\n");
++                }
++            }
++
++            if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
++                av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF\n");
++                dst_rv = AVERROR_EOF;
++                s->capture.done = 1;
++            }
++            else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
++                av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
++                       s->draining, s->capture.done);
++            else if (dst_rv && dst_rv != AVERROR(EAGAIN))
++                av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
++                       s->draining, s->capture.done, dst_rv);
++        }
++
++        ++i;
++        if (i >= 256) {
++            av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %u\n", i);
++            src_rv = AVERROR(EIO);
++        }
++
++        // Continue trying to enqueue packets if either
++        // (a) we succeeded last time OR
++        // (b) we didn't ret a frame and we can retry the input
++    } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
++
++    // Ensure that the frame contains nothing if we aren't returning a frame
++    // (might happen when discarding)
++    if (dst_rv)
++        av_frame_unref(frame);
++
++    // If we got a frame this time ask for a pkt next time
++    s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
++
++#if 0
++    if (dst_rv == 0)
++    {
++        static int z = 0;
++        if (++z > 50) {
++            av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
++            ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
++            return -1;
++        }
++    }
++#endif
++
++    return dst_rv == 0 ? 0 :    // got a frame
++        src_rv < 0 ? src_rv :   // an input-side error takes precedence
++        dst_rv < 0 ? dst_rv :   // otherwise report the dequeue error
++            AVERROR(EAGAIN);    // nothing yet: caller should supply more input
++}
++
++#if 0
++#include <time.h>
++static int64_t us_time(void)
++{
++    struct timespec ts;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
++}
++
++static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
++{
++    int ret;
++    const int64_t now = us_time();
++    int64_t done;
++    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++    ret = v4l2_receive_frame2(avctx, frame);
++    done = us_time();
++    av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
+     return ret;
+ }
++#endif
++
++static uint32_t  // V4L2 profile value for (codec_id, avprofile), or ~0 when there is no mapping
++avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile)
++{
++    switch (codec_id) {
++        case AV_CODEC_ID_H264:
++            switch (avprofile) {
++                case FF_PROFILE_H264_BASELINE:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE;
++                case FF_PROFILE_H264_CONSTRAINED_BASELINE:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE;
++                case FF_PROFILE_H264_MAIN:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN;
++                case FF_PROFILE_H264_EXTENDED:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED;
++                case FF_PROFILE_H264_HIGH:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH;
++                case FF_PROFILE_H264_HIGH_10:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10;
++                case FF_PROFILE_H264_HIGH_10_INTRA:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA;
++                case FF_PROFILE_H264_MULTIVIEW_HIGH:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH;
++                case FF_PROFILE_H264_HIGH_422:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422;
++                case FF_PROFILE_H264_HIGH_422_INTRA:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA;
++                case FF_PROFILE_H264_STEREO_HIGH:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH;
++                case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE;
++                case FF_PROFILE_H264_HIGH_444_INTRA:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA;
++                case FF_PROFILE_H264_CAVLC_444:
++                    return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA;
++                case FF_PROFILE_H264_HIGH_444:
++                default:
++                    break;
++//                    V4L2 profiles that still have no mapping here:
++//                    V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE		= 12,
++//                    V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH		= 13,
++//                    V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA	= 14,
++//                    V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH		= 17,
++            }
++            break;
++        case AV_CODEC_ID_MPEG2VIDEO:
++        case AV_CODEC_ID_MPEG4:
++        case AV_CODEC_ID_VC1:
++        case AV_CODEC_ID_VP8:
++        case AV_CODEC_ID_VP9:
++        case AV_CODEC_ID_AV1:
++            // Most profiles are a simple number that matches the V4L2 enum
++            return avprofile;
++        default:
++            break;
++    }
++    return ~(uint32_t)0;
++}
++
++// This check mirrors Chrome's profile check by testing to see if the profile
++// exists as a possible value for the V4L2 profile control
++static int  // 0 if accepted or not checkable, AVERROR(ENOENT) if the driver does not list the profile
++check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
++{
++    struct v4l2_queryctrl query_ctrl;
++    struct v4l2_querymenu query_menu;
++    uint32_t profile_id;
++
++    // An unset profile is almost certainly zero or -99 - do not reject
++    if (avctx->profile <= 0) {
++        av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile);
++        return 0;
++    }
++
++    memset(&query_ctrl, 0, sizeof(query_ctrl));
++    switch (avctx->codec_id) {
++        case AV_CODEC_ID_MPEG2VIDEO:
++            profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE;
++            break;
++        case AV_CODEC_ID_MPEG4:
++            profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE;
++            break;
++        case AV_CODEC_ID_H264:
++            profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE;
++            break;
++        case AV_CODEC_ID_VP8:
++            profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE;
++            break;
++        case AV_CODEC_ID_VP9:
++            profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE;
++            break;
++#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE
++        case AV_CODEC_ID_AV1:
++            profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE;
++            break;
++#endif
++        default:
++            av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id);
++            return 0;
++    }
++
++    query_ctrl = (struct v4l2_queryctrl){.id = profile_id};
++    if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) {
++        av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id);
++    }
++    else {
++        av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id);
++
++        query_menu = (struct v4l2_querymenu){
++            .id = query_ctrl.id,
++            .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile),
++        };
++
++        if (query_menu.index > query_ctrl.maximum ||
++            query_menu.index < query_ctrl.minimum ||
++            ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) {
++            return AVERROR(ENOENT);  // out of the control's range or not a valid menu entry
++        }
++    }
++
++    return 0;
++}
++
++static int  // 0 if WxH is enumerated for fcc (or the check is skipped), else a negative AVERROR
++check_size(AVCodecContext * const avctx, V4L2m2mContext * const s, const uint32_t fcc)
++{
++    unsigned int i;
++    const uint32_t w = avctx->coded_width;
++    const uint32_t h = avctx->coded_height;
++
++    if (w == 0 || h == 0 || fcc == 0) {
++        av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
++        return 0;
++    }
++    if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
++        av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
++        return 0;
++    }
++
++    for (i = 0;; ++i) {
++        struct v4l2_frmsizeenum fs = {
++            .index = i,
++            .pixel_format = fcc,
++        };
++
++        while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
++            const int err = AVERROR(errno);
++            if (err == AVERROR(EINTR))
++                continue;
++            if (i == 0 && err == AVERROR(ENOTTY)) {
++                av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
++                return 0;
++            }
++            if (err != AVERROR(EINVAL)) {
++                av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s\n", av_err2str(err));
++                return err;
++            }
++            av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
++                   w, h, av_fourcc2str(fcc), i);
++            return err;
++        }
++
++        switch (fs.type) {
++            case V4L2_FRMSIZE_TYPE_DISCRETE:
++                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
++                       fs.discrete.width,fs.discrete.height);
++                if (w == fs.discrete.width && h == fs.discrete.height)
++                    return 0;
++                break;
++            case V4L2_FRMSIZE_TYPE_STEPWISE:  // a zero step (bad driver data) is treated as "no step constraint"
++                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++                       fs.stepwise.min_width, fs.stepwise.min_height,
++                       fs.stepwise.max_width, fs.stepwise.max_height,
++                       fs.stepwise.step_width,fs.stepwise.step_height);
++                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
++                    (fs.stepwise.step_width == 0 || (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0) &&
++                    (fs.stepwise.step_height == 0 || (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0))
++                    return 0;
++                break;
++            case V4L2_FRMSIZE_TYPE_CONTINUOUS:
++                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
++                       fs.stepwise.min_width, fs.stepwise.min_height,
++                       fs.stepwise.max_width, fs.stepwise.max_height,
++                       fs.stepwise.step_width,fs.stepwise.step_height);
++                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
++                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
++                    return 0;
++                break;
++            default:
++                av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d\n", fs.type);
++                return AVERROR(EINVAL);
++        }
++    }
++}
++
++static int  // 0 on success, negative AVERROR if VIDIOC_QUERYCAP fails
++get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++    struct v4l2_capability cap;
++
++    memset(&cap, 0, sizeof(cap));
++    while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
++        int err = errno;
++        if (err == EINTR)
++            continue;  // interrupted - retry the ioctl
++        av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
++        return AVERROR(err);
++    }
++
++    // Could be made table driven if we have a few more but right now there
++    // seems no point
++
++    // Meson (amlogic) always gives a resolution changed event after output
++    // streamon and userspace must (re)allocate capture buffers and streamon
++    // capture to clear the event even if the capture buffers were the right
++    // size in the first place.
++    if (strcmp(cap.driver, "meson-vdec") == 0)
++        s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
++
++    av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
++    return 0;
++}
++
++// Estimate a coded-frame buffer size from coded_width x coded_height; the heuristic is for H264 but is used for everything
++static uint32_t max_coded_size(const AVCodecContext * const avctx)
++{
++    uint32_t wxh = avctx->coded_width * avctx->coded_height;
++    uint32_t size;
++
++    size = wxh * 3 / 2;  // the "frame_size" referred to below (WxH * 1.5)
++    // H.264 Annex A table A-1 gives minCR which is either 2 or 4
++    // unfortunately that doesn't yield an actually useful limit
++    // and it should be noted that frame 0 is special cased to allow
++    // a bigger number which really isn't helpful for us. So just pick
++    // frame_size / 2
++    size /= 2;
++    // Add 64k to allow for any overheads and/or encoder hopefulness
++    // with small WxH
++    return size + (1 << 16);
++}
++
++static void  // Sets avctx->profile, avctx->level and s->reorder_size from the first SPS found in H264/HEVC extradata
++parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++    s->reorder_size = 0;  // value kept when there is no extradata or no SPS is found
++
++    if (!avctx->extradata || !avctx->extradata_size)
++        return;
++
++    switch (avctx->codec_id) {
++#if CONFIG_H264_DECODER
++        case AV_CODEC_ID_H264:
++        {
++            H264ParamSets ps;
++            int is_avc = 0;
++            int nal_length_size = 0;
++            int ret;
++
++            memset(&ps, 0, sizeof(ps));
++
++            ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
++                                           &ps, &is_avc, &nal_length_size,
++                                           avctx->err_recognition, avctx);
++            if (ret > 0) {
++                const SPS * sps = NULL;
++                unsigned int i;
++                for (i = 0; i != MAX_SPS_COUNT; ++i) {  // use the first SPS slot that is populated
++                    if (ps.sps_list[i]) {
++                        sps = (const SPS *)ps.sps_list[i]->data;
++                        break;
++                    }
++                }
++                if (sps) {
++                    avctx->profile = ff_h264_get_profile(sps);
++                    avctx->level = sps->level_idc;
++                    s->reorder_size = sps->num_reorder_frames;
++                }
++            }
++            ff_h264_ps_uninit(&ps);  // runs whether or not the parse succeeded
++            break;
++        }
++#endif
++#if CONFIG_HEVC_DECODER
++        case AV_CODEC_ID_HEVC:
++        {
++            HEVCParamSets ps;
++            HEVCSEI sei;
++            int is_nalff = 0;
++            int nal_length_size = 0;
++            int ret;
++
++            memset(&ps, 0, sizeof(ps));
++            memset(&sei, 0, sizeof(sei));
++
++            ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size,
++                                           &ps, &sei, &is_nalff, &nal_length_size,
++                                           avctx->err_recognition, 0, avctx);
++            if (ret > 0) {
++                const HEVCSPS * sps = NULL;
++                unsigned int i;
++                for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) {  // use the first SPS slot that is populated
++                    if (ps.sps_list[i]) {
++                        sps = (const HEVCSPS *)ps.sps_list[i]->data;
++                        break;
++                    }
++                }
++                if (sps) {
++                    avctx->profile = sps->ptl.general_ptl.profile_idc;
++                    avctx->level   = sps->ptl.general_ptl.level_idc;
++                    s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering;  // NOTE(review): DPB size of the last sub-layer, not a reorder count - confirm intended
++                }
++            }
++            ff_hevc_ps_uninit(&ps);
++            ff_hevc_reset_sei(&sei);
++            break;
++        }
++#endif
++        default:
++            break;
++    }
++}
++
++static int  // 0 on success (capture format selected and set), else a negative AVERROR
++choose_capture_format(AVCodecContext * const avctx, V4L2m2mContext * const s)
++{
++    const V4L2m2mPriv * const priv = avctx->priv_data;
++    unsigned int fmts_n;
++    uint32_t *fmts = ff_v4l2_context_enum_drm_formats(&s->capture, &fmts_n);
++    enum AVPixelFormat *fmts2 = NULL;
++    enum AVPixelFormat t;
++    enum AVPixelFormat gf_pix_fmt;
++    unsigned int i;
++    unsigned int n = 0;
++    unsigned int pref_n = 1;
++    int rv = AVERROR(ENOENT);
++
++    if (!fmts)
++        return AVERROR(ENOENT);
++
++    if ((fmts2 = av_malloc(sizeof(*fmts2) * (fmts_n + 2))) == NULL) {  // +2: leading DRM_PRIME entry and AV_PIX_FMT_NONE terminator
++        rv = AVERROR(ENOMEM);
++        goto error;
++    }
++
++    // Filter for formats that are supported by ffmpeg and
++    // can accommodate the stream size
++    fmts2[n++] = AV_PIX_FMT_DRM_PRIME;
++    for (i = 0; i != fmts_n; ++i) {
++        const enum AVPixelFormat f = ff_v4l2_format_v4l2_to_avfmt(fmts[i], AV_CODEC_ID_RAWVIDEO);
++        if (f == AV_PIX_FMT_NONE)
++            continue;
++
++        if (check_size(avctx, s, fmts[i]) != 0)
++            continue;
++
++        if (f == priv->pix_fmt)
++            pref_n = n;  // remember where the user-requested pixel_format sits in fmts2
++        fmts2[n++] = f;
++    }
++    fmts2[n] = AV_PIX_FMT_NONE;
++
++    if (n < 2) {  // only the DRM_PRIME placeholder was added
++        av_log(avctx, AV_LOG_DEBUG, "%s: No usable formats found\n", __func__);
++        goto error;
++    }
++
++    // Put preferred s/w format at the end - ff_get_format will put it in sw_pix_fmt
++    t = fmts2[n - 1];
++    fmts2[n - 1] = fmts2[pref_n];
++    fmts2[pref_n] = t;
++
++    gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);  // NOTE(review): the filtered/reordered fmts2 list built above is never passed here - confirm whether fmts2 was intended
++    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
++           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
++           avctx->coded_width, avctx->coded_height,
++           gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
++
++    if (gf_pix_fmt == AV_PIX_FMT_NONE)
++        goto error;
++
++    if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
++        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
++        s->capture.av_pix_fmt = avctx->sw_pix_fmt;
++        s->output_drm = 1;
++    }
++    else {
++        avctx->pix_fmt = gf_pix_fmt;
++        s->capture.av_pix_fmt = gf_pix_fmt;
++        s->output_drm = 0;
++    }
++
++    // Get format converts capture.av_pix_fmt back into a V4L2 format in the context
++    if ((rv = ff_v4l2_context_get_format(&s->capture, 0)) != 0)
++        goto error;
++    rv = ff_v4l2_context_set_format(&s->capture);
++
++error:  // common exit: also reached on success, with rv holding the result
++    av_free(fmts2);
++    av_free(fmts);
++    return rv;
++}
+ 
+ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+ {
+@@ -181,10 +1167,27 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+     V4L2m2mPriv *priv = avctx->priv_data;
+     int ret;
+ 
++    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++    if (avctx->codec_id == AV_CODEC_ID_H264) {
++        if (avctx->ticks_per_frame == 1) {
++            if(avctx->time_base.den < INT_MAX/2) {
++                avctx->time_base.den *= 2;
++            } else
++                avctx->time_base.num /= 2;
++        }
++        avctx->ticks_per_frame = 2;
++    }
++
+     ret = ff_v4l2_m2m_create_context(priv, &s);
+     if (ret < 0)
+         return ret;
+ 
++    parse_extradata(avctx, s);
++
++    xlat_init(&s->xlat);
++    pts_stats_init(&s->pts_stat, avctx, "decoder");
++
+     capture = &s->capture;
+     output = &s->output;
+ 
+@@ -192,14 +1195,45 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+      * by the v4l2 driver; this event will trigger a full pipeline reconfig and
+      * the proper values will be retrieved from the kernel driver.
+      */
+-    output->height = capture->height = avctx->coded_height;
+-    output->width = capture->width = avctx->coded_width;
++//    output->height = capture->height = avctx->coded_height;
++//    output->width = capture->width = avctx->coded_width;
++    output->height = capture->height = 0;
++    output->width = capture->width = 0;
+ 
+     output->av_codec_id = avctx->codec_id;
+     output->av_pix_fmt  = AV_PIX_FMT_NONE;
++    output->min_buf_size = max_coded_size(avctx);
+ 
+     capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+     capture->av_pix_fmt = avctx->pix_fmt;
++    capture->min_buf_size = 0;
++
++    capture->av_pix_fmt = AV_PIX_FMT_NONE;
++    s->output_drm = 0;
++
++    s->db_ctl = NULL;
++    if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) {
++        if (strcmp(priv->dmabuf_alloc, "cma") == 0)
++            s->db_ctl = dmabufs_ctl_new();
++        else {
++            av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc);
++            return AVERROR(EINVAL);
++        }
++        if (!s->db_ctl) {
++            av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc);
++            return AVERROR(ENOMEM);
++        }
++    }
++
++    s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
++    if (!s->device_ref) {
++        ret = AVERROR(ENOMEM);
++        return ret;
++    }
++
++    ret = av_hwdevice_ctx_init(s->device_ref);
++    if (ret < 0)
++        return ret;
+ 
+     s->avctx = avctx;
+     ret = ff_v4l2_m2m_codec_init(priv);
+@@ -208,12 +1242,90 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
+         return ret;
+     }
+ 
+-    return v4l2_prepare_decoder(s);
++    if (avctx->extradata &&
++        (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret));
++#if DUMP_FAILED_EXTRADATA
++        log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size);
++#endif
++        return ret;
++    }
++
++    if ((ret = get_quirks(avctx, s)) != 0)
++        return ret;
++
++    if ((ret = check_profile(avctx, s)) != 0) {
++        av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile);
++        return ret;
++    }
++
++    // Size check done as part of format filtering
++    if ((ret = choose_capture_format(avctx, s)) != 0)
++        return ret;
++
++    if ((ret = v4l2_prepare_decoder(s)) < 0)
++        return ret;
++
++    return 0;
+ }
+ 
+ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
+ {
+-    return ff_v4l2_m2m_codec_end(avctx->priv_data);
++    int rv;
++    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++    rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
++    av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
++    return rv;
++}
++
++static void v4l2_decode_flush(AVCodecContext *avctx)
++{
++    // An alternative and more drastic form of flush is to simply do this:
++    //    v4l2_decode_close(avctx);
++    //    v4l2_decode_init(avctx);
++    // The downside is that this keeps a decoder open until all the frames
++    // associated with it have been returned.  This is a bit wasteful on
++    // possibly limited h/w resources and fails on a Pi for this reason unless
++    // more GPU mem is allocated than is the default.
++
++    V4L2m2mPriv * const priv = avctx->priv_data;
++    V4L2m2mContext * const s = priv->context;
++    V4L2Context * const output = &s->output;
++    V4L2Context * const capture = &s->capture;
++
++    av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
++
++    // Reflushing everything is benign, quick and avoids having to worry about
++    // states like EOS processing so don't try to optimize out (having got it
++    // wrong once)
++
++    ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
++
++    // Clear any buffered input packet
++    av_packet_unref(&s->buf_pkt);
++
++    // Clear a pending EOS
++    if (ff_v4l2_ctx_eos(capture)) {
++        // Arguably we could delay this but this is easy and doesn't require
++        // thought or extra vars
++        ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
++        ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
++    }
++
++    // V4L2 makes no guarantees about whether decoded frames are flushed or not
++    // so mark all frames we are tracking to be discarded if they appear
++    xlat_flush(&s->xlat);
++
++    // resend extradata
++    s->extdata_sent = 0;
++    // clear status vars
++    s->running = 0;
++    s->draining = 0;
++    output->done = 0;
++    capture->done = 0;
++
++    // Stream on will occur when we actually submit a new frame
++    av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
+ }
+ 
+ #define OFFSET(x) offsetof(V4L2m2mPriv, x)
+@@ -222,10 +1334,17 @@ static av_cold int v4l2_decode_close(AVCodecContext *avctx)
+ static const AVOption options[] = {
+     V4L_M2M_DEFAULT_OPTS,
+     { "num_capture_buffers", "Number of buffers in the capture context",
+-        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 20, INT_MAX, FLAGS },
++        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
++    { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
++    { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
+     { NULL},
+ };
+ 
++static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
++    HW_CONFIG_INTERNAL(DRM_PRIME),
++    NULL
++};
++
+ #define M2MDEC_CLASS(NAME) \
+     static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
+         .class_name = #NAME "_v4l2m2m_decoder", \
+@@ -246,9 +1365,15 @@ static const AVOption options[] = {
+         .init           = v4l2_decode_init, \
+         .receive_frame  = v4l2_receive_frame, \
+         .close          = v4l2_decode_close, \
++        .flush          = v4l2_decode_flush, \
+         .bsfs           = bsf_name, \
+         .capabilities   = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
+         .caps_internal  = FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
++        .pix_fmts       = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
++                                                         AV_PIX_FMT_NV12, \
++                                                         AV_PIX_FMT_YUV420P, \
++                                                         AV_PIX_FMT_NONE}, \
++        .hw_configs     = v4l2_m2m_hw_configs, \
+         .wrapper_name   = "v4l2m2m", \
+     }
+ 
+diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
+index f644b50133..6472b56030 100644
+--- a/libavcodec/v4l2_m2m_enc.c
++++ b/libavcodec/v4l2_m2m_enc.c
+@@ -24,6 +24,8 @@
+ #include <linux/videodev2.h>
+ #include <sys/ioctl.h>
+ #include <search.h>
++#include <drm_fourcc.h>
++
+ #include "encode.h"
+ #include "libavcodec/avcodec.h"
+ #include "libavcodec/internal.h"
+@@ -38,6 +40,34 @@
+ #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
+ #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
+ 
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h and hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
+ static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
+ {
+     struct v4l2_streamparm parm = { 0 };
+@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p)
+ static int v4l2_check_b_frame_support(V4L2m2mContext *s)
+ {
+     if (s->avctx->max_b_frames)
+-        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
++        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
+ 
+-    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
++    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
+     v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
+     if (s->avctx->max_b_frames == 0)
+         return 0;
+ 
+     avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
+-
+     return AVERROR_PATCHWELCOME;
+ }
+ 
+@@ -271,17 +300,208 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s)
+     return 0;
+ }
+ 
++static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
++{
++    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++    // Fills width/height/pixelformat/bytesperline in *format from the frame's DRM descriptor; 0 or AVERROR(EINVAL)
++    const uint32_t drm_fmt = src->layers[0].format;
++    // Treat INVALID as LINEAR
++    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
++        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
++    uint32_t pix_fmt = 0;
++    uint32_t w = 0;
++    uint32_t h = 0;
++    uint32_t bpl = src->layers[0].planes[0].pitch;
++
++    // We really don't expect multiple layers
++    // All formats that we currently cope with are single object
++    // A zero pitch is rejected too: the linear cases below divide by bpl
++    if (src->nb_layers != 1 || src->nb_objects != 1 || bpl == 0)
++        return AVERROR(EINVAL);
++
++    switch (drm_fmt) {
++        case DRM_FORMAT_YUV420:
++            if (mod == DRM_FORMAT_MOD_LINEAR) {
++                if (src->layers[0].nb_planes != 3)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_YUV420;
++                h = src->layers[0].planes[1].offset / bpl;
++                w = bpl;
++            }
++            break;
++
++        case DRM_FORMAT_NV12:
++            if (mod == DRM_FORMAT_MOD_LINEAR) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_NV12;
++                h = src->layers[0].planes[1].offset / bpl;
++                w = bpl;
++            }
++            else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_NV12_COL128;
++                w = bpl;
++                h = src->layers[0].planes[1].offset / 128;
++                bpl = fourcc_mod_broadcom_param(mod);
++            }
++            break;
++
++        case DRM_FORMAT_P030:
++            if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt =  V4L2_PIX_FMT_NV12_10_COL128;
++                w = bpl / 2;  // Matching lie to how we construct this
++                h = src->layers[0].planes[1].offset / 128;
++                bpl = fourcc_mod_broadcom_param(mod);
++            }
++            break;
++
++        default:
++            break;
++    }
++
++    if (!pix_fmt)  // unsupported format/modifier combination or unexpected plane count
++        return AVERROR(EINVAL);
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->plane_fmt[0].bytesperline = bpl;
++        pix->num_planes = 1;
++    }
++    else {
++        struct v4l2_pix_format *const pix = &format->fmt.pix;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->bytesperline = bpl;
++    }
++
++    return 0;
++}
++
++// Compare two V4L2 formats on type, pixelformat and per-plane bytesperline
++// only (sizes are not compared).  Returns 1 if those all agree, else 0.
++static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
++{
++    if (a->type != b->type)
++        return 0;
++
++    if (!V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
++        const struct v4l2_pix_format *const fa = &a->fmt.pix;
++        const struct v4l2_pix_format *const fb = &b->fmt.pix;
++        return fa->pixelformat == fb->pixelformat &&
++            fa->bytesperline == fb->bytesperline;
++    }
++    else {
++        const struct v4l2_pix_format_mplane *const ma = &a->fmt.pix_mp;
++        const struct v4l2_pix_format_mplane *const mb = &b->fmt.pix_mp;
++        unsigned int plane;
++
++        if (ma->pixelformat != mb->pixelformat || ma->num_planes != mb->num_planes)
++            return 0;
++        for (plane = 0; plane < ma->num_planes; ++plane) {
++            if (ma->plane_fmt[plane].bytesperline != mb->plane_fmt[plane].bytesperline)
++                return 0;
++        }
++    }
++    return 1;
++}
++
++static inline int q_full(const V4L2Context *const output)
++{   // Non-zero when ff_v4l2_context_q_count() equals the context's num_buffers
++    return output->num_buffers == ff_v4l2_context_q_count(output);
++}
++
+ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
+ {
+     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
+     V4L2Context *const output = &s->output;
++    int rv;
++    const int needs_slot = q_full(output);
++    // needs_slot: queue is full, so the reclaim below is given timeout 500 instead of 0
++    av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot);
++
++    // Signal EOF if needed (doesn't need q slot)
++    if (!frame) {
++        av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__);
++        return ff_v4l2_context_enqueue_frame(output, frame);
++    }
++
++    if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) {
++        // We should be able to return AVERROR(EAGAIN) to indicate buffer
++        // exhaustion, but ffmpeg currently treats that as fatal.
++        av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv));
++        return rv;
++    }
++
++    if (s->input_drm && !output->streamon) {
++        struct v4l2_format req_format = {.type = output->format.type};
++
++        // Set format when we first get a buffer
++        if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
++            return rv;
++        }
++
++        ff_v4l2_context_release(output);
++
++        output->format = req_format;
++
++        if ((rv = ff_v4l2_context_set_format(output)) != 0) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
++            return rv;
++        }
++
++        if (!fmt_eq(&req_format, &output->format)) {
++            av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
++            return AVERROR(EINVAL);
++        }
++
++        output->selection.top = frame->crop_top;
++        output->selection.left = frame->crop_left;
++        output->selection.width = av_frame_cropped_width(frame);
++        output->selection.height = av_frame_cropped_height(frame);
++
++        if ((rv = ff_v4l2_context_init(output)) != 0) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
++            return rv;
++        }
++        // Apply the frame's crop rectangle; a failure is only warned about, not fatal
++        {
++            struct v4l2_selection selection = {
++                .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
++                .target = V4L2_SEL_TGT_CROP,
++                .r = output->selection
++            };
++            if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
++                av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
++                       selection.r.width, selection.r.height, selection.r.left, selection.r.top,
++                       av_err2str(AVERROR(errno)));
++            } else  // Only report OK when the ioctl actually succeeded
++                av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
++                       selection.r.width, selection.r.height, selection.r.left, selection.r.top);
++        }
++    }
+ 
+ #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
+-    if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
++    if (frame->pict_type == AV_PICTURE_TYPE_I)
+         v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
+ #endif
+ 
+-    return ff_v4l2_context_enqueue_frame(output, frame);
++    rv = ff_v4l2_context_enqueue_frame(output, frame);
++    if (rv) {
++        av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv));
++    }
++
++    return rv;
+ }
+ 
+ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
+@@ -292,6 +512,11 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
+     AVFrame *frame = s->frame;
+     int ret;
+ 
++    av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__,
++           ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture));
++
++    ff_v4l2_dq_all(output, 0);
++
+     if (s->draining)
+         goto dequeue;
+ 
+@@ -328,7 +553,115 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
+     }
+ 
+ dequeue:
+-    return ff_v4l2_context_dequeue_packet(capture, avpkt);
++    // Dequeue a frame
++    for (;;) {
++        int t = q_full(output) ? -1 : s->draining ? 300 : 0;
++        int rv2;
++
++        // If output is full wait for either a packet or output to become not full
++        ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t);
++
++        // If output was full retry packet dequeue
++        t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300;
++        rv2 = ff_v4l2_dq_all(output, t);
++        if (t == 0 || rv2 != 0)
++            break;
++    }
++    if (ret)
++        return (s->draining && ret == AVERROR(EAGAIN)) ? AVERROR_EOF : ret;
++
++    if (capture->first_buf == 1) {
++        uint8_t * data;
++        const int len = avpkt->size;
++
++        // 1st buffer after streamon should be SPS/PPS
++        capture->first_buf = 2;
++
++        // Clear both possible stores so there is no chance of confusion
++        av_freep(&s->extdata_data);
++        s->extdata_size = 0;
++        av_freep(&avctx->extradata);
++        avctx->extradata_size = 0;
++
++        if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
++            goto fail_no_mem;
++
++        memcpy(data, avpkt->data, len);
++        av_packet_unref(avpkt);
++
++        // We need to copy the header, but keep local if not global
++        if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
++            avctx->extradata = data;
++            avctx->extradata_size = len;
++        }
++        else {
++            s->extdata_data = data;
++            s->extdata_size = len;
++        }
++
++        ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0);
++        ff_v4l2_dq_all(output, 0);
++        if (ret)
++            return ret;
++    }
++
++    // First frame must be key so mark as such even if encoder forgot
++    if (capture->first_buf == 2) {
++        avpkt->flags |= AV_PKT_FLAG_KEY;
++
++        // Add any extradata to the 1st packet we emit as we cannot create it at init
++        if (avctx->extradata_size > 0 && avctx->extradata) {
++            void * const side = av_packet_new_side_data(avpkt,
++                                           AV_PKT_DATA_NEW_EXTRADATA,
++                                           avctx->extradata_size);
++            if (!side)
++                goto fail_no_mem;
++
++            memcpy(side, avctx->extradata, avctx->extradata_size);
++        }
++    }
++
++    // Add SPS/PPS to the start of every key frame if non-global headers
++    if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
++        const size_t newlen = s->extdata_size + avpkt->size;
++        AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
++
++        if (buf == NULL)
++            goto fail_no_mem;
++
++        memcpy(buf->data, s->extdata_data, s->extdata_size);
++        memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
++
++        av_buffer_unref(&avpkt->buf);
++        avpkt->buf = buf;
++        avpkt->data = buf->data;
++        avpkt->size = newlen;
++    }
++    else if (ff_v4l2_context_q_count(capture) < 2) {
++        // Avoid running out of capture buffers
++        // In most cases the buffers will be returned quickly in which case
++        // we don't copy and can use the v4l2 buffers directly but sometimes
++        // ffmpeg seems to hold onto all of them for a long time (.mkv
++        // creation?) so avoid deadlock in those cases.
++        AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
++        if (buf == NULL)
++            goto fail_no_mem;
++
++        memcpy(buf->data, avpkt->data, avpkt->size);
++        av_buffer_unref(&avpkt->buf);  // Will recycle the V4L2 buffer
++
++        avpkt->buf = buf;
++        avpkt->data = buf->data;
++    }
++
++    capture->first_buf = 0;
++    return 0;
++
++fail_no_mem:
++    av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n");
++    ret = AVERROR(ENOMEM);
++    av_packet_unref(avpkt);
++    return ret;
+ }
+ 
+ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+@@ -340,6 +673,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+     uint32_t v4l2_fmt_output;
+     int ret;
+ 
++    av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
++
+     ret = ff_v4l2_m2m_create_context(priv, &s);
+     if (ret < 0)
+         return ret;
+@@ -347,13 +682,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+     capture = &s->capture;
+     output  = &s->output;
+ 
++    s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
++
+     /* common settings output/capture */
+     output->height = capture->height = avctx->height;
+     output->width = capture->width = avctx->width;
+ 
+     /* output context */
+     output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
+-    output->av_pix_fmt = avctx->pix_fmt;
++    output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
++            avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
++            AV_PIX_FMT_YUV420P;
+ 
+     /* capture context */
+     capture->av_codec_id = avctx->codec_id;
+@@ -372,7 +711,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
+         v4l2_fmt_output = output->format.fmt.pix.pixelformat;
+ 
+     pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
+-    if (pix_fmt_output != avctx->pix_fmt) {
++    if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
+         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
+         av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
+         return AVERROR(EINVAL);
+@@ -390,9 +729,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx)
+ #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
+ 
+ #define V4L_M2M_CAPTURE_OPTS \
+-    V4L_M2M_DEFAULT_OPTS,\
++    { "num_output_buffers", "Number of buffers in the output context",\
++        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\
+     { "num_capture_buffers", "Number of buffers in the capture context", \
+-        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS }
++        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS }
+ 
+ static const AVOption mpeg4_options[] = {
+     V4L_M2M_CAPTURE_OPTS,
+diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c
+new file mode 100644
+index 0000000000..5b3fb958fa
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.c
+@@ -0,0 +1,84 @@
++#include <memory.h>
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_decode_q.h"
++
++int decode_q_in_q(const req_decode_ent * const d)
++{   // Reads the in_q flag without taking any lock
++    return d->in_q;
++}
++
++// Append d to the tail of q under q_lock and flag it as in_q
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
++{
++    pthread_mutex_lock(&q->q_lock);
++    if (q->head != NULL) {
++        d->prev = q->tail;
++        q->tail->next = d;
++    }
++    else {
++        d->prev = NULL;
++        q->head = d;
++    }
++    q->tail = d;
++    d->in_q = 1;
++    d->next = NULL;
++    pthread_mutex_unlock(&q->q_lock);
++}
++
++// Unlink d from q if it is queued; waiters are woken only when d was the head
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
++{
++    int was_head;
++
++    if (!d->in_q)
++        return;
++
++    pthread_mutex_lock(&q->q_lock);
++    was_head = (d->prev == NULL);
++    // Fix up the forward link (or the head pointer)
++    if (was_head)
++        q->head = d->next;
++    else
++        d->prev->next = d->next;
++    // Fix up the backward link (or the tail pointer)
++    if (d->next == NULL)
++        q->tail = d->prev;
++    else
++        d->next->prev = d->prev;
++
++    // Not strictly needed but makes debug easier
++    d->next = NULL;
++    d->prev = NULL;
++    d->in_q = 0;
++    pthread_mutex_unlock(&q->q_lock);
++
++    if (was_head)
++        pthread_cond_broadcast(&q->q_cond);
++}
++
++// Block the caller on q_cond until d is the head entry of q
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
++{
++    pthread_mutex_lock(&q->q_lock);
++    // The head is re-checked after every wakeup
++    while (d != q->head)
++        pthread_cond_wait(&q->q_cond, &q->q_lock);
++    pthread_mutex_unlock(&q->q_lock);
++}
++
++void decode_q_uninit(req_decode_q * const q)
++{   // Destroy the condvar and mutex held in q
++    pthread_cond_destroy(&q->q_cond);
++    pthread_mutex_destroy(&q->q_lock);
++}
++
++void decode_q_init(req_decode_q * const q)
++{   // Zero the whole queue first, then create its condvar and mutex
++    memset(q, 0, sizeof(*q));
++    pthread_cond_init(&q->q_cond, NULL);
++    pthread_mutex_init(&q->q_lock, NULL);
++}
++
++
+diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h
+new file mode 100644
+index 0000000000..af7bbe1de4
+--- /dev/null
++++ b/libavcodec/v4l2_req_decode_q.h
+@@ -0,0 +1,25 @@
++#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
++#define AVCODEC_V4L2_REQ_DECODE_Q_H
++
++#include <pthread.h>  // pthread_mutex_t / pthread_cond_t used below
++
++typedef struct req_decode_ent {
++    struct req_decode_ent * next;  // Towards the tail; NULL at the tail
++    struct req_decode_ent * prev;  // Towards the head; NULL at the head
++    int in_q;                      // 1 while linked into a queue, else 0
++} req_decode_ent;
++
++typedef struct req_decode_q {
++    pthread_mutex_t q_lock;        // Held while the list links are changed
++    pthread_cond_t q_cond;         // Broadcast when the head entry is removed
++    req_decode_ent * head;
++    req_decode_ent * tail;
++} req_decode_q;
++
++int decode_q_in_q(const req_decode_ent * const d);
++void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
++void decode_q_uninit(req_decode_q * const q);
++void decode_q_init(req_decode_q * const q);
++#endif
+diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c
+new file mode 100644
+index 0000000000..ee8527ba1f
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.c
+@@ -0,0 +1,451 @@
++#include <errno.h>
++#include <fcntl.h>
++#include <libudev.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++
++#include <sys/ioctl.h>
++#include <sys/sysmacros.h>
++
++#include <linux/media.h>
++#include <linux/videodev2.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_utils.h"
++
++struct decdev {
++    enum v4l2_buf_type src_type;  // V4L2 buffer type this entry was probed with
++    uint32_t src_fmt_v4l2;        // V4L2 pixelformat enumerated for that buffer type
++    const char * vname;           // Video device path
++    const char * mname;           // Media device path
++};
++
++struct devscan {
++    struct decdev env;            // Override entry; paths come from getenv() in devscan_build
++    unsigned int dev_size;        // Allocated capacity of devs[]
++    unsigned int dev_count;       // Number of entries of devs[] in use
++    struct decdev *devs;          // Array grown by devscan_add
++};
++
++static int video_src_pixfmt_supported(uint32_t fmt)
++{   // Currently accepts every pixelformat: fmt is not examined
++    return 1;
++}
++
++// Zero *format then set its type, size and pixelformat (plus sizeimage, see below)
++static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
++                  unsigned int width, unsigned int height,
++                  unsigned int pixelformat)
++{
++    // Only OUTPUT types get a non-zero sizeimage (4 * 1024 * 1024)
++    const unsigned int sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
++    memset(format, 0, sizeof(*format));
++    format->type = type;
++    if (!V4L2_TYPE_IS_MULTIPLANAR(type)) {
++        struct v4l2_pix_format *const pix = &format->fmt.pix;
++        pix->width = width;
++        pix->height = height;
++        pix->sizeimage = sizeimage;
++        pix->pixelformat = pixelformat;
++    } else {
++        struct v4l2_pix_format_mplane *const mp = &format->fmt.pix_mp;
++        mp->width = width;
++        mp->height = height;
++        mp->plane_fmt[0].sizeimage = sizeimage;
++        mp->pixelformat = pixelformat;
++    }
++}
++
++// Build a format with v4l2_setup_format() and apply it via VIDIOC_S_FMT; 0 or -errno
++static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
++            unsigned int width, unsigned int height)
++{
++    struct v4l2_format fmt;
++
++    v4l2_setup_format(&fmt, type, width, height, pixelformat);
++    return ioctl(video_fd, VIDIOC_S_FMT, &fmt) == 0 ? 0 : -errno;
++}
++
++// VIDIOC_QUERYCAP wrapper: stores device_caps when V4L2_CAP_DEVICE_CAPS is set,
++// otherwise the plain capabilities field.  Returns 0 or -errno.
++static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
++{
++    struct v4l2_capability cap = { 0 };
++
++    if (ioctl(video_fd, VIDIOC_QUERYCAP, &cap) < 0)
++        return -errno;
++
++    // The out-parameter is optional
++    if (capabilities == NULL)
++        return 0;
++
++    *capabilities = (cap.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
++        cap.device_caps : cap.capabilities;
++
++    return 0;
++}
++
++// Append an entry (with copies of both paths) to scan->devs; returns 0 or -ENOMEM
++static int devscan_add(struct devscan *const scan,
++                       enum v4l2_buf_type src_type,
++                       uint32_t src_fmt_v4l2,
++                       const char * vname,
++                       const char * mname)
++{
++    struct decdev *ent;
++
++    if (scan->dev_count >= scan->dev_size) {
++        const unsigned int cap = scan->dev_size ? scan->dev_size * 2 : 4;
++        struct decdev *const grown = realloc(scan->devs, cap * sizeof(*scan->devs));
++        if (grown == NULL)
++            return -ENOMEM;
++        scan->dev_size = cap;
++        scan->devs = grown;
++    }
++
++    // Fill the next free slot; it only counts once dev_count is incremented
++    ent = &scan->devs[scan->dev_count];
++    ent->src_type = src_type;
++    ent->src_fmt_v4l2 = src_fmt_v4l2;
++    if ((ent->vname = strdup(vname)) == NULL)
++        return -ENOMEM;
++    if ((ent->mname = strdup(mname)) == NULL) {
++        free((char *)ent->vname);
++        return -ENOMEM;
++    }
++    scan->dev_count++;
++    return 0;
++}
++
++// Free *pScan and the strings held by its devs[] entries, leaving *pScan NULL
++void devscan_delete(struct devscan **const pScan)
++{
++    struct devscan * const scan = *pScan;
++    unsigned int idx = 0;
++    if (scan == NULL)
++        return;
++    *pScan = NULL;
++    while (idx < scan->dev_count) {
++        const struct decdev *const d = &scan->devs[idx++];
++        free((char*)d->mname);
++        free((char*)d->vname);
++    }
++    free(scan->devs);
++    free(scan);
++}
++
++#define REQ_BUF_CAPS (\
++    V4L2_BUF_CAP_SUPPORTS_DMABUF |\
++    V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
++    V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
++
++// Enumerate the formats of queue type_v4l2 on fd and add to scan each one that
++// can be set and whose REQBUFS capabilities include all of REQ_BUF_CAPS.
++static void probe_formats(void * const dc,
++              struct devscan *const scan,
++              const int fd,
++              const unsigned int type_v4l2,
++              const char *const mpath,
++              const char *const vpath)
++{
++    unsigned int i;
++    for (i = 0;; ++i) {
++        struct v4l2_fmtdesc fmtdesc = { .index = i, .type = type_v4l2 };
++        // NOTE(review): count = 0 presumably just queries capabilities - confirm
++        struct v4l2_requestbuffers rbufs = { .count = 0, .type = type_v4l2, .memory = V4L2_MEMORY_MMAP };
++        int rv;
++
++        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++            if (errno == EINTR)
++                continue;
++            if (errno != EINVAL)
++                request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
++            return;
++        }
++        if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
++            continue;
++
++        if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
++            request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
++            continue;
++        }
++
++        // Retry only on EINTR; any other failure skips to the next format
++        while ((rv = ioctl(fd, VIDIOC_REQBUFS, &rbufs)) != 0 && errno == EINTR)
++            /* retry */;
++        if (rv != 0) {
++            request_debug(dc, "%s: Reqbufs failed\n", vpath);
++            continue;
++        }
++
++        if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
++            request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
++            continue;
++        }
++
++        request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
++                 mpath, vpath, fmtdesc.pixelformat, type_v4l2);
++        if (devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath) != 0)
++            return;  // Could not store the entry (-ENOMEM): stop probing
++    }
++}
++
++
++// Open the device node behind 'device' and, if it is a streaming mem2mem
++// device, probe its OUTPUT formats into scan.  Returns 0 or a negative errno.
++static int probe_video_device(void * const dc,
++                   struct udev_device *const device,
++                   struct devscan *const scan,
++                   const char *const mpath)
++{
++    const char *const path = udev_device_get_devnode(device);
++    unsigned int caps = 0;
++    int video_fd;
++    int ret;
++
++    // Nothing is open yet on these first two failures so return directly
++    if (path == NULL) {
++        request_err(dc, "%s: get video device devnode failed\n", __func__);
++        return -EINVAL;
++    }
++
++    video_fd = open(path, O_RDWR, 0);
++    if (video_fd == -1) {
++        ret = -errno;
++        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
++        return ret;
++    }
++
++    ret = v4l2_query_capabilities(video_fd, &caps);
++    if (ret < 0) {
++        request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto done;
++    }
++
++    request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, caps);
++
++    // Either missing capability below is reported as -EINVAL
++    ret = -EINVAL;
++    if ((caps & V4L2_CAP_STREAMING) == 0) {
++        request_debug(dc, "%s: missing required streaming capability\n", __func__);
++        goto done;
++    }
++
++    if ((caps & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M)) == 0) {
++        request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
++        goto done;
++    }
++    ret = 0;
++
++    /* Should check capture formats too... */
++    if ((caps & V4L2_CAP_VIDEO_M2M) != 0)
++        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
++    if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
++        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
++
++done:
++    // Shared exit for success and for every failure after open()
++    close(video_fd);
++    return ret;
++}
++
++// Probe every V4L video interface of one media device; returns 0 or a negative errno
++static int probe_media_device(void * const dc,
++                   struct udev_device *const device,
++                   struct devscan *const scan)
++{
++    int ret = 0;  // Result when the topology holds no V4L video interface
++    int rv;
++    struct media_device_info device_info = { 0 };
++    struct media_v2_topology topology = { 0 };
++    struct media_v2_interface *interfaces = NULL;
++    struct udev *udev = udev_device_get_udev(device);
++    struct udev_device *video_device;
++    dev_t devnum;
++    int media_fd = -1;
++
++    const char *path = udev_device_get_devnode(device);
++    if (!path) {
++        request_err(dc, "%s: get media device devnode failed\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    media_fd = open(path, O_RDWR, 0);
++    if (media_fd < 0) {
++        ret = -errno;
++        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++    // First G_TOPOLOGY call, with no array pointers set: used only for num_interfaces
++    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    if (topology.num_interfaces <= 0) {
++        request_err(dc, "%s: media device has no interfaces\n", __func__);
++        ret = -EINVAL;
++        goto fail;
++    }
++
++    interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
++    if (!interfaces) {
++        request_err(dc, "%s: allocating media interface struct failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++    // Second call, now with ptr_interfaces set, fills the interfaces array
++    topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
++    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
++    if (rv < 0) {
++        ret = -errno;
++        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
++        goto fail;
++    }
++
++    for (int i = 0; i < topology.num_interfaces; i++) {
++        if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
++            continue;
++
++        devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
++        video_device = udev_device_new_from_devnum(udev, 'c', devnum);
++        if (!video_device) {
++            ret = -errno;
++            request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
++            continue;
++        }
++
++        ret = probe_video_device(dc, video_device, scan, path);
++        udev_device_unref(video_device);
++        if (ret != 0)
++            goto fail;
++    }
++
++fail:
++    free(interfaces);
++    if (media_fd != -1)
++        close(media_fd);
++    return ret;
++}
++
++const char *decdev_media_path(const struct decdev *const dev)
++{   // A NULL dev yields NULL
++    return dev != NULL ? dev->mname : NULL;
++}
++
++const char *decdev_video_path(const struct decdev *const dev)
++{   // A NULL dev yields NULL
++    return dev != NULL ? dev->vname : NULL;
++}
++
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
++{   // A NULL dev yields 0
++    return dev != NULL ? dev->src_type : 0;
++}
++
++uint32_t decdev_src_pixelformat(const struct decdev *const dev)
++{   // A NULL dev yields 0
++    return dev != NULL ? dev->src_fmt_v4l2 : 0;
++}
++
++
++// Env override if both paths are set; else first entry (fmt 0) or first fmt match
++const struct decdev *devscan_find(struct devscan *const scan,
++                  const uint32_t src_fmt_v4l2)
++{
++    unsigned int n;
++
++    if (scan->env.mname != NULL && scan->env.vname != NULL)
++        return &scan->env;
++    if (scan->dev_count == 0)
++        return NULL;
++    if (src_fmt_v4l2 == 0)
++        return &scan->devs[0];
++    for (n = 0; n < scan->dev_count; ++n)
++        if (scan->devs[n].src_fmt_v4l2 == src_fmt_v4l2)
++            return &scan->devs[n];
++    return NULL;
++}
++
++// Create a devscan in *pscan: the env-var override pair if both are set, else a
++// udev scan of all "media" devices.  Returns 0, or -ENOMEM with *pscan NULL.
++int devscan_build(void * const dc, struct devscan **pscan)
++{
++    int ret;
++    struct udev *udev = NULL;  // NULL until created so the fail path can test it
++    struct udev_enumerate *enumerate;
++    struct udev_list_entry *devices, *entry;
++    struct udev_device *device;
++    struct devscan * scan;
++
++    *pscan = NULL;
++    scan = calloc(1, sizeof(*scan));
++    if (!scan) {
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
++    scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
++    if (scan->env.mname && scan->env.vname) {
++        request_info(dc, "Media/video device env overrides found: %s,%s\n",
++                 scan->env.mname, scan->env.vname);
++        *pscan = scan;
++        return 0;
++    }
++
++    udev = udev_new();
++    if (!udev) {
++        request_err(dc, "%s: allocating udev context failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    enumerate = udev_enumerate_new(udev);
++    if (!enumerate) {
++        request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
++        ret = -ENOMEM;
++        goto fail;
++    }
++
++    udev_enumerate_add_match_subsystem(enumerate, "media");
++    udev_enumerate_scan_devices(enumerate);
++
++    devices = udev_enumerate_get_list_entry(enumerate);
++    udev_list_entry_foreach(entry, devices) {
++        const char *path = udev_list_entry_get_name(entry);
++        if (!path)
++            continue;
++
++        device = udev_device_new_from_syspath(udev, path);
++        if (!device)
++            continue;
++
++        probe_media_device(dc, device, scan);
++        udev_device_unref(device);
++    }
++
++    udev_enumerate_unref(enumerate);
++    udev_unref(udev);
++
++    *pscan = scan;
++    return 0;
++
++fail:
++    if (udev)
++        udev_unref(udev);
++    devscan_delete(&scan);
++    return ret;
++}
++
+diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
+new file mode 100644
+index 0000000000..956d9234f1
+--- /dev/null
++++ b/libavcodec/v4l2_req_devscan.h
+@@ -0,0 +1,23 @@
++#ifndef _DEVSCAN_H_
++#define _DEVSCAN_H_
++
++#include <stdint.h>
++
++struct devscan;
++struct decdev;
++enum v4l2_buf_type;
++
++/* These return pointers to data in the devscan structure and so are valid
++ * for the lifetime of that
++ */
++const char *decdev_media_path(const struct decdev *const dev);
++const char *decdev_video_path(const struct decdev *const dev);
++enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
++uint32_t decdev_src_pixelformat(const struct decdev *const dev);
++
++const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
++
++int devscan_build(void * const dc, struct devscan **pscan);
++void devscan_delete(struct devscan **const pScan);
++
++#endif
+diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
+new file mode 100644
+index 0000000000..acc0366e76
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.c
+@@ -0,0 +1,369 @@
++#include <stdatomic.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <unistd.h>
++#include <inttypes.h>
++#include <fcntl.h>
++#include <errno.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <linux/mman.h>
++#include <linux/dma-buf.h>
++#include <linux/dma-heap.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_utils.h"
++
++#define DMABUF_NAME1  "/dev/dma_heap/linux,cma"
++#define DMABUF_NAME2  "/dev/dma_heap/reserved"
++
++#define TRACE_ALLOC 0
++
++struct dmabufs_ctl;
++struct dmabuf_h;
++
++struct dmabuf_fns {
++    int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size);  // Non-zero return is treated as failure by dmabuf_realloc
++    void (*buf_free)(struct dmabuf_h * dh);      // NOTE(review): presumably undoes buf_alloc - confirm
++    int (*ctl_new)(struct dmabufs_ctl * dbsc);   // NOTE(review): presumably sets up a new ctl - confirm
++    void (*ctl_free)(struct dmabufs_ctl * dbsc); // NOTE(review): presumably tears a ctl down - confirm
++};
++
++struct dmabufs_ctl {
++    atomic_int ref_count;            // NOTE(review): presumably a reference count on this ctl - confirm
++    int fd;                          // NOTE(review): presumably the opened heap device - confirm
++    size_t page_size;
++    void * v;                        // NOTE(review): looks like backend-private data - confirm
++    const struct dmabuf_fns * fns;   // Copied into each dmabuf_h created by dmabuf_realloc
++};
++
++struct dmabuf_h {
++    int fd;          // dmabuf fd, or -1 if the handle has none (dmabuf_import_mmap)
++    size_t size;     // Allocated / mapped size in bytes
++    size_t len;      // Bytes in use (dmabuf_len / dmabuf_len_set)
++    void * mapptr;   // CPU mapping, MAP_FAILED while not mapped
++    void * v;        // Not touched by the CMA backend; presumably backend-private - TODO confirm
++    const struct dmabuf_fns * fns;  // Backend operations; left NULL by the import functions
++};
++
++#if TRACE_ALLOC
++static unsigned int total_bufs = 0;
++static size_t total_size = 0;
++#endif
++
++/* Wrap an existing CPU mapping in a dmabuf handle.
++ * The handle has no fd (fd = -1), so dmabuf_sync() on it is a no-op.
++ * Returns NULL if mapptr is MAP_FAILED or if allocation fails.
++ */
++struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size)
++{
++    struct dmabuf_h * const h =
++        (mapptr == MAP_FAILED) ? NULL : malloc(sizeof(*h));
++
++    if (h == NULL)
++        return NULL;
++
++    *h = (struct dmabuf_h) {    // members not named here (len, v, fns) are zeroed
++        .mapptr = mapptr,
++        .size = size,
++        .fd = -1
++    };
++    return h;
++}
++
++// Create a handle from an existing fd. The fd is dup()ed, so the caller keeps
++// the one passed in. Returns NULL if size is 0 or on any failure.
++struct dmabuf_h * dmabuf_import(int fd, size_t size)
++{
++    struct dmabuf_h *dh;
++
++    // Check size before dup() so a zero size cannot leak the duplicated fd
++    if (size == 0 || (fd = dup(fd)) < 0)
++        return NULL;
++
++    dh = malloc(sizeof(*dh));
++    if (!dh) {
++        close(fd);
++        return NULL;
++    }
++
++    *dh = (struct dmabuf_h) {
++        .fd = fd,
++        .size = size,
++        .mapptr = MAP_FAILED
++    };
++#if TRACE_ALLOC
++    ++total_bufs;
++    total_size += dh->size;
++    request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++    return dh;
++}
++
++/* Get a buffer of at least size bytes; old contents need not be preserved.
++ * A big-enough old is returned as-is, otherwise old is freed and a new buffer
++ * is allocated via dbsc's backend. Returns NULL if size is 0 or alloc fails.
++ */
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
++{
++    struct dmabuf_h * nbuf;
++
++    if (old != NULL && old->size >= size)
++        return old;
++    if (old != NULL)
++        dmabuf_free(old);
++
++    if (size == 0)
++        return NULL;
++    nbuf = malloc(sizeof(*nbuf));
++    if (nbuf == NULL)
++        return NULL;
++
++    *nbuf = (struct dmabuf_h){
++        .fd = -1,
++        .mapptr = MAP_FAILED,
++        .fns = dbsc->fns
++    };
++
++    if (nbuf->fns->buf_alloc(dbsc, nbuf, size) != 0) {
++        free(nbuf);
++        return NULL;
++    }
++#if TRACE_ALLOC
++    ++total_bufs;
++    total_size += nbuf->size;
++    request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, nbuf->size, total_size, total_bufs);
++#endif
++    return nbuf;
++}
++
++// DMA_BUF_IOCTL_SYNC wrapper: no-op without an fd, retries EINTR, returns 0 or -errno
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
++{
++    struct dma_buf_sync sync = { .flags = flags };
++    if (dh->fd == -1)
++        return 0;
++    for (;;) {
++        if (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) != -1)
++            return 0;
++        if (errno != EINTR) {
++            const int err = errno;
++            request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
++            return -err;
++        }
++    }
++}
++
++int dmabuf_write_start(struct dmabuf_h * const dh)
++{   // Begin CPU write access: dmabuf_sync() with START | WRITE (does not map the buffer)
++    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
++}
++
++int dmabuf_write_end(struct dmabuf_h * const dh)
++{   // End CPU write access: dmabuf_sync() with END | WRITE; returns 0 or -errno
++    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
++}
++
++// Map the buffer if needed, then begin CPU read access. -1 if mapping fails
++int dmabuf_read_start(struct dmabuf_h * const dh)
++{
++    return dmabuf_map(dh) == NULL ? -1 :
++        dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
++}
++
++int dmabuf_read_end(struct dmabuf_h * const dh)
++{   // End CPU read access: dmabuf_sync() with END | READ; returns 0 or -errno
++    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
++}
++
++
++// Return the CPU mapping of dh, creating (and caching) it on first use.
++// Returns NULL if dh is NULL or if mmap fails.
++void * dmabuf_map(struct dmabuf_h * const dh)
++{
++    if (dh == NULL)
++        return NULL;
++    if (dh->mapptr == MAP_FAILED) {
++        dh->mapptr = mmap(NULL, dh->size, PROT_READ | PROT_WRITE,
++                          MAP_SHARED | MAP_POPULATE, dh->fd, 0);
++        if (dh->mapptr == MAP_FAILED) {
++            request_log("%s: Map failed\n", __func__);
++            return NULL;
++        }
++    }
++    return dh->mapptr;
++}
++
++// fd of the buffer, or -1 if dh is NULL.
++// May also be -1 for a valid handle that has no fd (dmabuf_import_mmap).
++int dmabuf_fd(const struct dmabuf_h * const dh)
++{
++    return dh == NULL ? -1 : dh->fd;
++}
++
++// Allocated size of the buffer in bytes.
++// A NULL handle reports 0.
++size_t dmabuf_size(const struct dmabuf_h * const dh)
++{
++    return dh == NULL ? 0 : dh->size;
++}
++
++// Number of bytes in use, as last stored by dmabuf_len_set().
++// A NULL handle reports 0.
++size_t dmabuf_len(const struct dmabuf_h * const dh)
++{
++    return dh == NULL ? 0 : dh->len;
++}
++
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
++{   // Record the number of bytes in use; unlike the getters, dh must not be NULL
++    dh->len = len;
++}
++
++// Free a handle (NULL is ignored): backend hook, unmap, close fd, free memory
++void dmabuf_free(struct dmabuf_h * dh)
++{
++    if (!dh)
++        return;
++#if TRACE_ALLOC
++    --total_bufs;
++    total_size -= dh->size;
++    request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
++#endif
++    // Imported handles (dmabuf_import / dmabuf_import_mmap) have fns == NULL
++    if (dh->fns != NULL && dh->fns->buf_free != NULL)
++        dh->fns->buf_free(dh);
++    if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL)
++        munmap(dh->mapptr, dh->size);
++    // No EINTR retry: Linux releases the fd even if close() is interrupted
++    if (dh->fd != -1)
++        close(dh->fd);
++    free(dh);
++}
++
++// Allocate a zeroed control structure (ref_count 0 == a single reference)
++// and let the backend's ctl_new() hook set it up. NULL on any failure.
++static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns)
++{
++    struct dmabufs_ctl * const ctl = calloc(1, sizeof(*ctl));
++
++    if (ctl == NULL)
++        return NULL;
++
++    ctl->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
++    ctl->fns = fns;
++    ctl->fd = -1;
++
++    if (fns->ctl_new(ctl) == 0)
++        return ctl;
++
++    // Backend init failed
++    free(ctl);
++    return NULL;
++}
++
++static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc)
++{   // Final teardown; called by dmabufs_ctl_unref() when the last reference goes
++    request_debug(NULL, "Free dmabuf ctl\n");
++
++    dbsc->fns->ctl_free(dbsc);  // Backend cleanup (the CMA backend closes its heap fd)
++
++    free(dbsc);
++}
++
++// Drop one reference and NULL the caller's pointer. ref_count holds the
++// number of refs beyond the first, so a previous value of 0 means this was
++// the last reference and the ctl is destroyed.
++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc)
++{
++    struct dmabufs_ctl * const ctl = *pDbsc;
++
++    if (ctl == NULL)
++        return;
++    *pDbsc = NULL;
++    if (atomic_fetch_sub(&ctl->ref_count, 1) == 0)
++        dmabufs_ctl_free(ctl);
++}
++
++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc)
++{   // Take an extra reference and return dbsc itself; dbsc must not be NULL
++    atomic_fetch_add(&dbsc->ref_count, 1);
++    return dbsc;
++}
++
++//-----------------------------------------------------------------------------
++//
++// Alloc dmabuf via CMA
++
++// Open the dma-heap device: try DMABUF_NAME1 first then DMABUF_NAME2,
++// retrying each open() while it is interrupted (EINTR).
++// Returns 0 with dbsc->fd set, or -1 if neither could be opened.
++static int ctl_cma_new(struct dmabufs_ctl * dbsc)
++{
++    static const char * const heaps[] = { DMABUF_NAME1, DMABUF_NAME2 };
++
++    for (size_t i = 0; i != sizeof(heaps) / sizeof(heaps[0]); ++i) {
++        do {
++            dbsc->fd = open(heaps[i], O_RDWR);
++        } while (dbsc->fd == -1 && errno == EINTR);
++        if (dbsc->fd != -1)
++            return 0;
++    }
++    request_log("Unable to open either %s or %s\n",
++            DMABUF_NAME1, DMABUF_NAME2);
++    return -1;
++}
++
++static void ctl_cma_free(struct dmabufs_ctl * dbsc)
++{
++    // Close the heap fd. No retry on EINTR: Linux has already released the
++    // fd by then, so a retry could close an fd another thread now owns.
++    if (dbsc->fd != -1)
++        close(dbsc->fd);
++}
++
++// Backend buf_alloc hook: allocate size bytes (rounded up to a whole number
++// of pages) from the dma-heap. Returns 0 with dh->fd / dh->size set,
++// or -errno on failure.
++static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size)
++{
++    struct dma_heap_allocation_data data = {
++        .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
++        .fd = 0,
++        .fd_flags = O_RDWR,
++        .heap_flags = 0
++    };
++
++    while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
++        const int err = errno;
++        if (err == EINTR)   // Interrupted, not failed: retry without logging
++            continue;
++        request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
++                (uint64_t)data.len, dbsc->fd, err, strerror(err));
++        return -err;
++    }
++
++    dh->fd = data.fd;
++    dh->size = (size_t)data.len;
++    return 0;
++}
++
++static void buf_cma_free(struct dmabuf_h * dh)
++{
++    // Nothing backend-specific needed: dmabuf_free() itself unmaps, closes the fd and frees dh
++}
++
++static const struct dmabuf_fns dmabuf_cma_fns = {   // Ops table for the CMA dma-heap backend (used by dmabufs_ctl_new)
++    .buf_alloc  = buf_cma_alloc,
++    .buf_free   = buf_cma_free,
++    .ctl_new    = ctl_cma_new,
++    .ctl_free   = ctl_cma_free,
++};
++
++struct dmabufs_ctl * dmabufs_ctl_new(void)
++{   // Public constructor: a ctl using the CMA dma-heap backend; NULL on failure
++    request_debug(NULL, "Dmabufs using CMA\n");
++    return dmabufs_ctl_new2(&dmabuf_cma_fns);
++}
++
+diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
+new file mode 100644
+index 0000000000..381ba2708d
+--- /dev/null
++++ b/libavcodec/v4l2_req_dmabufs.h
+@@ -0,0 +1,44 @@
++#ifndef DMABUFS_H
++#define DMABUFS_H
++
++#include <stddef.h>
++
++struct dmabufs_ctl;
++struct dmabuf_h;
++
++struct dmabufs_ctl * dmabufs_ctl_new(void);
++void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc);
++struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc);
++
++// Need not preserve old contents
++// On NULL return old buffer is freed
++struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
++
++static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
++    return dmabuf_realloc(dbsc, NULL, size);    // No old buffer, so this is always a fresh allocation
++}
++/* Create from existing fd - dups(fd) */
++struct dmabuf_h * dmabuf_import(int fd, size_t size);
++/* Import an existing mmap()ed region - returns NULL if mapptr == MAP_FAILED */
++struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size);
++
++void * dmabuf_map(struct dmabuf_h * const dh);
++
++/* flags are DMA_BUF_SYNC_xxx from linux/dma-buf.h */
++int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
++
++int dmabuf_write_start(struct dmabuf_h * const dh);
++int dmabuf_write_end(struct dmabuf_h * const dh);
++int dmabuf_read_start(struct dmabuf_h * const dh);
++int dmabuf_read_end(struct dmabuf_h * const dh);
++
++int dmabuf_fd(const struct dmabuf_h * const dh);
++/* Allocated size */
++size_t dmabuf_size(const struct dmabuf_h * const dh);
++/* Bytes in use */
++size_t dmabuf_len(const struct dmabuf_h * const dh);
++/* Set bytes in use */
++void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
++void dmabuf_free(struct dmabuf_h * dh);
++
++#endif
+diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c
+new file mode 100644
+index 0000000000..169b532832
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v1.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 1
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c
+new file mode 100644
+index 0000000000..42af98e156
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v2.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 2
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c
+new file mode 100644
+index 0000000000..dcc8d95632
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v3.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 3
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c
+new file mode 100644
+index 0000000000..c35579d8e0
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_v4.c
+@@ -0,0 +1,3 @@
++#define HEVC_CTRLS_VERSION 4
++#include "v4l2_req_hevc_vx.c"
++
+diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
+new file mode 100644
+index 0000000000..b98d8464ca
+--- /dev/null
++++ b/libavcodec/v4l2_req_hevc_vx.c
+@@ -0,0 +1,1360 @@
++// File included by v4l2_req_hevc_v* - not compiled on its own
++
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++
++#if HEVC_CTRLS_VERSION == 1
++#include "hevc-ctrls-v1.h"
++
++// Fixup renamed entries
++#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
++
++#elif HEVC_CTRLS_VERSION == 2
++#include "hevc-ctrls-v2.h"
++#elif HEVC_CTRLS_VERSION == 3
++#include "hevc-ctrls-v3.h"
++#elif HEVC_CTRLS_VERSION == 4
++#include <linux/v4l2-controls.h>
++#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
++#include "hevc-ctrls-v4.h"
++#endif
++#else
++#error Unknown HEVC_CTRLS_VERSION
++#endif
++
++#ifndef V4L2_CID_STATELESS_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_SPS                     V4L2_CID_MPEG_VIDEO_HEVC_SPS
++#define V4L2_CID_STATELESS_HEVC_PPS                     V4L2_CID_MPEG_VIDEO_HEVC_PPS
++#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS            V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX          V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
++#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS           V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
++#define V4L2_CID_STATELESS_HEVC_DECODE_MODE             V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
++#define V4L2_CID_STATELESS_HEVC_START_CODE              V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
++
++#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED     V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
++#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED     V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
++#define V4L2_STATELESS_HEVC_START_CODE_NONE             V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
++#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B          V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
++#endif
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++
++#include <semaphore.h>
++#include <pthread.h>
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++// Attached to buf[0] in frame
++// Pooled in hwcontext so generally create once - 1/frame
++typedef struct V4L2MediaReqDescriptor {
++    AVDRMFrameDescriptor drm;
++
++    // Media
++    uint64_t timestamp;         // Matched against DPB entry timestamps (see frame_capture_dpb)
++    struct qent_dst * qe_dst;
++
++    // Decode only - should be NULL by the time we emit the frame
++    struct req_decode_ent decode_ent;
++
++    struct media_request *req;
++    struct qent_src *qe_src;
++
++#if HEVC_CTRLS_VERSION >= 2
++    struct v4l2_ctrl_hevc_decode_params dec;
++#endif
++
++    size_t num_slices;          // Entries in use in slice_params[] / slices[]
++    size_t alloced_slices;      // Entries allocated in slice_params[] / slices[]
++    struct v4l2_ctrl_hevc_slice_params * slice_params;  // Grown by slice_add()
++    struct slice_info * slices;                         // Grown by slice_add()
++
++    size_t num_offsets;         // Entries in use in offsets[]
++    size_t alloced_offsets;     // Entries allocated in offsets[]
++    uint32_t *offsets;          // Grown by offsets_add(); values are stored minus 1
++
++} V4L2MediaReqDescriptor;
++
++struct slice_info {     // Element type of V4L2MediaReqDescriptor.slices[]
++    const uint8_t * ptr;    // presumably the start of the slice's data - TODO confirm
++    size_t len; // bytes
++    size_t n_offsets;       // presumably this slice's entry count in rd->offsets[] - TODO confirm
++};
++
++// Handy container for accumulating controls before setting
++struct req_controls {
++    int has_scaling;    // presumably non-zero when scaling_matrix holds valid data - TODO confirm
++    struct timeval tv;
++    struct v4l2_ctrl_hevc_sps sps;      // Same type as fill_sps() writes
++    struct v4l2_ctrl_hevc_pps pps;      // Same type as fill_pps() writes
++    struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;    // Same type as fill_scaling_matrix() writes
++};
++
++//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
++
++
++// Translate the pixelformat of a V4L2 format (single- or multi-planar) into
++// the matching FFmpeg pixel format; AV_PIX_FMT_NONE if there is no match
++static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
++{
++    const uint32_t v4l2_fmt = V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
++        format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat;
++
++    if (v4l2_fmt == V4L2_PIX_FMT_YUV420)
++        return AV_PIX_FMT_YUV420P;
++    if (v4l2_fmt == V4L2_PIX_FMT_NV12)
++        return AV_PIX_FMT_NV12;
++#if CONFIG_SAND
++    if (v4l2_fmt == V4L2_PIX_FMT_NV12_COL128)
++        return AV_PIX_FMT_RPI4_8;
++    if (v4l2_fmt == V4L2_PIX_FMT_NV12_10_COL128)
++        return AV_PIX_FMT_RPI4_10;
++#endif
++
++    return AV_PIX_FMT_NONE;
++}
++
++// Timestamp stored in the V4L2MediaReqDescriptor that frame->data[0] points at
++static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
++{
++    return ((const V4L2MediaReqDescriptor *)frame->data[0])->timestamp;
++}
++
++// Store dpb_stamp in the V4L2MediaReqDescriptor that frame->data[0] points at
++static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
++{
++    ((V4L2MediaReqDescriptor *)frame->data[0])->timestamp = dpb_stamp;
++}
++
++static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
++{   // Fill the V4L2 prediction weight table from the slice header in h->sh
++    int32_t luma_weight_denom, chroma_weight_denom;
++    const SliceHeader *sh = &h->sh;
++    // The table is left untouched unless weighted prediction applies to this slice type
++    if (sh->slice_type == HEVC_SLICE_I ||
++        (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
++        (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
++        return;
++
++    table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
++
++    if (h->ps.sps->chroma_format_idc)
++        table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
++
++    luma_weight_denom = (1 << sh->luma_log2_weight_denom);
++    chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
++    // Weights are written as deltas from (1 << log2 denominator); at most 15 entries per list
++    for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
++        table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
++        table->luma_offset_l0[i] = sh->luma_offset_l0[i];
++        table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
++        table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
++        table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
++        table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
++    }
++    // The L1 entries are only filled for B slices
++    if (sh->slice_type != HEVC_SLICE_B)
++        return;
++
++    for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
++        table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
++        table->luma_offset_l1[i] = sh->luma_offset_l1[i];
++        table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
++        table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
++        table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
++        table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
++    }
++}
++
++#if HEVC_CTRLS_VERSION <= 2
++// Find which "current" reference picture set (short-term before/after or
++// long-term) holds the frame with this DPB timestamp.
++// Returns the matching V4L2_HEVC_DPB_ENTRY_RPS_* value, or 0 if none does.
++static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
++{
++    static const struct {
++        int set;
++        int v4l2_rps;
++    } rps_map[] = {
++        { ST_CURR_BEF, V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE },
++        { ST_CURR_AFT, V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER },
++        { LT_CURR,     V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR },
++    };
++
++    for (unsigned int s = 0; s != FF_ARRAY_ELEMS(rps_map); ++s) {
++        const int set = rps_map[s].set;
++        for (int i = 0; i < h->rps[set].nb_refs; i++) {
++            const HEVCFrame * const ref = h->rps[set].ref[i];
++            if (ref && timestamp == frame_capture_dpb(ref->frame))
++                return rps_map[s].v4l2_rps;
++        }
++    }
++
++    return 0;
++}
++#endif
++
++// Index of the DPB entry whose timestamp matches frame's.
++// Returns 0 if frame is NULL or no entry matches (so 0 is ambiguous).
++static unsigned int
++get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
++                  const struct v4l2_hevc_dpb_entry * const entries,
++                  const unsigned int num_entries)
++{
++    unsigned int idx = 0;
++
++    if (frame != NULL) {
++        const uint64_t wanted = frame_capture_dpb(frame->frame);
++
++        while (idx < num_entries && entries[idx].timestamp != wanted)
++            ++idx;
++    }
++
++    // Not found (or no frame) falls back to entry 0
++    return idx < num_entries ? idx : 0;
++}
++
++static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
++{   // Advance b past idx counted bytes; a 0x03 that follows 2+ zero bytes is skipped uncounted
++    unsigned int z = 0;     // Length of the current run of zero bytes
++    while (idx--) {
++        if (*b++ == 0) {
++            ++z;
++            if (z >= 2 && *b == 3) {    // NOTE(review): looks like 00 00 03 emulation-prevention handling - confirm
++                ++b;
++                z = 0;
++            }
++        }
++        else {
++            z = 0;
++        }
++    }
++    return b;
++}
++
++// Reserve the next entry in rd->slice_params[] / rd->slices[], growing both
++// arrays (8 entries at first, then doubling) when they are full.
++// Returns 0 and bumps rd->num_slices, or AVERROR(ENOMEM) with it unchanged.
++static int slice_add(V4L2MediaReqDescriptor * const rd)
++{
++    if (rd->num_slices >= rd->alloced_slices) {
++        const size_t new_n = rd->alloced_slices != 0 ? rd->alloced_slices * 2 : 8;
++        struct v4l2_ctrl_hevc_slice_params * const params =
++            av_realloc_array(rd->slice_params, new_n, sizeof(*params));
++        struct slice_info * infos;
++
++        if (params == NULL)
++            return AVERROR(ENOMEM);
++        rd->slice_params = params;
++        if ((infos = av_realloc_array(rd->slices, new_n, sizeof(*infos))) == NULL)
++            return AVERROR(ENOMEM);
++        rd->slices = infos;
++        rd->alloced_slices = new_n;
++    }
++    rd->num_slices++;
++    return 0;
++}
++
++static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
++{   // Append n values to rd->offsets[], each stored as offsets[i] - 1; returns 0 or AVERROR(ENOMEM)
++    if (rd->num_offsets + n > rd->alloced_offsets) {
++        size_t n2 = rd->alloced_offsets == 0 ? 128 : rd->alloced_offsets * 2; // this array's own capacity
++        void * p2;
++        while (rd->num_offsets + n > n2)
++            n2 *= 2;
++        if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
++            return AVERROR(ENOMEM);
++        rd->offsets = p2;
++        rd->alloced_offsets = n2;
++    }
++    for (size_t i = 0; i != n; ++i)
++        rd->offsets[rd->num_offsets++] = offsets[i] - 1;
++    return 0;
++}
++
++static unsigned int
++fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
++{   // Write one V4L2 DPB entry per reference frame in h->DPB; returns the number written
++    unsigned int i;
++    unsigned int n = 0;
++    const HEVCFrame * const pic = h->ref;
++    // Take every frame flagged short- or long-term reference, except the current picture (h->ref)
++    for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
++        const HEVCFrame * const frame = &h->DPB[i];
++        if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
++            struct v4l2_hevc_dpb_entry * const entry = entries + n++;
++            // NOTE(review): n is not bounded here - assumes entries[] can hold every reference frame
++            entry->timestamp = frame_capture_dpb(frame->frame);
++#if HEVC_CTRLS_VERSION <= 2
++            entry->rps = find_frame_rps_type(h, entry->timestamp);
++#else
++            entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
++                V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
++#endif
++            entry->field_pic = frame->frame->interlaced_frame;
++
++#if HEVC_CTRLS_VERSION <= 3
++            /* TODO: Interleaved: Get the POC for each field. */
++            entry->pic_order_cnt[0] = frame->poc;
++            entry->pic_order_cnt[1] = frame->poc;
++#else
++            entry->pic_order_cnt_val = frame->poc;
++#endif
++        }
++    }
++    return n;
++}
++
++static void fill_slice_params(const HEVCContext * const h,
++#if HEVC_CTRLS_VERSION >= 2
++                              const struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++                              struct v4l2_ctrl_hevc_slice_params *slice_params,
++                              uint32_t bit_size, uint32_t bit_offset)
++{   // Fill one V4L2 slice_params control from the current slice header (h->sh)
++    const SliceHeader * const sh = &h->sh;
++#if HEVC_CTRLS_VERSION >= 2
++    const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;     // v2+: DPB comes from the decode params
++    const unsigned int dpb_n = dec->num_active_dpb_entries;
++#else
++    struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;  // v1: DPB is embedded in slice_params
++    unsigned int dpb_n;
++#endif
++    unsigned int i;
++    RefPicList *rpl;
++    // The compound literal zeroes every member not named below (including flags)
++    *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
++        .bit_size = bit_size,
++#if HEVC_CTRLS_VERSION <= 3
++        .data_bit_offset = bit_offset,
++#else
++        .data_byte_offset = bit_offset / 8 + 1, // NOTE(review): confirm the +1 against what the caller passes
++#endif
++        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++        .slice_segment_addr = sh->slice_segment_addr,
++
++        /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
++        .nal_unit_type = h->nal_unit_type,
++        .nuh_temporal_id_plus1 = h->temporal_id + 1,
++
++        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++        .slice_type = sh->slice_type,
++        .colour_plane_id = sh->colour_plane_id,
++        .slice_pic_order_cnt = h->ref->poc,
++        .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
++        .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
++        .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
++        .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
++        .slice_qp_delta = sh->slice_qp_delta,
++        .slice_cb_qp_offset = sh->slice_cb_qp_offset,
++        .slice_cr_qp_offset = sh->slice_cr_qp_offset,
++        .slice_act_y_qp_offset = 0,
++        .slice_act_cb_qp_offset = 0,
++        .slice_act_cr_qp_offset = 0,
++        .slice_beta_offset_div2 = sh->beta_offset / 2,
++        .slice_tc_offset_div2 = sh->tc_offset / 2,
++
++        /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
++        .pic_struct = h->sei.picture_timing.picture_struct,
++
++#if HEVC_CTRLS_VERSION < 2
++        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
++        .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++        .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++        .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++#endif
++    };
++
++    if (sh->slice_sample_adaptive_offset_flag[0])
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
++
++    if (sh->slice_sample_adaptive_offset_flag[1])
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
++
++    if (sh->slice_temporal_mvp_enabled_flag)
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
++
++    if (sh->mvd_l1_zero_flag)
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
++
++    if (sh->cabac_init_flag)
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
++
++    if (sh->collocated_list == L0)
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
++
++    if (sh->disable_deblocking_filter_flag)
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
++
++    if (sh->slice_loop_filter_across_slices_enabled_flag)
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++    if (sh->dependent_slice_segment_flag)
++        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
++
++#if HEVC_CTRLS_VERSION < 2
++    dpb_n = fill_dpb_entries(h, dpb);
++    slice_params->num_active_dpb_entries = dpb_n;
++#endif
++    // Translate each reference picture into its index within the DPB entry array
++    if (sh->slice_type != HEVC_SLICE_I) {
++        rpl = &h->ref->refPicList[0];
++        for (i = 0; i < rpl->nb_refs; i++)
++            slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
++    }
++
++    if (sh->slice_type == HEVC_SLICE_B) {
++        rpl = &h->ref->refPicList[1];
++        for (i = 0; i < rpl->nb_refs; i++)
++            slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
++    }
++
++    fill_pred_table(h, &slice_params->pred_weight_table);
++
++    slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
++#if HEVC_CTRLS_VERSION <= 3
++    if (slice_params->num_entry_point_offsets > 256) {  // Clamped and logged only; the caller is not told
++        slice_params->num_entry_point_offsets = 256;
++        av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
++    }
++
++    for (i = 0; i < slice_params->num_entry_point_offsets; i++)
++        slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
++#endif
++}
++
++#if HEVC_CTRLS_VERSION >= 2
++static void
++fill_decode_params(const HEVCContext * const h,
++                   struct v4l2_ctrl_hevc_decode_params * const dec)
++{   // Fill the per-picture V4L2 decode_params control (POC, DPB, RPS POC lists, flags)
++    unsigned int i;
++
++    *dec = (struct v4l2_ctrl_hevc_decode_params){
++        .pic_order_cnt_val = h->poc,
++        .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
++        .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
++        .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
++    };
++
++    dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
++
++    // The docn does seem to ask that we fit our 32 bit signed POC into
++    // a U8 so... (To be fair 16 bits would be enough)
++    // Luckily we (Pi) don't use these fields
++    for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
++        dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;  // NOTE(review): ref[i] not NULL-checked here, unlike find_frame_rps_type()
++    for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
++        dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
++    for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
++        dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
++
++    if (IS_IRAP(h))
++        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
++    if (IS_IDR(h))
++        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
++    if (h->sh.no_output_of_prior_pics_flag)
++        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
++
++}
++#endif
++
++// Fill the V4L2 SPS control from FFmpeg's parsed SPS; members not set here are zeroed
++static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
++{
++    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
++    *ctrl = (struct v4l2_ctrl_hevc_sps) {
++        .chroma_format_idc = sps->chroma_format_idc,
++        .pic_width_in_luma_samples = sps->width,
++        .pic_height_in_luma_samples = sps->height,
++        .bit_depth_luma_minus8 = sps->bit_depth - 8,
++        .bit_depth_chroma_minus8 = sps->bit_depth - 8,
++        .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
++        .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
++        .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
++        .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
++        .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
++        .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
++        .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
++        .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
++        .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
++        .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
++        .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
++        .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
++        .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
++        .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
++        .num_short_term_ref_pic_sets = sps->nb_st_rps,
++        .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
++        .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
++    };
++
++    if (sps->separate_colour_plane_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
++
++    if (sps->scaling_list_enable_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
++
++    if (sps->amp_enabled_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
++
++    if (sps->sao_enabled)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
++
++    if (sps->pcm_enabled_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
++
++    if (sps->pcm.loop_filter_disable_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
++
++    if (sps->long_term_ref_pics_present_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
++
++    if (sps->sps_temporal_mvp_enabled_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
++
++    if (sps->sps_strong_intra_smoothing_enable_flag)
++        ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
++}
++
++// Copy FFmpeg's scaling lists into the V4L2 control layout.
++// 4x4, 8x8 and 16x16 have 6 matrices each; 32x32 has only 2, which are taken
++// from FFmpeg matrix ids 0 and 3.
++static void fill_scaling_matrix(const ScalingList * const sl,
++                                struct v4l2_ctrl_hevc_scaling_matrix * const sm)
++{
++    for (unsigned int m = 0; m != 6; ++m) {
++        for (unsigned int k = 0; k != 16; ++k)
++            sm->scaling_list_4x4[m][k] = sl->sl[0][m][k];
++        for (unsigned int k = 0; k != 64; ++k) {
++            sm->scaling_list_8x8[m][k]   = sl->sl[1][m][k];
++            sm->scaling_list_16x16[m][k] = sl->sl[2][m][k];
++        }
++        sm->scaling_list_dc_coef_16x16[m] = sl->sl_dc[0][m];
++    }
++    for (unsigned int m = 0; m != 2; ++m) {
++        for (unsigned int k = 0; k != 64; ++k)
++            sm->scaling_list_32x32[m][k] = sl->sl[3][m * 3][k];
++        sm->scaling_list_dc_coef_32x32[m] = sl->sl_dc[1][m * 3];
++    }
++}
++
++static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
++{ // Translate the parsed PPS (pps) into the V4L2 stateless HEVC PPS control (ctrl)
++    uint64_t flags = 0; // accumulates V4L2_HEVC_PPS_FLAG_* bits, one per enabled PPS flag
++
++    if (pps->dependent_slice_segments_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
++
++    if (pps->output_flag_present_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
++
++    if (pps->sign_data_hiding_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
++
++    if (pps->cabac_init_present_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
++
++    if (pps->constrained_intra_pred_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
++
++    if (pps->transform_skip_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
++
++    if (pps->cu_qp_delta_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
++
++    if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
++
++    if (pps->weighted_pred_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
++
++    if (pps->weighted_bipred_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
++
++    if (pps->transquant_bypass_enable_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
++
++    if (pps->tiles_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
++
++    if (pps->entropy_coding_sync_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
++
++    if (pps->loop_filter_across_tiles_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
++
++    if (pps->seq_loop_filter_across_slices_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
++
++    if (pps->deblocking_filter_override_enabled_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
++
++    if (pps->disable_dbf)
++        flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
++
++    if (pps->lists_modification_present_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
++
++    if (pps->slice_header_extension_present_flag)
++        flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
++
++    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
++    *ctrl = (struct v4l2_ctrl_hevc_pps) { // compound literal: fields not named here are zero-initialised
++        .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
++        .init_qp_minus26 = pps->pic_init_qp_minus26,
++        .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
++        .pps_cb_qp_offset = pps->cb_qp_offset,
++        .pps_cr_qp_offset = pps->cr_qp_offset,
++        .pps_beta_offset_div2 = pps->beta_offset / 2, // presumably HEVCPPS stores the doubled offset - confirm
++        .pps_tc_offset_div2 = pps->tc_offset / 2,
++        .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
++        .flags = flags
++    };
++
++
++    if (pps->tiles_enabled_flag) { // tile geometry is only written when tiles are enabled; otherwise it stays zero
++        ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
++        ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
++
++        for (int i = 0; i < pps->num_tile_columns; i++)
++            ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
++
++        for (int i = 0; i < pps->num_tile_rows; i++)
++            ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
++    }
++}
++
++// Called before finally returning the frame to the user
++// Set corrupt flag here as this is actually the frame structure that
++// is going to the user (in MT land each thread has its own pool)
++static int frame_post_process(void *logctx, AVFrame *frame)
++{ // Always returns 0; a failed decode is reported through AV_FRAME_FLAG_CORRUPT only
++    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)frame->data[0]; // data[0] holds the request descriptor
++
++//    av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
++    frame->flags &= ~AV_FRAME_FLAG_CORRUPT; // start from "not corrupt"; set again below on failure
++    if (rd->qe_dst) {
++        MediaBufsStatus stat = qent_dst_wait(rd->qe_dst); // wait on the destination buffer entry and fetch its status
++        if (stat != MEDIABUFS_STATUS_SUCCESS) {
++            av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
++            frame->flags |= AV_FRAME_FLAG_CORRUPT;
++        }
++    }
++
++    return 0;
++}
++
++static inline struct timeval cvt_dpb_to_tv(uint64_t t)
++{
++    const uint64_t usecs = t / 1000; // inverse of the * 1000 scaling in cvt_timestamp_to_dpb
++    return (struct timeval){
++        .tv_sec = usecs / 1000000,
++        .tv_usec = usecs % 1000000
++    };
++}
++
++static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
++{
++    return 1000 * (uint64_t)t; // widen first so the product is computed in 64 bits
++}
++
++static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
++                                         av_unused const uint8_t *buffer,
++                                         av_unused uint32_t size)
++{ // Per-frame setup: queue the frame for decode, stamp it, hook post-processing, ensure a dst buffer
++    const HEVCContext *h = avctx->priv_data;
++    V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++//    av_log(NULL, AV_LOG_INFO, "%s\n", __func__);
++    decode_q_add(&ctx->decode_q, &rd->decode_ent); // end_frame waits on this entry and removes it
++
++    rd->num_slices = 0;
++    ctx->timestamp++; // per-context counter, so each started frame gets a new timestamp
++    rd->timestamp = cvt_timestamp_to_dpb(ctx->timestamp);
++
++    {
++        FrameDecodeData * const fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
++        fdd->post_process = frame_post_process; // sets AV_FRAME_FLAG_CORRUPT if the decode failed
++    }
++
++    // qe_dst needs to be bound to the data buffer and only returned when that is
++    if (!rd->qe_dst)
++    {
++        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++            return AVERROR(ENOMEM); // NOTE(review): rd is still in decode_q here - confirm the caller runs abort_frame
++        }
++    }
++
++    ff_thread_finish_setup(avctx); // Allow next thread to enter rpi_hevc_start_frame
++
++    return 0;
++}
++
++// Object fd & size will be zapped by this & need setting later
++static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
++{ // Fill desc (one object, one layer, two planes) from a V4L2 format; returns 0, or -1 for an unhandled pixelformat
++    AVDRMLayerDescriptor *layer = &desc->layers[0];
++    unsigned int width;
++    unsigned int height;
++    unsigned int bpl;
++    uint32_t pixelformat;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) { // geometry lives in a different union member per buffer type
++        width       = format->fmt.pix_mp.width;
++        height      = format->fmt.pix_mp.height;
++        pixelformat = format->fmt.pix_mp.pixelformat;
++        bpl         = format->fmt.pix_mp.plane_fmt[0].bytesperline;
++    }
++    else {
++        width       = format->fmt.pix.width;
++        height      = format->fmt.pix.height;
++        pixelformat = format->fmt.pix.pixelformat;
++        bpl         = format->fmt.pix.bytesperline;
++    }
++
++    switch (pixelformat) { // map the V4L2 fourcc to a DRM fourcc plus format modifier
++    case V4L2_PIX_FMT_NV12:
++        layer->format = DRM_FORMAT_NV12;
++        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++        break;
++#if CONFIG_SAND
++    case V4L2_PIX_FMT_NV12_COL128:
++        layer->format = DRM_FORMAT_NV12;
++        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl); // bytesperline is used as the column height
++        break;
++    case V4L2_PIX_FMT_NV12_10_COL128:
++        layer->format = DRM_FORMAT_P030;
++        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
++        break;
++#endif
++#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
++    case V4L2_PIX_FMT_SUNXI_TILED_NV12:
++        layer->format = DRM_FORMAT_NV12;
++        desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
++        break;
++#endif
++#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
++    case V4L2_PIX_FMT_NV15:
++        layer->format = DRM_FORMAT_NV15;
++        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++        break;
++#endif
++    case V4L2_PIX_FMT_NV16:
++        layer->format = DRM_FORMAT_NV16;
++        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++        break;
++#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
++    case V4L2_PIX_FMT_NV20:
++        layer->format = DRM_FORMAT_NV20;
++        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
++        break;
++#endif
++    default:
++        return -1; // pixelformat not handled; desc is left only partly written
++    }
++
++    desc->nb_objects = 1;
++    desc->objects[0].fd = -1; // placeholders: the caller sets fd and size afterwards
++    desc->objects[0].size = 0;
++
++    desc->nb_layers = 1;
++    layer->nb_planes = 2; // every format handled above is described as two planes in object 0
++
++    layer->planes[0].object_index = 0;
++    layer->planes[0].offset = 0;
++    layer->planes[0].pitch = bpl;
++#if CONFIG_SAND
++    if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
++        layer->planes[1].object_index = 0;
++        layer->planes[1].offset = height * 128;
++        layer->planes[0].pitch = width; // overrides the bpl pitch set just above
++        layer->planes[1].pitch = width;
++    }
++    else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
++        layer->planes[1].object_index = 0;
++        layer->planes[1].offset = height * 128;
++        layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
++        layer->planes[1].pitch = width * 2;
++    }
++    else
++#endif
++    { // non-SAND formats: second plane directly follows the first and shares its pitch
++        layer->planes[1].object_index = 0;
++        layer->planes[1].offset = layer->planes[0].pitch * height;
++        layer->planes[1].pitch = layer->planes[0].pitch;
++    }
++
++    return 0;
++}
++
++static int
++set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
++    struct req_controls *const controls,
++#if HEVC_CTRLS_VERSION >= 2
++    struct v4l2_ctrl_hevc_decode_params * const dec,
++#endif
++    struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
++    void * const offsets, const size_t offset_count)
++{ // Build the ext-control list for one request and set it on mreq; returns the setter's result
++    int rv;
++#if HEVC_CTRLS_VERSION >= 2
++    unsigned int n = 3; // number of always-present entries initialised in control[] below
++#else
++    unsigned int n = 2;
++#endif
++
++    struct v4l2_ext_control control[6] = { // at most: SPS, PPS, decode params, slice params, scaling matrix, entry point offsets
++        {
++            .id = V4L2_CID_STATELESS_HEVC_SPS,
++            .ptr = &controls->sps,
++            .size = sizeof(controls->sps),
++        },
++        {
++            .id = V4L2_CID_STATELESS_HEVC_PPS,
++            .ptr = &controls->pps,
++            .size = sizeof(controls->pps),
++        },
++#if HEVC_CTRLS_VERSION >= 2
++        {
++            .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
++            .ptr = dec,
++            .size = sizeof(*dec),
++        },
++#endif
++    };
++
++    if (slices) // optional: slice params array of slice_count elements
++        control[n++] = (struct v4l2_ext_control) {
++            .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
++            .ptr = slices,
++            .size = sizeof(*slices) * slice_count,
++        };
++
++    if (controls->has_scaling) // optional: only when the caller filled in a scaling matrix
++        control[n++] = (struct v4l2_ext_control) {
++            .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
++            .ptr = &controls->scaling_matrix,
++            .size = sizeof(controls->scaling_matrix),
++        };
++
++#if HEVC_CTRLS_VERSION >= 4
++    if (offsets) // optional: element size is taken from V4L2MediaReqDescriptor.offsets[]
++        control[n++] = (struct v4l2_ext_control) {
++            .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
++            .ptr = offsets,
++            .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count,
++        };
++#endif
++
++    rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n); // only the first n entries are passed
++
++    return rv;
++}
++
++// This only works because we started out from a single coded frame buffer
++// that will remain intact until after end_frame
++static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{ // Record one slice (data pointer, length, params, offsets) in rd; nothing is sent to the device here
++    const HEVCContext * const h = avctx->priv_data;
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++    int bcount = get_bits_count(&h->HEVClc->gb);
++    uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount; // presumably bcount corrected for bytes ptr_from_index skips - confirm
++
++    const unsigned int n = rd->num_slices; // index this slice will occupy
++    const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices; // first slice of the max_slices-sized group containing n
++
++    int rv;
++    struct slice_info * si;
++
++    // This looks dodgy but we know that FFmpeg has parsed this from a buffer
++    // that contains the entire frame including the start code
++    if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
++        buffer -= 3; // step back over the 3-byte 00 00 01 start code
++        size += 3;
++        boff += 24;
++        if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) { // logged only; decoding carries on regardless
++            av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
++                   buffer[0], buffer[1], buffer[2]);
++        }
++    }
++
++    if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
++        if (rd->slices == NULL) { // first slice of the frame: create the single slice_info
++            if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
++                return AVERROR(ENOMEM);
++            rd->slices->ptr = buffer;
++            rd->num_slices = 1;
++        }
++        rd->slices->len = buffer - rd->slices->ptr + size; // extend to cover everything up to the end of this slice
++        return 0;
++    }
++
++    if ((rv = slice_add(rd)) != 0)
++        return rv;
++
++    si = rd->slices + n;
++    si->ptr = buffer;
++    si->len = size;
++    si->n_offsets = rd->num_offsets; // index of this slice's first entry point offset
++
++    if (n != block_start) { // not first in its group: make offsets relative to the group's first slice
++        struct slice_info *const si0 = rd->slices + block_start;
++        const size_t offset = (buffer - si0->ptr);
++        boff += offset * 8;
++        size += offset;
++        si0->len = si->len + offset; // the group's first slice grows to span this slice too
++    }
++
++#if HEVC_CTRLS_VERSION >= 2
++    if (n == 0) // decode params are filled once per frame, on the first slice
++        fill_decode_params(h, &rd->dec);
++    fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
++#else
++    fill_slice_params(h, rd->slice_params + n, size * 8, boff);
++#endif
++    if (ctx->max_offsets != 0 && // offsets are recorded only when max_offsets is non-zero
++        (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
++        return rv;
++
++    return 0;
++}
++
++static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
++{ // Abandon the current frame: abort its request and source entry, then take it off the decode queue
++    const HEVCContext * const h = avctx->priv_data;
++    if (h->ref != NULL) { // nothing to do when there is no current frame
++        V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
++        V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++        media_request_abort(&rd->req);
++        mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
++
++        decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++    }
++}
++
++static int send_slice(AVCodecContext * const avctx,
++                      V4L2MediaReqDescriptor * const rd,
++                      struct req_controls *const controls,
++                      const unsigned int i, const unsigned int j)
++{ // Submit slices [i, j) of rd as one media request; returns 0 or a negative AVERROR
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++    const int is_last = (j == rd->num_slices); // j is one past the last slice sent
++    struct slice_info *const si = rd->slices + i;
++    struct media_request * req = NULL;
++    struct qent_src * src = NULL;
++    MediaBufsStatus stat;
++    void * offsets = rd->offsets + rd->slices[i].n_offsets; // first entry point offset recorded for slice i
++    size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets; // offsets belonging to slices [i, j)
++
++    if ((req = media_request_get(ctx->mpool)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
++        return AVERROR(ENOMEM);
++    }
++
++    if (set_req_ctls(ctx, req,
++                     controls,
++#if HEVC_CTRLS_VERSION >= 2
++                     &rd->dec,
++#endif
++                     rd->slice_params + i, j - i,
++                     offsets, n_offsets)) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
++        goto fail1;
++    }
++
++    if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
++        goto fail1;
++    }
++
++    if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) { // si->len of slice i spans the whole group (set in decode_slice)
++        av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
++        goto fail2;
++    }
++
++    if (qent_src_params_set(src, &controls->tv)) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
++        goto fail2;
++    }
++
++    stat = mediabufs_start_request(ctx->mbufs, &req, &src,
++                                   i == 0 ? rd->qe_dst : NULL, // destination is passed only with the frame's first request
++                                   is_last);
++
++    if (stat != MEDIABUFS_STATUS_SUCCESS) {
++        av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
++        return AVERROR_UNKNOWN; // NOTE(review): req/src are not aborted here - presumably start_request consumed them; confirm
++    }
++    return 0;
++
++fail2: // a source entry was obtained: abort it, then fall through to abort the request
++    mediabufs_src_qent_abort(ctx->mbufs, &src);
++fail1:
++    media_request_abort(&req);
++    return AVERROR_UNKNOWN;
++}
++
++static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
++{ // Build the per-frame controls, wait for this frame's turn, submit its slices and fill the DRM descriptor
++    const HEVCContext * const h = avctx->priv_data;
++    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
++    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++    struct req_controls rc;
++    unsigned int i;
++    int rv;
++
++    // It is possible, though maybe a bug, to get an end_frame without
++    // a previous start_frame.  If we do then give up.
++    if (!decode_q_in_q(&rd->decode_ent)) {
++        av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
++        return AVERROR_INVALIDDATA;
++    }
++
++    {
++        const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ? // PPS lists take precedence over SPS lists
++                                    &h->ps.pps->scaling_list :
++                                h->ps.sps->scaling_list_enable_flag ?
++                                    &h->ps.sps->scaling_list : NULL;
++
++
++        memset(&rc, 0, sizeof(rc));
++        rc.tv = cvt_dpb_to_tv(rd->timestamp);
++        fill_sps(&rc.sps, h->ps.sps);
++        fill_pps(&rc.pps, h->ps.pps);
++        if (sl) {
++            rc.has_scaling = 1;
++            fill_scaling_matrix(sl, &rc.scaling_matrix);
++        }
++    }
++
++    decode_q_wait(&ctx->decode_q, &rd->decode_ent); // every path below must call decode_q_remove
++
++    // qe_dst needs to be bound to the data buffer and only returned when that is
++    // Alloc almost certainly wants to be serialised if there is any chance of blocking
++    // so we get the next frame to be free in the thread that needs it for decode first.
++    //
++    // In our current world this probably isn't a concern but put it here anyway
++    if (!rd->qe_dst)
++    {
++        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
++            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
++            rv = AVERROR(ENOMEM);
++            goto fail;
++        }
++    }
++
++    // Send as slices
++    for (i = 0; i < rd->num_slices; i += ctx->max_slices) { // one request per group of up to max_slices slices
++        const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
++        if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
++            goto fail;
++    }
++
++    // Set the drm_prime descriptor
++    drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs)); // NOTE(review): a -1 return (unhandled pixelformat) is ignored here
++    rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
++    rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
++
++    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++    return 0;
++
++fail:
++    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
++    return rv;
++}
++
++static inline int
++ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
++{
++    return !(v < c->minimum || v > c->maximum); // 1 iff minimum <= v <= maximum, else 0
++}
++
++// Initial check & init
++static int
++probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{ // Returns 0 if the driver's HEVC controls match this build's layout and accept the stream's SPS, else AVERROR(EINVAL)
++    const HEVCContext *h = avctx->priv_data;
++    const HEVCSPS * const sps = h->ps.sps;
++    struct v4l2_ctrl_hevc_sps ctrl_sps;
++    unsigned int i;
++
++    // Check for var slice array
++    struct v4l2_query_ext_ctrl qc[] = {
++        { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
++        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++        { .id = V4L2_CID_STATELESS_HEVC_SPS },
++        { .id = V4L2_CID_STATELESS_HEVC_PPS },
++        { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
++#if HEVC_CTRLS_VERSION >= 2
++        { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
++#endif
++    };
++    // Order & size must match!
++    static const size_t ctrl_sizes[] = { // expected elem_size of qc[] entries, index for index
++        sizeof(struct v4l2_ctrl_hevc_slice_params),
++        sizeof(int32_t),
++        sizeof(struct v4l2_ctrl_hevc_sps),
++        sizeof(struct v4l2_ctrl_hevc_pps),
++        sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
++#if HEVC_CTRLS_VERSION >= 2
++        sizeof(struct v4l2_ctrl_hevc_decode_params),
++#endif
++    };
++    const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
++
++#if HEVC_CTRLS_VERSION == 2
++    if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0)) // V2 build: reject drivers reporting 5.18.0 or later
++        return AVERROR(EINVAL);
++#elif HEVC_CTRLS_VERSION == 3
++    if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0)) // V3 build: reject drivers older than 5.18.0
++        return AVERROR(EINVAL);
++#endif
++
++    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls); // NOTE(review): return value unused; type == 0 is treated as "missing" below
++    i = 0;
++#if HEVC_CTRLS_VERSION >= 4
++    // Skip slice check if no slice mode
++    if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++        i = 1; // start the checks after qc[0] (slice params)
++#else
++    // Fail frame mode silently for anything prior to V4
++    if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++        return AVERROR(EINVAL);
++#endif
++    for (; i != noof_ctrls; ++i) { // each remaining control must exist and have the element size this build expects
++        if (qc[i].type == 0) {
++            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
++            return AVERROR(EINVAL);
++        }
++        if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
++            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
++                   HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
++            return AVERROR(EINVAL);
++        }
++    }
++
++    fill_sps(&ctrl_sps, sps);
++
++    if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) { // set outside any request (NULL)
++        av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
++        return AVERROR(EINVAL);
++    }
++
++    return 0;
++}
++
++// Final init
++static int
++set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
++{ // Choose max_slices, max_offsets, decode mode and start code from the driver's answers, then set the last two
++    int ret;
++
++    struct v4l2_query_ext_ctrl querys[] = { // indices 0..3 are relied on by the code below
++        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++        { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++        { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
++#if HEVC_CTRLS_VERSION >= 4
++        { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
++#endif
++    };
++
++    struct v4l2_ext_control ctrls[] = {
++        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
++        { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
++    };
++
++    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
++
++    ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) || // 1 unless slice params is a 1-D dynamic array with a non-zero size
++                       querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
++        1 : querys[2].dims[0];
++    av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
++
++#if HEVC_CTRLS_VERSION >= 4
++    ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ? // 0 disables entry point offset collection in decode_slice
++        0 : querys[3].dims[0];
++    av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
++#else
++    ctx->max_offsets = 0;
++#endif
++
++    if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED || // prefer the driver default if it is a known mode,
++        querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)   // else frame-based, else slice-based
++        ctx->decode_mode = querys[0].default_value;
++    else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED))
++        ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
++    else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
++        ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
++    else {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
++        return AVERROR(EINVAL);
++    }
++
++    if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE || // same scheme: driver default, else Annex B, else none
++        querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
++        ctx->start_code = querys[1].default_value;
++    else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
++        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
++    else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++    else {
++        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
++        return AVERROR(EINVAL);
++    }
++
++    // If we are in slice mode & START_CODE_NONE supported then pick that
++    // as it doesn't require the slightly dodgy look backwards in our raw buffer
++    if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
++        ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
++        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
++
++    ctrls[0].value = ctx->decode_mode;
++    ctrls[1].value = ctx->start_code;
++
++    ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls)); // set outside any request (NULL)
++    return !ret ? 0 : AVERROR(-ret);
++}
++
++static void v4l2_req_frame_free(void *opaque, uint8_t *data)
++{ // AVBuffer free callback (installed by v4l2_req_frame_alloc): releases a frame's V4L2MediaReqDescriptor
++    AVCodecContext *avctx = opaque;
++    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
++
++    av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
++
++    qent_dst_unref(&rd->qe_dst);
++
++    // We don't expect req or qe_src to be set
++    if (rd->req || rd->qe_src) // arguments follow the order named in the message: qe_src first, then req
++        av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__, rd->qe_src, rd->req);
++
++    av_freep(&rd->slices);
++    av_freep(&rd->slice_params);
++    av_freep(&rd->offsets);
++
++    av_free(rd); // finally release the descriptor allocated in v4l2_req_frame_alloc
++}
++
++static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
++{ // Allocate a zeroed block of size bytes wrapped in an AVBufferRef freed by v4l2_req_frame_free; NULL on failure
++    AVCodecContext *avctx = opaque;
++//    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++//    V4L2MediaReqDescriptor *req;
++    AVBufferRef *ref;
++    uint8_t *data;
++//    int ret;
++
++    data = av_mallocz(size);
++    if (!data)
++        return NULL;
++
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
++    ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0); // avctx becomes the free callback's opaque
++    if (!ref) {
++        av_freep(&data); // no ref was created, so the raw block must be freed here
++        return NULL;
++    }
++    return ref;
++}
++
++#if 0 // compiled out; only referenced from the matching #if 0 section in frame_params
++static void v4l2_req_pool_free(void *opaque)
++{ // logs only
++    av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
++}
++
++static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
++{ // logs, then uninitialises the frames context's buffer pool
++    av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
++
++    av_buffer_pool_uninit(&hwfc->pool);
++}
++#endif
++
++static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{ // Fill the hw frames context (format, sw_format, width, height) from the current destination format; returns 0
++    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++    AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
++    const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
++
++    hwfc->format = AV_PIX_FMT_DRM_PRIME;
++    hwfc->sw_format = pixel_format_from_format(vfmt);
++    if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) { // dimensions live in a different union member per buffer type
++        hwfc->width = vfmt->fmt.pix_mp.width;
++        hwfc->height = vfmt->fmt.pix_mp.height;
++    } else {
++        hwfc->width = vfmt->fmt.pix.width;
++        hwfc->height = vfmt->fmt.pix.height;
++    }
++#if 0 // compiled out: pool-based allocation path
++    hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
++    if (!hwfc->pool)
++        return AVERROR(ENOMEM);
++
++    hwfc->free = v4l2_req_hwframe_ctx_free;
++
++    hwfc->initial_pool_size = 1;
++
++    switch (avctx->codec_id) {
++    case AV_CODEC_ID_VP9:
++        hwfc->initial_pool_size += 8;
++        break;
++    case AV_CODEC_ID_VP8:
++        hwfc->initial_pool_size += 3;
++        break;
++    default:
++        hwfc->initial_pool_size += 2;
++    }
++#endif
++    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
++
++    return 0;
++}
++
++static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{ // Give frame a fresh request descriptor as buf[0]/data[0], a hw_frames_ctx ref and decode data; 0 or negative AVERROR
++    int rv;
++
++    frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor)); // zeroed descriptor, freed by v4l2_req_frame_free
++    if (!frame->buf[0])
++        return AVERROR(ENOMEM);
++
++    frame->data[0] = frame->buf[0]->data; // the hwaccel callbacks read the descriptor back from data[0]
++
++    frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx); // NOTE(review): result is not checked for NULL
++
++    if ((rv = ff_attach_decode_data(frame)) != 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
++        av_frame_unref(frame); // releases the references taken above
++        return rv;
++    }
++
++    return 0;
++}
++
++const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = { // presumably V() makes the symbol name version-specific - confirm against its definition
++    .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
++    .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION), // name carries the control-set version this file was built for
++    .probe = probe,
++    .set_controls = set_controls,
++
++    .start_frame    = v4l2_request_hevc_start_frame,
++    .decode_slice   = v4l2_request_hevc_decode_slice,
++    .end_frame      = v4l2_request_hevc_end_frame,
++    .abort_frame    = v4l2_request_hevc_abort_frame,
++    .frame_params   = frame_params,
++    .alloc_frame    = alloc_frame,
++};
++
+diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
+new file mode 100644
+index 0000000000..1a9944774a
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.c
+@@ -0,0 +1,1802 @@
++/*
++ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#include <errno.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <string.h>
++#include <unistd.h>
++#include <linux/media.h>
++#include <linux/mman.h>
++#include <sys/ioctl.h>
++#include <sys/select.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++
++#include <linux/videodev2.h>
++
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++#include "weak_link.h"
++
++
++/*
++ * floor(log2(x)), found by binary search for the highest set bit: at each
++ * step, if anything remains above the low `shift` bits, those bits are
++ * counted and discarded.
++ *
++ * Returns 0 for x == 0 as well as for x == 1.
++ * The result never exceeds 31, so values of 2^32 and above are under-reported.
++ */
++static unsigned int log2_size(size_t x)
++{
++    unsigned int bits = 0;
++    unsigned int shift;
++
++    for (shift = 16; shift != 0; shift >>= 1) {
++        const size_t upper = x >> shift;
++
++        if (upper != 0) {
++            bits += shift;
++            x = upper;
++        }
++    }
++    return bits;
++}
++
++static size_t round_up_size(const size_t x)
++{ /* Returns whichever of 3 << n and 4 << n is the smallest value strictly greater than x */
++    /* Admit no size < 256: every such x uses n = 8 and so yields 3 << 8 */
++    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
++
++    return x >= ((size_t)3 << n) ? (size_t)4 << n : ((size_t)3 << n); /* shift in size_t: as int, 4 << 29 already overflows */
++}
++
++struct media_request;
++
++struct media_pool { // media_request entries refer back to their pool through ->mp
++    int fd; // presumably the media device fd - confirm against the pool constructor
++    sem_t sem;
++    pthread_mutex_t lock;
++    struct media_request * free_reqs; // presumably head of a free list chained via media_request.next - confirm
++    struct pollqueue * pq;
++};
++
++struct media_request {
++    struct media_request * next; // presumably the link used by media_pool.free_reqs - confirm
++    struct media_pool * mp; // pool this request refers back to
++    int fd; // presumably the request's own fd - confirm where it is assigned
++    struct polltask * pt;
++};
++
++static inline enum v4l2_memory mediabufs_memory_to_v4l2(const enum mediabufs_memory m)
++{
++    const enum v4l2_memory v4l2_mem = (enum v4l2_memory)m; // plain cast: relies on both enums sharing numeric values - TODO confirm
++    return v4l2_mem;
++}
++
++const char *
++mediabufs_memory_name(const enum mediabufs_memory m)
++{
++    /*
++     * Constant display string for memory type m; any value without an
++     * entry below is reported as "Unknown".
++     */
++    if (m == MEDIABUFS_MEMORY_UNSET)
++        return "Unset";
++    if (m == MEDIABUFS_MEMORY_MMAP)
++        return "MMap";
++    if (m == MEDIABUFS_MEMORY_USERPTR)
++        return "UserPtr";
++    if (m == MEDIABUFS_MEMORY_OVERLAY)
++        return "Overlay";
++    if (m == MEDIABUFS_MEMORY_DMABUF)
++        return "DMABuf";
++    return "Unknown";
++}
++
++
++static inline int do_trywait(sem_t *const sem)
++{
++    do {
++        if (sem_trywait(sem) == 0)
++            return 0;
++    } while (errno == EINTR);   /* only an interrupted call is retried */
++    return -errno;              /* any other failure is reported as -errno */
++}
++
++static inline int do_wait(sem_t *const sem)
++{
++    do {
++        if (sem_wait(sem) == 0)
++            return 0;
++    } while (errno == EINTR);   /* restart the wait if a signal interrupted it */
++    return -errno;              /* any other failure is reported as -errno */
++}
++
++static int request_buffers(int video_fd, unsigned int type,
++                           enum mediabufs_memory memory, unsigned int buffers_count)
++{
++    /* Members not named in the initializer are zero */
++    struct v4l2_requestbuffers req = {
++        .count = buffers_count,
++        .type = type,
++        .memory = mediabufs_memory_to_v4l2(memory)
++    };
++
++    /* VIDIOC_REQBUFS on video_fd: 0 on success, -errno (after logging) on failure */
++    if (ioctl(video_fd, VIDIOC_REQBUFS, &req) < 0) {
++        const int err = errno;
++
++        request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(err));
++        return -err;
++    }
++
++    return 0;
++}
++
++
++static int set_stream(int video_fd, unsigned int type, bool enable)
++{
++    enum v4l2_buf_type buf_type = type;
++    int rc;
++    /* Returns 0 on success, -errno (after logging) if the ioctl fails */
++    if (enable)
++        rc = ioctl(video_fd, VIDIOC_STREAMON, &buf_type);
++    else
++        rc = ioctl(video_fd, VIDIOC_STREAMOFF, &buf_type);
++    if (rc >= 0)
++        return 0;
++
++    rc = -errno;
++    request_log("Unable to %sable stream: %s\n", enable ? "en" : "dis", strerror(-rc));
++    return rc;
++}
++
++
++
++struct media_request * media_request_get(struct media_pool * const mp)
++{
++    struct media_request *head;
++    /* Blocks on the pool semaphore; timeout handled by poll code */
++    if (do_wait(&mp->sem) != 0)
++        return NULL;
++
++    /* Pop the first entry (if there is one) off the free chain */
++    pthread_mutex_lock(&mp->lock);
++    head = mp->free_reqs;
++    if (head != NULL) {
++        mp->free_reqs = head->next;
++        head->next = NULL;
++    }
++    pthread_mutex_unlock(&mp->lock);
++    return head;
++}
++
++int media_request_fd(const struct media_request * const req)
++{
++    return req->fd;     /* the request's fd; req is dereferenced unconditionally */
++}
++
++int media_request_start(struct media_request * const req)
++{
++    for (;;) {
++        int err;
++        if (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) != -1)
++            break;                              /* request queued */
++        if ((err = errno) != EINTR) {           /* interrupted calls are simply reissued */
++            request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
++            return -err;
++        }
++    }
++    pollqueue_add_task(req->pt, 2000);          /* arm the request's polltask (callback: media_request_done) */
++    return 0;
++}
++
++static void media_request_done(void *v, short revents)     /* polltask callback: v is the media_request; revents is not examined */
++{
++    struct media_request *const req = v;
++    struct media_pool *const mp = req->mp;
++
++    /* ** Not sure what to do about timeout */
++
++    if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)     /* a failure is only logged; the request is recycled regardless */
++        request_log("Unable to reinit media request: %s\n",
++                strerror(errno));
++
++    pthread_mutex_lock(&mp->lock);
++    req->next = mp->free_reqs;      /* push onto the head of the pool's free chain */
++    mp->free_reqs = req;
++    pthread_mutex_unlock(&mp->lock);
++    sem_post(&mp->sem);             /* one more request available to media_request_get() */
++}
++
++int media_request_abort(struct media_request ** const preq)
++{
++    /* Take the caller's pointer; a NULL request is a no-op. Always returns 0 */
++    struct media_request * const victim = *preq;
++    if (victim != NULL) {
++        *preq = NULL;
++        /* Same path as normal completion: reinit and return to the pool */
++        media_request_done(victim, 0);
++    }
++    return 0;
++}
++
++static void delete_req_chain(struct media_request * const chain)
++{
++    struct media_request *cur, *following;
++    /* ->next is saved before the node it lives in is freed */
++    for (cur = chain; cur != NULL; cur = following) {
++        following = cur->next;
++        if (cur->pt != NULL)
++            polltask_delete(&cur->pt);
++        if (cur->fd != -1)
++            close(cur->fd);
++        free(cur);
++    }
++}
++
++struct media_pool * media_pool_new(const char * const media_path,
++                   struct pollqueue * const pq,
++                   const unsigned int n)
++{
++    struct media_pool * const mp = calloc(1, sizeof(*mp));
++    unsigned int i;
++    /* Builds a pool of n media requests on media_path; returns NULL on any failure */
++    if (!mp)
++        goto fail0;
++
++    mp->pq = pq;
++    pthread_mutex_init(&mp->lock, NULL);
++    mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
++    if (mp->fd == -1) {
++        request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
++        goto fail1;     /* nothing to close yet, but the mutex must still be destroyed */
++    }
++
++    for (i = 0; i != n; ++i) {
++        struct media_request * req = malloc(sizeof(*req));
++        if (!req)
++            goto fail4;
++
++        *req = (struct media_request){
++            .next = mp->free_reqs,
++            .mp = mp,
++            .fd = -1
++        };
++        mp->free_reqs = req;    /* chained before the fd/polltask exist so fail4 also frees a partly built request */
++
++        if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
++            request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
++            goto fail4;
++        }
++
++        req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
++        if (!req->pt)
++            goto fail4;
++    }
++    /* One semaphore count per free request; sem_init can fail (e.g. n too large) */
++    if (sem_init(&mp->sem, 0, n) != 0)
++        goto fail4;
++    return mp;
++
++fail4:
++    delete_req_chain(mp->free_reqs);
++    close(mp->fd);
++fail1:
++    pthread_mutex_destroy(&mp->lock);
++    free(mp);
++fail0:
++    return NULL;
++}
++
++void media_pool_delete(struct media_pool ** pMp)
++{
++    struct media_pool * const pool = *pMp;
++
++    /* A NULL pool is ignored; otherwise the caller's pointer is cleared first */
++    if (pool != NULL) {
++        *pMp = NULL;
++        delete_req_chain(pool->free_reqs);  /* only requests on the free chain are released here */
++        close(pool->fd);
++        sem_destroy(&pool->sem);
++        pthread_mutex_destroy(&pool->lock);
++        free(pool);
++    }
++}
++
++
++#define INDEX_UNSET (~(uint32_t)0)      /* qent_base.index value before a V4L2 buffer index has been assigned */
++
++enum qent_status {
++    QENT_NEW = 0,       // Initial state (from QENT_BASE_INITIALIZER) - shouldn't last
++    QENT_FREE,          // On free chain (set by queue_put_free)
++    QENT_PENDING,       // User has ent
++    QENT_WAITING,       // On inuse chain (set by queue_put_inuse)
++    QENT_DONE,          // Frame rx: dequeued without V4L2_BUF_FLAG_ERROR
++    QENT_ERROR,         // Error: dequeued with V4L2_BUF_FLAG_ERROR, or request start failed
++    QENT_IMPORT         // Pool-less dst ent; the only state in which qent_dst_import_fd() accepts buffers
++};
++
++struct qent_base {
++    atomic_int ref_count;                   /* Extra references: 0 means a single holder (see qent_dst_unref) */
++    struct qent_base *next;                 /* Forward link while on a qe_list_head */
++    struct qent_base *prev;                 /* Backward link while on a qe_list_head */
++    enum qent_status status;
++    enum mediabufs_memory memtype;          /* Memory type passed to V4L2 when this entry is queued */
++    uint32_t index;                         /* V4L2 buffer index, INDEX_UNSET until assigned */
++    struct dmabuf_h *dh[VIDEO_MAX_PLANES];  /* One buffer handle per plane; unused slots are NULL */
++    struct timeval timestamp;               /* Copied into v4l2_buffer on queue and back out on dequeue */
++};
++
++struct qent_src {
++    struct qent_base base;  /* Must stay the first member: base_to_src() casts a qent_base * back to this type */
++    int fixed_size;         /* Non-zero: qent_src_data_copy() never reallocates the buffer */
++};
++
++struct qent_dst {
++    struct qent_base base;                  /* Must stay the first member: base_to_dst() casts a qent_base * back to this type */
++    bool waiting;                           /* Set by qe_dst_waiting(), cleared by qe_dst_done(); accessed under lock */
++    pthread_mutex_t lock;                   /* Protects waiting; paired with cond */
++    pthread_cond_t cond;                    /* Broadcast by qe_dst_done() to wake qent_dst_wait() */
++    struct ff_weak_link_client * mbc_wl;    /* Weak link that qent_dst_unref() locks to reach the owning mediabufs_ctl */
++};
++
++struct qe_list_head {
++    struct qent_base *head;     /* First entry, NULL when the list is empty */
++    struct qent_base *tail;     /* Last entry, NULL when the list is empty */
++};
++
++struct buf_pool {
++    enum mediabufs_memory memtype;  /* Memory type used for DQBUF / CREATE_BUFS on this pool */
++    pthread_mutex_t lock;           /* Taken by the queue_*() helpers around list changes */
++    sem_t free_sem;                 /* Posted once per entry placed on free by queue_put_free() */
++    struct qe_list_head free;       /* Entries available for reuse */
++    struct qe_list_head inuse;      /* Entries queued to V4L2 and not yet dequeued */
++};
++
++
++static inline struct qent_dst *base_to_dst(struct qent_base *be)
++{
++    return (struct qent_dst *)be;   /* relies on base being the first member of struct qent_dst; NULL stays NULL */
++}
++
++static inline struct qent_src *base_to_src(struct qent_base *be)
++{
++    return (struct qent_src *)be;   /* relies on base being the first member of struct qent_src; NULL stays NULL */
++}
++
++
++#define QENT_BASE_INITIALIZER(mtype) { /* initializer for a struct qent_base; members not named are zeroed */ \
++    .ref_count = ATOMIC_VAR_INIT(0),\
++    .status = QENT_NEW,\
++    .memtype = (mtype),\
++    .index  = INDEX_UNSET\
++}
++
++static void qe_base_uninit(struct qent_base *const be)
++{
++    struct dmabuf_h **slot;     /* visits all VIDEO_MAX_PLANES slots, empty ones included */
++    for (slot = be->dh; slot != be->dh + VIDEO_MAX_PLANES; ++slot) {
++        dmabuf_free(*slot);
++        *slot = NULL;
++    }
++}
++
++static void qe_src_free(struct qent_src *const be_src)
++{
++    if (be_src != NULL) {               /* NULL is accepted and ignored */
++        qe_base_uninit(&be_src->base);  /* release the plane buffers first */
++        free(be_src);
++    }
++}
++
++static struct qent_src * qe_src_new(enum mediabufs_memory mtype)
++{
++    const struct qent_src blank = {
++        .base = QENT_BASE_INITIALIZER(mtype)
++    };
++    struct qent_src *const qe = malloc(sizeof(*qe));
++    if (qe != NULL)     /* on allocation failure NULL is returned as-is */
++        *qe = blank;
++    return qe;
++}
++
++static void qe_dst_free(struct qent_dst *const be_dst)
++{
++    /* NULL is accepted and ignored */
++    if (be_dst != NULL) {
++        ff_weak_link_unref(&be_dst->mbc_wl);
++        pthread_cond_destroy(&be_dst->cond);
++        pthread_mutex_destroy(&be_dst->lock);
++        qe_base_uninit(&be_dst->base);
++        free(be_dst);
++    }
++}
++
++static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype)
++{
++    struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
++    if (!be_dst)
++        return NULL;                            /* allocation failed; no weak-link reference has been taken */
++    *be_dst = (struct qent_dst){
++        .base = QENT_BASE_INITIALIZER(memtype),
++        .lock = PTHREAD_MUTEX_INITIALIZER,
++        .cond = PTHREAD_COND_INITIALIZER,
++        .mbc_wl = ff_weak_link_ref(wl)          /* wl is NULL when called from mediabufs_dst_qent_alloc() without an mbc */
++    };
++    return be_dst;                              /* waiting starts out false (member not named above) */
++}
++
++static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
++{
++    struct qent_base *const old_tail = ql->tail;
++
++    /* Empty list: be becomes the head too; otherwise it follows the old tail */
++    *(old_tail ? &old_tail->next : &ql->head) = be;
++    be->prev = old_tail;
++    be->next = NULL;
++    ql->tail = be;
++}
++
++static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
++{
++    struct qent_base *after, *before;
++
++    if (be == NULL)
++        return NULL;
++    after = be->next;
++    before = be->prev;
++
++    /* Route the neighbours (or the list's tail / head) around be */
++    *(after ? &after->prev : &ql->tail) = before;
++    *(before ? &before->next : &ql->head) = after;
++
++    be->next = NULL;
++    be->prev = NULL;
++    return be;
++}
++
++
++static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
++{
++    ql_add_tail(&bp->free, be);     /* append to the free list; takes no lock itself */
++}
++
++static struct qent_base * bq_get_free(struct buf_pool *const bp)
++{
++    return ql_extract(&bp->free, bp->free.head);    /* pop the head of the free list; NULL if it is empty */
++}
++
++static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
++{
++    return ql_extract(&bp->inuse, be);  /* unlink be from the in-use list; NULL in gives NULL out */
++}
++
++static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
++{
++    return ql_extract(&bp->inuse, bp->inuse.head);  /* pop the head of the in-use list; NULL if it is empty */
++}
++
++static void bq_free_all_free_src(struct buf_pool *const bp)
++{
++    struct qent_base *qe;   /* drains the free list; bp->lock is not taken here */
++    for (qe = bq_get_free(bp); qe != NULL; qe = bq_get_free(bp))
++        qe_src_free(base_to_src(qe));
++}
++
++static void bq_free_all_inuse_src(struct buf_pool *const bp)
++{
++    struct qent_base *qe;   /* drains the in-use list; bp->lock is not taken here */
++    for (qe = bq_get_inuse(bp); qe != NULL; qe = bq_get_inuse(bp))
++        qe_src_free(base_to_src(qe));
++}
++
++static void bq_free_all_free_dst(struct buf_pool *const bp)
++{
++    struct qent_base *qe;   /* drains the free list; bp->lock is not taken here */
++    for (qe = bq_get_free(bp); qe != NULL; qe = bq_get_free(bp))
++        qe_dst_free(base_to_dst(qe));
++}
++
++static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)    /* reset be and hand it back to bp's free list */
++{
++    unsigned int i;
++
++    pthread_mutex_lock(&bp->lock);
++    /* Clear out state vars */
++    be->timestamp.tv_sec = 0;
++    be->timestamp.tv_usec = 0;
++    be->status = QENT_FREE;
++    for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i)    /* stops at the first empty plane slot */
++        dmabuf_len_set(be->dh[i], 0);
++    bq_put_free(bp, be);
++    pthread_mutex_unlock(&bp->lock);
++    sem_post(&bp->free_sem);        /* posted after unlocking; pairs with do_wait/do_trywait in queue_[try]get_free() */
++}
++
++static bool queue_is_inuse(const struct buf_pool *const bp)
++{
++    return bp->inuse.tail != NULL;  /* true if anything is on the in-use list; read without taking bp->lock */
++}
++
++static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
++{
++    if (be != NULL) {                   /* NULL entries are silently ignored */
++        pthread_mutex_lock(&bp->lock);
++        ql_add_tail(&bp->inuse, be);
++        be->status = QENT_WAITING;      /* marked while still holding the pool lock */
++        pthread_mutex_unlock(&bp->lock);
++    }
++}
++
++static struct qent_base *queue_get_free(struct buf_pool *const bp)
++{
++    struct qent_base *qe = NULL;
++    /* Blocks until free_sem can be taken; NULL if the wait itself fails */
++    if (do_wait(&bp->free_sem) == 0) {
++        pthread_mutex_lock(&bp->lock);
++        qe = bq_get_free(bp);
++        pthread_mutex_unlock(&bp->lock);
++    }
++    return qe;
++}
++
++static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
++{
++    struct qent_base *qe = NULL;
++    /* Non-blocking variant: NULL straight away if free_sem cannot be taken */
++    if (do_trywait(&bp->free_sem) == 0) {
++        pthread_mutex_lock(&bp->lock);
++        qe = bq_get_free(bp);
++        pthread_mutex_unlock(&bp->lock);
++    }
++    return qe;
++}
++
++static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index)
++{
++    struct qent_base *hit;
++
++    pthread_mutex_lock(&bp->lock);
++    /* Usually the head matches, but search the whole in-use list anyway */
++    hit = bp->inuse.head;
++    while (hit != NULL && hit->index != index)
++        hit = hit->next;
++    if (hit != NULL)
++        bq_extract_inuse(bp, hit);
++    pthread_mutex_unlock(&bp->lock);
++
++    /* NULL when no in-use entry carries this index */
++    return hit;
++}
++
++static void queue_delete(struct buf_pool *const bp)    /* destroys the pool object only; entries on its lists are not freed here */
++{
++    sem_destroy(&bp->free_sem);
++    pthread_mutex_destroy(&bp->lock);
++    free(bp);
++}
++
++static struct buf_pool* queue_new(const int vfd)        /* vfd is not used by this function */
++{
++    struct buf_pool *bp = calloc(1, sizeof(*bp));       /* zeroed: both lists empty, memtype 0 */
++    if (!bp)
++        return NULL;
++    pthread_mutex_init(&bp->lock, NULL);
++    sem_init(&bp->free_sem, 0, 0);                      /* starts at 0: nothing is on the free list yet */
++    return bp;
++}
++
++
++struct mediabufs_ctl {
++    atomic_int ref_count;  /* 0 is single ref for easier atomics */
++    void * dc;                  // Passed as the context argument to request_err/request_info/request_warn
++    int vfd;                    // Video device fd used for the V4L2 ioctls
++    bool stream_on;
++    bool polling;               // True while pt has been added to the pollqueue and its callback has not yet run
++    bool dst_fixed;             // Dst Q is fixed size
++    pthread_mutex_t lock;       // Held around queue/dequeue and polling state changes
++    struct buf_pool * src;      // Source (bitstream) buffer pool
++    struct buf_pool * dst;      // Destination (frame) buffer pool
++    struct polltask * pt;       // Polltask whose callback is presumably mediabufs_poll_cb - set up outside this view
++    struct pollqueue * pq;
++    struct ff_weak_link_master * this_wlm;  // Master side of the weak link given to dst qents
++
++    enum mediabufs_memory src_memtype;
++    enum mediabufs_memory dst_memtype;
++    struct v4l2_format src_fmt; // Format used when queueing/dequeueing src buffers
++    struct v4l2_format dst_fmt; // Format used when queueing/dequeueing/creating dst buffers
++    struct v4l2_capability capability;
++};
++
++static int qe_v4l2_queue(struct qent_base *const be,
++               const int vfd, struct media_request *const mreq,
++               const struct v4l2_format *const fmt,
++               const bool is_dst, const bool hold_flag)
++{
++    struct v4l2_buffer buffer = {
++        .type = fmt->type,
++        .memory = mediabufs_memory_to_v4l2(be->memtype),
++        .index = be->index
++    };
++    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++    /* Describe be in a v4l2_buffer and VIDIOC_QBUF it on vfd; returns 0 or -errno */
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        unsigned int i;
++        for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {   /* one v4l2_plane per populated dh[] slot */
++            if (is_dst)
++                dmabuf_len_set(be->dh[i], 0);                   /* dst buffers are queued with no payload */
++
++            /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
++            planes[i].length = dmabuf_size(be->dh[i]);
++            planes[i].bytesused = dmabuf_len(be->dh[i]);
++            if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
++                planes[i].m.fd = dmabuf_fd(be->dh[i]);
++            else
++                planes[i].m.mem_offset = 0;
++        }
++        buffer.m.planes = planes;
++        buffer.length = i;          /* for multiplanar buffers length carries the plane count */
++    }
++    else {
++        if (is_dst)
++            dmabuf_len_set(be->dh[0], 0);
++
++        buffer.bytesused = dmabuf_len(be->dh[0]);
++        buffer.length = dmabuf_size(be->dh[0]);
++        if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
++            buffer.m.fd = dmabuf_fd(be->dh[0]);
++        else
++            buffer.m.offset = 0;
++    }
++
++    if (!is_dst && mreq) {          /* only src buffers are attached to the media request */
++        buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
++        buffer.request_fd = media_request_fd(mreq);
++        if (hold_flag)
++            buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
++    }
++
++    if (is_dst)
++        be->timestamp = (struct timeval){0,0};
++
++    buffer.timestamp = be->timestamp;
++
++    while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {      /* retried only while the failure is EINTR */
++        const int err = errno;
++        if (err != EINTR) {
++            request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
++            return -err;
++        }
++    }
++    return 0;
++}
++
++static struct qent_base * qe_dequeue(struct buf_pool *const bp,
++                     const int vfd,
++                     const struct v4l2_format * const f)
++{
++    struct qent_base *be;
++    int rc;
++    const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
++    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
++    struct v4l2_buffer buffer = {
++        .type =  f->type,
++        .memory = mediabufs_memory_to_v4l2(bp->memtype)
++    };
++    if (mp) {
++        buffer.length = f->fmt.pix_mp.num_planes;
++        buffer.m.planes = planes;
++    }
++    /* VIDIOC_DQBUF one buffer of type f->type, retrying only on EINTR */
++    while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
++           errno == EINTR)
++        /* Loop */;
++    if (rc) {
++        request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
++        return NULL;
++    }
++    /* Match the dequeued index to an entry on the in-use list and unlink it */
++    be = queue_find_extract_index(bp, buffer.index);
++    if (!be) {
++        request_log("Failed to find index %d in Q\n", buffer.index);
++        return NULL;
++    }
++    /* Record the payload length: bytesused for capture buffers, 0 otherwise */
++    if (mp) {
++        unsigned int i;
++        for (i = 0; i != buffer.length; ++i)
++            dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0);
++    }
++    else    /* bytesused, not length: length is the buffer size, which qe_v4l2_queue() takes from dmabuf_size() */
++        dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.bytesused : 0);
++
++    be->timestamp = buffer.timestamp;
++    be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
++    return be;      /* NULL was returned above if the ioctl failed or the index was unknown */
++}
++
++static void qe_dst_done(struct qent_dst * dst_be)      /* mark dst_be finished, wake waiters, drop one reference */
++{
++    pthread_mutex_lock(&dst_be->lock);
++    dst_be->waiting = false;
++    pthread_cond_broadcast(&dst_be->cond);              /* releases any thread blocked in qent_dst_wait() */
++    pthread_mutex_unlock(&dst_be->lock);
++
++    qent_dst_unref(&dst_be);                            /* may recycle or free the entry; the local pointer is cleared */
++}
++
++static bool qe_dst_waiting(struct qent_dst *const dst_be)
++{
++    bool was_set;                       /* value of ->waiting before this call */
++    pthread_mutex_lock(&dst_be->lock);
++    was_set = dst_be->waiting;
++    dst_be->waiting = true;             /* test-and-set: the flag is raised unconditionally */
++    pthread_mutex_unlock(&dst_be->lock);
++    return was_set;
++}
++
++
++static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
++{
++    return queue_is_inuse(mbc->src) || queue_is_inuse(mbc->dst);    /* polling is wanted while either pool has a queued buffer */
++}
++
++static void mediabufs_poll_cb(void * v, short revents)     /* polltask callback: v is the mediabufs_ctl */
++{
++    struct mediabufs_ctl *mbc = v;
++    struct qent_src *src_be = NULL;
++    struct qent_dst *dst_be = NULL;
++
++    if (!revents)                                           /* no events set is reported as a timeout */
++        request_err(mbc->dc, "%s: Timeout\n", __func__);
++
++    pthread_mutex_lock(&mbc->lock);
++    mbc->polling = false;
++    /* POLLOUT: a src buffer can be dequeued; POLLIN: a dst buffer can be dequeued */
++    if ((revents & POLLOUT) != 0)
++        src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
++    if ((revents & POLLIN) != 0)
++        dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
++
++    /* Reschedule */
++    if (mediabufs_wants_poll(mbc)) {
++        mbc->polling = true;
++        pollqueue_add_task(mbc->pt, 2000);
++    }
++    pthread_mutex_unlock(&mbc->lock);
++    /* Completion handling is done after mbc->lock has been released */
++    if (src_be)
++        queue_put_free(mbc->src, &src_be->base);
++    if (dst_be)
++        qe_dst_done(dst_be);
++}
++
++int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
++{
++    struct qent_base *const be = &be_src->base;
++
++    be->timestamp = *timestamp;     /* stored here, copied into the v4l2_buffer when the entry is queued */
++    return 0;                       /* always succeeds */
++}
++
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
++{
++    return be_dst->base.timestamp;  /* by-value copy; qe_dequeue() fills it from the dequeued v4l2_buffer */
++}
++
++static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)  /* ensure dh[0] holds >= len bytes; 0 or -ENOMEM */
++{
++    if (!be->dh[0] || len > dmabuf_size(be->dh[0])) {
++        size_t newsize = round_up_size(len);
++        request_log("%s: Overrun %zu > %zu; trying %zu\n", __func__, len, be->dh[0] ? dmabuf_size(be->dh[0]) : (size_t)0, newsize);    /* dh[0] may be NULL on this branch, so do not query its size blindly */
++        if (!dbsc) {                /* callers pass NULL to forbid (re)allocation */
++            request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
++            return -ENOMEM;
++        }
++        if ((be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize)) == NULL) {      /* NOTE(review): old handle presumably released by dmabuf_realloc on failure - confirm */
++            request_log("%s: Realloc %zd failed\n", __func__, newsize);
++            return -ENOMEM;
++        }
++    }
++    return 0;                       /* buffer already large enough, or successfully grown */
++}
++
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++    struct qent_base *const be = &be_src->base;
++    return qent_base_realloc(be, len, dbsc);    /* 0 on success, -ENOMEM on failure; fixed_size is not consulted here */
++}
++
++
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
++{
++    void * dst;
++    struct qent_base *const be = &be_src->base;
++    int rv;
++    /* Copy len bytes from src to offset within dh[0]; 0 on success, negative on failure */
++    // Realloc doesn't copy so don't alloc if offset != 0
++    if ((rv = qent_base_realloc(be, offset + len,
++                                be_src->fixed_size || offset ? NULL : dbsc)) != 0)
++        return rv;
++
++    dmabuf_write_start(be->dh[0]);
++    dst = dmabuf_map(be->dh[0]);
++    if (dst) {
++        memcpy((char*)dst + offset, src, len);
++        dmabuf_len_set(be->dh[0], len);     /* recorded length is len, as before (not offset + len) */
++    }
++    dmabuf_write_end(be->dh[0]);            /* always balance dmabuf_write_start(), even when the map failed */
++    return dst ? 0 : -1;
++}
++
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
++{
++    /* Out-of-range plane numbers yield NULL instead of indexing past dh[] */
++    if (plane < sizeof(be_dst->base.dh) / sizeof(be_dst->base.dh[0]))
++        return be_dst->base.dh[plane];
++    return NULL;
++}
++
++int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
++{
++    return dup(dmabuf_fd(qent_dst_dmabuf(be_dst, plane)));  /* new fd owned by the caller; NOTE(review): dmabuf_fd() receives NULL for an empty/out-of-range plane - confirm it copes */
++}
++
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++                struct media_request **const pmreq,
++                struct qent_src **const psrc_be,
++                struct qent_dst *const dst_be,
++                const bool is_final)
++{
++    struct media_request * mreq = *pmreq;
++    struct qent_src *const src_be = *psrc_be;
++
++    // Req & src are always both "consumed"
++    *pmreq = NULL;
++    *psrc_be = NULL;
++
++    pthread_mutex_lock(&mbc->lock);
++
++    if (!src_be)
++        goto fail1;
++
++    if (dst_be) {                   /* dst_be is optional; when given it is queued before the src buffer */
++        if (qe_dst_waiting(dst_be)) {
++            request_info(mbc->dc, "Request buffer already waiting on start\n");
++            goto fail1;
++        }
++        dst_be->base.timestamp = (struct timeval){0,0};
++        if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
++            goto fail1;
++
++        qent_dst_ref(dst_be);       /* reference held on behalf of the in-use queue; dropped by qe_dst_done() */
++        queue_put_inuse(mbc->dst, &dst_be->base);
++    }
++    /* hold_flag is !is_final: the capture buffer is held unless this is the final request */
++    if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
++        goto fail1;
++    queue_put_inuse(mbc->src, &src_be->base);
++
++    if (!mbc->polling && mediabufs_wants_poll(mbc)) {
++        mbc->polling = true;
++        pollqueue_add_task(mbc->pt, 2000);
++    }
++    pthread_mutex_unlock(&mbc->lock);
++    /* NOTE(review): mreq is dereferenced here even if NULL, and a failure leaves the buffers queued above in place */
++    if (media_request_start(mreq))
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    return MEDIABUFS_STATUS_SUCCESS;
++
++fail1:                              /* reached with mbc->lock held */
++    media_request_abort(&mreq);
++    if (src_be)
++        queue_put_free(mbc->src, &src_be->base);
++
++// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q
++    if (dst_be) {                   /* NOTE(review): qe_dst_done() drops a reference even on paths that never called qent_dst_ref() */
++        dst_be->base.status = QENT_ERROR;
++        qe_dst_done(dst_be);
++    }
++    pthread_mutex_unlock(&mbc->lock);
++    return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++static int qe_alloc_from_fmt(struct qent_base *const be,
++                   struct dmabufs_ctl *const dbsc,
++                   const struct v4l2_format *const fmt)        /* size be's plane buffers from fmt; 0 on success, -1 on failure */
++{
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        unsigned int i;
++        for (i = 0; i != fmt->fmt.pix_mp.num_planes; ++i) {    /* one buffer per plane, sized by that plane's sizeimage */
++            be->dh[i] = dmabuf_realloc(dbsc, be->dh[i],
++                fmt->fmt.pix_mp.plane_fmt[i].sizeimage);
++            /* On failure tidy up and die */
++            if (!be->dh[i]) {
++                while (i--) {                                  /* release the planes allocated before the failing one */
++                    dmabuf_free(be->dh[i]);
++                    be->dh[i] = NULL;
++                }
++                return -1;
++            }
++        }
++    }
++    else {
++//      be->dh[0] = dmabuf_alloc(dbsc, fmt->fmt.pix.sizeimage);
++        size_t size = fmt->fmt.pix.sizeimage;
++        be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);     /* single-planar: only dh[0] is used */
++        if (!be->dh[0])
++            return -1;
++    }
++    return 0;
++}
++
++static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
++            const enum v4l2_buf_type buftype,
++            uint32_t pixfmt,
++            const unsigned int width, const unsigned int height,
++                               const size_t bufsize)
++{
++    *fmt = (struct v4l2_format){.type = buftype};       /* start from an all-zero format of the requested type */
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++        fmt->fmt.pix_mp.width = width;
++        fmt->fmt.pix_mp.height = height;
++        fmt->fmt.pix_mp.pixelformat = pixfmt;
++        if (bufsize) {                                  /* a non-zero bufsize requests a single plane of that size */
++            fmt->fmt.pix_mp.num_planes = 1;
++            fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
++        }
++    }
++    else {
++        fmt->fmt.pix.width = width;
++        fmt->fmt.pix.height = height;
++        fmt->fmt.pix.pixelformat = pixfmt;
++        fmt->fmt.pix.sizeimage = bufsize;
++    }
++    /* VIDIOC_S_FMT writes the format actually selected back into *fmt; only EINTR is retried */
++    while (ioctl(fd, VIDIOC_S_FMT, fmt))
++        if (errno != EINTR)
++            return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    // Treat anything where we don't get at least what we asked for as a fail
++    if (V4L2_TYPE_IS_MULTIPLANAR(buftype)) {
++        if (fmt->fmt.pix_mp.width < width ||
++            fmt->fmt.pix_mp.height < height ||
++            fmt->fmt.pix_mp.pixelformat != pixfmt) {
++            return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++        }
++    }
++    else {
++        if (fmt->fmt.pix.width < width ||
++            fmt->fmt.pix.height < height ||
++            fmt->fmt.pix.pixelformat != pixfmt) {
++            return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++        }
++    }
++
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++
++static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
++                   const int fd,
++                   const unsigned int type_v4l2,
++                   const uint32_t flags_must,
++                   const uint32_t flags_not,
++                   const unsigned int width,
++                   const unsigned int height,
++                   mediabufs_dst_fmt_accept_fn *const accept_fn,
++                   void *const accept_v)
++{
++    unsigned int i;
++    /* Walk the driver's format list in index order and keep the first one that can be set */
++    for (i = 0;; ++i) {
++        struct v4l2_fmtdesc fmtdesc = {
++            .index = i,
++            .type = type_v4l2
++        };
++        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
++            if (errno != EINTR)                         /* any other error (including end of list) ends the search */
++                return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
++        }
++        if ((fmtdesc.flags & flags_must) != flags_must ||      /* all flags_must bits required, no flags_not bit allowed */
++            (fmtdesc.flags & flags_not))
++            continue;
++        if (!accept_fn(accept_v, &fmtdesc))             /* accept_fn is called unconditionally, so it must not be NULL */
++            continue;
++
++        if (fmt_set(fmt, fd, fmtdesc.type, fmtdesc.pixelformat,
++                width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
++            return MEDIABUFS_STATUS_SUCCESS;
++    }
++    return 0;       /* not reached: the loop above only exits by returning */
++}
++
++
++/* Block until the entry is no longer marked waiting, then report its status */
++
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
++{
++    enum qent_status final;
++
++    pthread_mutex_lock(&be_dst->lock);
++    while (be_dst->waiting) {
++        /* A failing cond_wait abandons the wait rather than spinning */
++        if (pthread_cond_wait(&be_dst->cond, &be_dst->lock) != 0)
++            break;
++    }
++    final = be_dst->base.status;
++    pthread_mutex_unlock(&be_dst->lock);
++    if (final == QENT_DONE)
++        return MEDIABUFS_STATUS_SUCCESS;
++    return final == QENT_ERROR ? MEDIABUFS_ERROR_DECODING_ERROR : MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
++{
++    /* Mapped address of plane buf_no; NULL for an out-of-range plane (as qent_dst_dmabuf does) */
++    return buf_no >= VIDEO_MAX_PLANES ? NULL : dmabuf_map(be_dst->base.dh[buf_no]);
++}
++
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
++{
++    struct qent_base *const be = &be_dst->base;
++    unsigned int i;
++    for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {     /* every populated plane, stopping at the first empty slot */
++        if (dmabuf_read_start(be->dh[i])) {
++            while (i--)                                         /* undo the planes already started */
++                dmabuf_read_end(be->dh[i]);
++            return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++        }
++    }
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
++{
++    struct qent_base *const be = &be_dst->base;
++    unsigned int i;
++    MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++    /* Unlike read_start, a failure does not stop the loop: every populated plane is ended */
++    for (i = 0; i != VIDEO_MAX_PLANES && be->dh[i]; ++i) {
++        if (dmabuf_read_end(be->dh[i]))
++            status = MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++    return status;
++}
++
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
++{
++    if (be_dst)                                         /* NULL is passed straight through */
++        atomic_fetch_add(&be_dst->base.ref_count, 1);
++    return be_dst;                                      /* same pointer, for call chaining */
++}
++
++void qent_dst_unref(struct qent_dst ** const pbe_dst)      /* drop one reference and clear the caller's pointer */
++{
++    struct qent_dst * const be_dst = *pbe_dst;
++    struct mediabufs_ctl * mbc;
++    if (!be_dst)
++        return;
++    *pbe_dst = NULL;
++    /* ref_count of 0 means one holder, so only the caller that sees 0 here was the last */
++    if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0)
++        return;
++    /* Last reference: recycle into the owning pool if the mediabufs_ctl still exists, else free */
++    if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
++        queue_put_free(mbc->dst, &be_dst->base);
++        ff_weak_link_unlock(be_dst->mbc_wl);
++    }
++    else {
++        qe_dst_free(be_dst);
++    }
++}
++
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++                unsigned int plane,
++                int fd, size_t size)
++{
++    struct qent_base *const be = &be_dst->base;
++    struct dmabuf_h * dh;
++    /* Only QENT_IMPORT entries accept buffers, one per plane, and plane must index dh[] */
++    if (plane >= VIDEO_MAX_PLANES || be->status != QENT_IMPORT || be->dh[plane])
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    dh = dmabuf_import(fd, size);
++    if (!dh)
++        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++    /* The new handle now belongs to the entry (released by qe_base_uninit) */
++    be->dh[plane] = dh;
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++
++// Returns noof buffers created, -ve for error
++static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
++{
++    unsigned int i;
++
++    struct v4l2_create_buffers cbuf = {
++        .count = n,
++        .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype),
++        .format = mbc->dst_fmt,
++    };
++
++    while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
++        const int err = -errno;
++        if (err != EINTR) {
++            request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
++            return -err;
++        }
++    }
++
++    if (cbuf.count != n)
++        request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
++
++    for (i = 0; i != cbuf.count; ++i)
++        qes[i]->base.index = cbuf.index + i;
++
++    return cbuf.count;
++}
++
++static MediaBufsStatus
++qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt,
++                   const unsigned int n, const bool x_dmabuf)
++{
++    struct v4l2_buffer buf = {
++        .index = n,
++        .type = fmt->type,
++    };
++    struct v4l2_plane planes[VIDEO_MAX_PLANES];
++    int ret;
++    /* Nothing to do if the entry already has a buffer in plane 0 */
++    if (be->dh[0])
++        return 0;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        memset(planes, 0, sizeof(planes));
++        buf.m.planes = planes;
++        buf.length = VIDEO_MAX_PLANES;
++    }
++    /* Ask the driver to describe buffer n; this ioctl is not retried on EINTR */
++    if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) {
++        request_err(mbc->dc, "VIDIOC_QUERYBUF failed");
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type))
++    {
++        unsigned int i;
++        for (i = 0; i != buf.length; ++i) {
++            if (x_dmabuf) {             /* export the plane as a dmabuf fd and wrap that */
++                struct v4l2_exportbuffer xbuf = {
++                    .type = buf.type,
++                    .index = buf.index,
++                    .plane = i,
++                    .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
++                };
++                if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
++                    be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length);   /* NOTE(review): ownership of xbuf.fd if the import fails is not visible here */
++            }
++            else {                      /* otherwise mmap the plane from the video fd and wrap the mapping */
++                be->dh[i] = dmabuf_import_mmap(
++                    mmap(NULL, planes[i].length,
++                        PROT_READ | PROT_WRITE,
++                        MAP_SHARED | MAP_POPULATE,
++                        mbc->vfd, planes[i].m.mem_offset),
++                    planes[i].length);  /* NOTE(review): the mmap result is passed on unchecked - presumably dmabuf_import_mmap rejects MAP_FAILED; confirm */
++            }
++            /* On failure tidy up and die */
++            if (!be->dh[i]) {
++                while (i--) {
++                    dmabuf_free(be->dh[i]);
++                    be->dh[i] = NULL;
++                }
++                return MEDIABUFS_ERROR_OPERATION_FAILED;
++            }
++        }
++    }
++    else
++    {
++        if (x_dmabuf) {
++            struct v4l2_exportbuffer xbuf = {
++                .type = buf.type,
++                .index = buf.index,
++                .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
++            };
++            if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
++                be->dh[0] = dmabuf_import(xbuf.fd, buf.length);
++        }
++        else {
++            be->dh[0] = dmabuf_import_mmap(
++                mmap(NULL, buf.length,
++                    PROT_READ | PROT_WRITE,
++                    MAP_SHARED | MAP_POPULATE,
++                    mbc->vfd, buf.m.offset),
++                buf.length);
++        }
++        /* On failure tidy up and die */
++        if (!be->dh[0]) {
++            return MEDIABUFS_ERROR_OPERATION_FAILED;
++        }
++    }
++
++    return 0;       /* presumably equal to MEDIABUFS_STATUS_SUCCESS - confirm against the enum */
++}
++// Get a dst qent; NULL on failure. With a non-NULL mbc the result has status QENT_PENDING and ref_count 0
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
++{
++    struct qent_dst * be_dst;
++
++    if (mbc == NULL) {  // No mbc: return a new DMABUF qent marked QENT_IMPORT, nothing else is set up here
++        be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF);
++        if (be_dst)
++            be_dst->base.status = QENT_IMPORT;
++        return be_dst;
++    }
++
++    if (mbc->dst_fixed) {  // Fixed pool: only ever take a slot from the free Q
++        be_dst = base_to_dst(queue_get_free(mbc->dst));
++        if (!be_dst)
++            return NULL;
++    }
++    else {  // Otherwise use a free slot if there is one, else create a new qent + one V4L2 buffer
++        be_dst = base_to_dst(queue_tryget_free(mbc->dst));
++        if (!be_dst) {
++            be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype);
++            if (!be_dst)
++                return NULL;
++
++            if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
++                qe_dst_free(be_dst);
++                return NULL;
++            }
++        }
++    }
++
++    if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) {
++        if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) {  // true => export as dmabuf
++            request_err(mbc->dc, "Failed to export as dmabuf\n");
++            queue_put_free(mbc->dst, &be_dst->base);
++            return NULL;
++        }
++    }
++    else {
++        if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
++            /* Given  how create buf works we can't uncreate it on alloc failure
++             * all we can do is put it on the free Q
++            */
++            queue_put_free(mbc->dst, &be_dst->base);
++            return NULL;
++        }
++    }
++
++    be_dst->base.status = QENT_PENDING;
++    atomic_store(&be_dst->base.ref_count, 0);
++    return be_dst;
++}
++// Returns a pointer to the dst format stored inside mbc (not a copy)
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
++{
++    return &mbc->dst_fmt;
++}
++// Pick the dst format via find_fmt_flags: first pass excludes V4L2_FMT_FLAG_EMULATED, second pass requires it
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++               const unsigned int width,
++               const unsigned int height,
++               mediabufs_dst_fmt_accept_fn *const accept_fn,
++               void *const accept_v)
++{
++    MediaBufsStatus status;
++    unsigned int i;
++    const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
++    static const struct {
++        unsigned int flags_must;
++        unsigned int flags_not;
++    } trys[] = {
++        {0, V4L2_FMT_FLAG_EMULATED},
++        {V4L2_FMT_FLAG_EMULATED, 0},
++    };
++    for (i = 0; i != sizeof(trys)/sizeof(trys[0]); ++i) {
++        status = find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
++                                buf_type,
++                                trys[i].flags_must,
++                                trys[i].flags_not,
++                                width, height, accept_fn, accept_v);
++        if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE)  // Success or any other error ends the search
++            return status;
++    }
++
++    if (status != MEDIABUFS_STATUS_SUCCESS)
++        return status;
++
++    /* NOTE(review): only reached with status == MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, so the test above is always true */
++    return status;
++}
++// Create n (<= 32) dst qents plus their V4L2 buffers and put them on the dst free Q
++// ** This is a mess if we get partial alloc but without any way to remove
++//    individual V4L2 Q members we are somewhat stuffed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype)
++{
++    unsigned int i;
++    int a = 0;  // Count returned by create_dst_bufs (< 0 is treated as failure)
++    unsigned int qc;  // Number of qents created so far
++    struct qent_dst * qes[32];
++
++    if (n > 32)
++        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++
++    mbc->dst->memtype = memtype;
++
++    // Create qents first as it is hard to get rid of the V4L2 buffers on error
++    for (qc = 0; qc != n; ++qc)
++    {
++        if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL)
++            goto fail;
++    }
++
++    if ((a = create_dst_bufs(mbc, n, qes)) < 0)
++        goto fail;
++
++    for (i = 0; i != a; ++i)
++        queue_put_free(mbc->dst, &qes[i]->base);
++
++    if (a != n)  // Partial alloc: the a slots queued above stay on the free Q but failure is still reported
++        goto fail;
++
++    mbc->dst_fixed = fixed;
++    return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++    for (i = (a < 0 ? 0 : a); i != qc; ++i)  // Free only the qents that were not put on the free Q
++        qe_dst_free(qes[i]);
++
++    return MEDIABUFS_ERROR_ALLOCATION_FAILED;
++}
++// Take a src qent from the free Q and mark it QENT_PENDING; returns NULL if queue_get_free gave us nothing
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
++{
++    struct qent_base * buf = queue_get_free(mbc->src);
++    if (buf) buf->status = QENT_PENDING;  // queue_get_free can return NULL (the dst path checks for it too)
++    return buf ? base_to_src(buf) : NULL;
++}
++// Put *pqe_src back on the src free Q and set *pqe_src to NULL; does nothing if it is already NULL
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
++{
++    struct qent_src *const qe_src = *pqe_src;
++    if (!qe_src)
++        return;
++    *pqe_src = NULL;
++    queue_put_free(mbc->src, &qe_src->base);
++}
++// Check whether memory type m is usable for format f on mbc->vfd, using the capabilities VIDIOC_CREATE_BUFS reports
++static MediaBufsStatus
++chk_memory_type(struct mediabufs_ctl *const mbc,
++    const struct v4l2_format * const f,
++    const enum mediabufs_memory m)
++{
++    struct v4l2_create_buffers cbuf = {
++        .count = 0,  // Nothing is asked for; the call is only made to read back cbuf.capabilities
++        .memory = V4L2_MEMORY_MMAP,  // Always probed as MMAP whatever m is
++        .format = *f
++    };
++
++    if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0)
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++
++    switch (m) {
++    case MEDIABUFS_MEMORY_DMABUF:
++        // 0 = Unknown but assume not in that case
++        if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0)
++            return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
++        break;
++    case MEDIABUFS_MEMORY_MMAP:  // Accepted without looking at capabilities
++        break;
++    default:
++        return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
++    }
++
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++// chk_memory_type() applied to the current src format
++MediaBufsStatus
++mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
++{
++    return chk_memory_type(mbc, &mbc->src_fmt, memtype);
++}
++// chk_memory_type() applied to the current dst format
++MediaBufsStatus
++mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
++{
++    return chk_memory_type(mbc, &mbc->dst_fmt, memtype);
++}
++
++/* Create the src buffer pool (up to n buffers of memtype); src format must have been set up before this */
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
++                  struct dmabufs_ctl * const dbsc,
++                  unsigned int n, const enum mediabufs_memory memtype)
++{
++    unsigned int i;
++    struct v4l2_requestbuffers req = {
++        .count = n,
++        .type = mbc->src_fmt.type,
++        .memory = mediabufs_memory_to_v4l2(memtype)
++    };
++
++    bq_free_all_free_src(mbc->src);
++
++    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
++        if (errno != EINTR) {
++            request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
++            return MEDIABUFS_ERROR_OPERATION_FAILED;
++        }
++    }
++
++    if (n > req.count) {  // Fewer buffers than requested: carry on with what we got
++        request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
++        n = req.count;
++    }
++
++    for (i = 0; i != n; ++i) {
++        struct qent_src *const be_src = qe_src_new(memtype);
++        if (!be_src) {
++            request_err(mbc->dc, "Failed to create src be %d\n", i);
++            goto fail;
++        }
++        switch (memtype) {
++        case MEDIABUFS_MEMORY_MMAP:
++            if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) {
++                qe_src_free(be_src);
++                goto fail;
++            }
++            be_src->fixed_size = 1;
++            break;
++        case MEDIABUFS_MEMORY_DMABUF:
++            if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
++                qe_src_free(be_src);
++                goto fail;
++            }
++            be_src->fixed_size = !mediabufs_src_resizable(mbc);
++            break;
++        default:
++            request_err(mbc->dc, "Unexpected memorty type\n");
++            qe_src_free(be_src);  // Not on the free Q yet, so the fail path below would not free it
++            goto fail;
++        }
++        be_src->base.index = i;
++        queue_put_free(mbc->src, &be_src->base);
++    }
++
++    mbc->src->memtype = memtype;
++    return MEDIABUFS_STATUS_SUCCESS;
++
++fail:
++    bq_free_all_free_src(mbc->src);
++    req.count = 0;  // Ask the driver to release the buffers requested above
++    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
++           errno == EINTR)
++        /* Loop */;
++
++    return MEDIABUFS_ERROR_OPERATION_FAILED;
++}
++
++
++/*
++ * Start streaming on the src queue then the dst queue; no-op if already on.
++ * Set stuff order:
++ *  Set src fmt
++ *  Set parameters (sps) on vfd
++ *  Negotiate dst format (dst_fmt_set)
++ *  Create src buffers
++ *  Alloc a dst buffer or Create dst slots
++*/
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
++{
++    if (mbc->stream_on)
++        return MEDIABUFS_STATUS_SUCCESS;
++
++    if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
++        request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++
++    if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
++        request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
++        set_stream(mbc->vfd, mbc->src_fmt.type, false);  // Undo the src stream-on done above
++        return MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++
++    mbc->stream_on = true;
++    return MEDIABUFS_STATUS_SUCCESS;
++}
++// Stop streaming on dst then src. Both are always attempted and stream_on is cleared even if one fails
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
++{
++    MediaBufsStatus status = MEDIABUFS_STATUS_SUCCESS;
++
++    if (!mbc->stream_on)
++        return MEDIABUFS_STATUS_SUCCESS;
++
++    if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
++        request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
++        status = MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++
++    if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
++        request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
++        status = MEDIABUFS_ERROR_OPERATION_FAILED;
++    }
++
++    mbc->stream_on = false;
++    return status;
++}
++// Set n controls with VIDIOC_S_EXT_CTRLS, attached to request mreq if it is non-NULL. Returns 0 or -errno
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
++{
++    struct v4l2_ext_controls controls = {
++        .controls = control_array,
++        .count = n
++    };
++
++    if (mreq) {
++        controls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
++        controls.request_fd = media_request_fd(mreq);
++    }
++
++    while (ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &controls))  // Retried for as long as it fails with EINTR
++    {
++        const int err = errno;
++        if (err != EINTR) {
++            request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
++            return -err;
++        }
++    }
++
++    return 0;
++}
++// Set one control (id, with payload data/size passed via .ptr) through mediabufs_ctl_set_ext_ctrls
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++                struct media_request * const mreq,
++                unsigned int id, void *data,
++                unsigned int size)
++{
++    struct v4l2_ext_control control = {
++        .id = id,
++        .ptr = data,
++        .size = size
++    };
++
++    int rv = mediabufs_ctl_set_ext_ctrls(mbc, mreq, &control, 1);
++    return !rv ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_OPERATION_FAILED;  // The -errno detail is dropped here
++}
++// Call fmt_set on mbc->src_fmt with the given type/pixfmt/size and log an error if it fails
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++                                      enum v4l2_buf_type buf_type,
++                   const uint32_t pixfmt,
++                   const uint32_t width, const uint32_t height,
++                                      const size_t bufsize)
++{
++    MediaBufsStatus rv = fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
++    if (rv != MEDIABUFS_STATUS_SUCCESS)
++        request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
++
++    return rv;
++}
++// Query each of ctrls[0..n); entries that fail get type = 0. Returns 0, or -errno of the last entry that failed
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
++{
++    int rv = 0;
++    while (n--) {
++        while (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, ctrls)) {  // Retried for as long as it fails with EINTR
++            const int err = errno;
++            if (err != EINTR) {
++                // Often used for probing - errors are to be expected
++                request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", ctrls->id, err);
++                ctrls->type = 0; // 0 is invalid
++                rv = -err;
++                break;
++            }
++        }
++        ++ctrls;  // Carry on with the next entry even after a failure
++    }
++    return rv;
++}
++// Non-zero if src buffers may be larger than negotiated; currently compiled to always return 0 (#if 1)
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
++{
++#if 1
++    return 0;
++#else
++    // Single planar OUTPUT can only take exact size buffers
++    // Multiplanar will take larger than negotiated
++    return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
++#endif
++}
++// Tear down and free mbc (NULL is ignored); called by mediabufs_ctl_unref when the ref count runs out
++static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
++{
++    if (!mbc)
++        return;
++
++    // Break the weak link first
++    ff_weak_link_break(&mbc->this_wlm);
++
++    polltask_delete(&mbc->pt);
++
++    mediabufs_stream_off(mbc);
++
++    // Empty v4l2 buffer stash
++    request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
++    request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
++
++    bq_free_all_free_src(mbc->src);
++    bq_free_all_inuse_src(mbc->src);
++    bq_free_all_free_dst(mbc->dst);
++
++    {
++        struct qent_dst *dst_be;
++        while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {  // In-use dst qents are completed as errors
++            dst_be->base.timestamp = (struct timeval){0};
++            dst_be->base.status = QENT_ERROR;
++            qe_dst_done(dst_be);
++        }
++    }
++
++    queue_delete(mbc->dst);
++    queue_delete(mbc->src);
++    close(mbc->vfd);
++    pthread_mutex_destroy(&mbc->lock);
++
++    free(mbc);
++}
++// Take an additional reference on mbc; returns mbc for convenience
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
++{
++    atomic_fetch_add(&mbc->ref_count, 1);
++    return mbc;
++}
++// Drop the reference held in *pmbc and set *pmbc to NULL; the mbc is deleted when the count was already 0
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
++{
++    struct mediabufs_ctl *const mbc = *pmbc;
++    int n;
++
++    if (!mbc)
++        return;
++    *pmbc = NULL;
++    n = atomic_fetch_sub(&mbc->ref_count, 1);  // n is the value before the decrement
++    if (n)  // Count is zero-based: 0 here means this was the last reference
++        return;
++    mediabufs_ctl_delete(mbc);
++}
++// Returns the version field of the capability struct filled in by VIDIOC_QUERYCAP in set_capabilities
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
++{
++    return mbc->capability.version;
++}
++// Query device capabilities and set the src/dst buffer types from them. Returns 0, -errno or -EINVAL (no M2M)
++static int set_capabilities(struct mediabufs_ctl *const mbc)
++{
++    uint32_t caps;
++
++    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
++        int err = errno;
++        request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
++        return -err;
++    }
++
++    caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?  // Use device_caps when flagged as valid
++            mbc->capability.device_caps :
++            mbc->capability.capabilities;
++
++    if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {  // Multi-planar M2M is checked first
++        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
++        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
++    }
++    else if ((caps & V4L2_CAP_VIDEO_M2M) != 0) {
++        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
++        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
++    }
++    else {
++        request_err(mbc->dc, "No M2M capabilities (%#x)\n", caps);
++        return -EINVAL;
++    }
++
++    return 0;
++}
++
++/* One of these per context: opens vpath (default "/dev/media0") and returns a new mbc, or NULL on any failure */
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
++{
++    struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
++
++    if (!mbc)
++        return NULL;
++
++    mbc->dc = dc;
++    mbc->pq = pq;
++    pthread_mutex_init(&mbc->lock, NULL);
++
++    /* Pick a default  - could we scan for this? */
++    if (vpath == NULL)
++        vpath = "/dev/media0";
++
++    while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
++    {
++        const int err = errno;
++        if (err != EINTR) {
++            request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
++            goto fail0;
++        }
++    }
++
++    if (set_capabilities(mbc)) {  // Also sets the src/dst buffer types
++        request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
++        goto fail1;
++    }
++
++    mbc->src = queue_new(mbc->vfd);
++    if (!mbc->src)
++        goto fail1;
++    mbc->dst = queue_new(mbc->vfd);
++    if (!mbc->dst)
++        goto fail2;
++    mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
++    if (!mbc->pt)
++        goto fail3;
++    mbc->this_wlm = ff_weak_link_new(mbc);
++    if (!mbc->this_wlm)
++        goto fail4;
++
++    /* Cannot add polltask now - polling with nothing pending
++     * generates infinite error polls
++    */
++    return mbc;
++
++fail4:
++    polltask_delete(&mbc->pt);
++fail3:
++    queue_delete(mbc->dst);
++fail2:
++    queue_delete(mbc->src);
++fail1:
++    close(mbc->vfd);
++fail0:
++    pthread_mutex_destroy(&mbc->lock);  // Initialised before anything that can jump to a fail label
++    free(mbc);
++    request_info(dc, "%s: FAILED\n", __func__);
++    return NULL;
++}
++
++
++
+diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
+new file mode 100644
+index 0000000000..890947b2e2
+--- /dev/null
++++ b/libavcodec/v4l2_req_media.h
+@@ -0,0 +1,171 @@
++/*
++e.h
++*
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the
++ * "Software"), to deal in the Software without restriction, including
++ * without limitation the rights to use, copy, modify, merge, publish,
++ * distribute, sub license, and/or sell copies of the Software, and to
++ * permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the
++ * next paragraph) shall be included in all copies or substantial portions
++ * of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
++ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
++ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
++ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
++ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
++ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
++ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
++ */
++
++#ifndef _MEDIA_H_
++#define _MEDIA_H_
++
++#include <stdbool.h>
++#include <stdint.h>
++
++struct v4l2_format;
++struct v4l2_fmtdesc;
++struct v4l2_query_ext_ctrl;
++
++struct pollqueue;
++struct media_request;
++struct media_pool;
++
++typedef enum media_buf_status {
++    MEDIABUFS_STATUS_SUCCESS = 0,
++    MEDIABUFS_ERROR_OPERATION_FAILED,
++    MEDIABUFS_ERROR_DECODING_ERROR,
++    MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
++    MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
++    MEDIABUFS_ERROR_ALLOCATION_FAILED,
++    MEDIABUFS_ERROR_UNSUPPORTED_MEMORY,
++} MediaBufsStatus;
++
++struct media_pool * media_pool_new(const char * const media_path,
++                   struct pollqueue * const pq,
++                   const unsigned int n);
++void media_pool_delete(struct media_pool ** pmp);
++
++// Obtain a media request
++// Will block if none available - has a 2sec timeout
++struct media_request * media_request_get(struct media_pool * const mp);
++int media_request_fd(const struct media_request * const req);
++
++// Start this request
++// Request structure is returned to pool once done
++int media_request_start(struct media_request * const req);
++
++// Return an *unstarted* media_request to the pool
++// May later be upgraded to allow for aborting a started req
++int media_request_abort(struct media_request ** const preq);
++
++
++struct mediabufs_ctl;
++struct qent_src;
++struct qent_dst;
++struct dmabuf_h;
++struct dmabufs_ctl;
++
++// 1-1 mapping to V4L2 type - just defined separately to avoid some include versioning difficulties
++enum mediabufs_memory {
++   MEDIABUFS_MEMORY_UNSET            = 0,
++   MEDIABUFS_MEMORY_MMAP             = 1,
++   MEDIABUFS_MEMORY_USERPTR          = 2,
++   MEDIABUFS_MEMORY_OVERLAY          = 3,
++   MEDIABUFS_MEMORY_DMABUF           = 4,
++};
++
++int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
++struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
++
++// prealloc
++int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
++// dbsc may be NULL if realloc not required
++int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
++const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
++int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
++MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
++void qent_dst_delete(struct qent_dst *const be);
++// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
++void qent_dst_unref(struct qent_dst ** const pbe_dst);
++struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
++
++const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
++MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
++MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
++/* Import an fd unattached to any mediabuf */
++MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
++                unsigned int plane,
++                int fd, size_t size);
++
++const char * mediabufs_memory_name(const enum mediabufs_memory m);
++
++MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
++                struct media_request **const pmreq,
++                struct qent_src **const psrc_be,
++                struct qent_dst *const dst_be,
++                const bool is_final);
++// Get / alloc a dst buffer & associate with a slot
++// If the dst pool is empty then behaviour depends on the fixed flag passed to
++// dst_slots_create.  Default is !fixed = unlimited alloc
++struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
++                           struct dmabufs_ctl *const dbsc);
++// Create dst slots without alloc
++// If fixed true then qent_alloc will only get slots from this pool and will
++// block until a qent has been unrefed
++MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype);
++
++MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
++MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
++const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
++
++typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
++
++MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
++               const unsigned int width,
++               const unsigned int height,
++               mediabufs_dst_fmt_accept_fn *const accept_fn,
++               void *const accept_v);
++struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
++void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
++
++int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
++                                struct v4l2_ext_control control_array[], unsigned int n);
++MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
++                struct media_request * const mreq,
++                unsigned int id, void *data,
++                unsigned int size);
++int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
++
++int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
++
++MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
++                                      enum v4l2_buf_type buf_type,
++                                      const uint32_t pixfmt,
++                                      const uint32_t width, const uint32_t height,
++                                      const size_t bufsize);
++
++MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
++                  struct dmabufs_ctl * const dbsc,
++                  unsigned int n,
++                  const enum mediabufs_memory memtype);
++
++// Want to have appropriate formats set first
++MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
++MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
++
++#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
++unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
++
++struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
++                     const char *vpath, struct pollqueue *const pq);
++void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
++struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
++
++
++#endif
+diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c
+new file mode 100644
+index 0000000000..cc8a5d4001
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.c
+@@ -0,0 +1,361 @@
++#include <errno.h>
++#include <limits.h>
++#include <poll.h>
++#include <pthread.h>
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <stdbool.h>
++#include <stdlib.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <string.h>
++#include <unistd.h>
++#include <sys/eventfd.h>
++
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_utils.h"
++
++
++struct pollqueue;
++
++enum polltask_state {
++    POLLTASK_UNQUEUED = 0,  // Not on the Q and callback not running
++    POLLTASK_QUEUED,        // Put on the Q by pollqueue_add_task
++    POLLTASK_RUNNING,       // Taken off the Q, callback in progress
++    POLLTASK_Q_KILL,        // polltask_delete called while the callback was not running
++    POLLTASK_RUN_KILL,      // Kill requested and the callback is (or is about to be) running
++};
++
++struct polltask {
++    struct polltask *next;  // Doubly linked Q links
++    struct polltask *prev;
++    struct pollqueue *q;  // Queue this task belongs to; a ref is held from polltask_new to polltask_delete
++    enum polltask_state state;
++
++    int fd;  // fd and events handed to poll()
++    short events;
++
++    void (*fn)(void *v, short revents);  // Callback run from poll_thread with v and the poll revents
++    void * v;
++
++    uint64_t timeout; /* CLOCK_MONOTONIC time in ms, 0 => never */
++    sem_t kill_sem;  // Posted by poll_thread once a kill request has been handled; polltask_delete waits on it
++};
++
++struct pollqueue {
++    atomic_int ref_count;
++    pthread_mutex_t lock;  // Held around all task list and task state changes in this file
++
++    struct polltask *head;  // Task list: new tasks are appended at tail
++    struct polltask *tail;
++
++    bool kill;  // poll_thread leaves its loop once this is set
++    bool no_prod;  // Set while poll_thread is running callbacks; stops add/delete writing to prod_fd
++    int prod_fd;  // Written by pollqueue_prod, drained by prod_fn - presumably an eventfd that wakes poll()
++    struct polltask *prod_pt;  // Task that prod_fn re-queues; presumably the one watching prod_fd
++    pthread_t worker;
++};
++// Create a task (not yet queued) that calls fn(v, revents) for events on fd. NULL if events == 0 or no memory
++struct polltask *polltask_new(struct pollqueue *const pq,
++                              const int fd, const short events,
++                  void (*const fn)(void *v, short revents),
++                  void *const v)
++{
++    struct polltask *pt;
++
++    if (!events)
++        return NULL;
++
++    pt = malloc(sizeof(*pt));
++    if (!pt)
++        return NULL;
++
++    *pt = (struct polltask){  // Members not named here (state, timeout) are zero, so state == POLLTASK_UNQUEUED
++        .next = NULL,
++        .prev = NULL,
++        .q = pollqueue_ref(pq),
++        .fd = fd,
++        .events = events,
++        .fn = fn,
++        .v = v
++    };
++
++    sem_init(&pt->kill_sem, 0, 0);
++
++    return pt;
++}
++// Unlink pt from pq's task list and clear its links; the callers in this file hold pq->lock
++static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
++{
++    if (pt->prev)
++        pt->prev->next = pt->next;
++    else
++        pq->head = pt->next;
++    if (pt->next)
++        pt->next->prev = pt->prev;
++    else
++        pq->tail = pt->prev;
++    pt->next = NULL;
++    pt->prev = NULL;
++}
++// Destroy the task's semaphore and free its memory; does not touch the queue
++static void polltask_free(struct polltask * const pt)
++{
++    sem_destroy(&pt->kill_sem);
++    free(pt);
++}
++// Write a 64-bit 1 to prod_fd (presumably to wake the poll thread); returns the write() result
++static int pollqueue_prod(const struct pollqueue *const pq)
++{
++    static const uint64_t one = 1;
++    return write(pq->prod_fd, &one, sizeof(one));
++}
++// Delete *ppt (NULL is ignored): flag it as killed, wait for poll_thread to let go of it, free it, drop the pq ref
++void polltask_delete(struct polltask **const ppt)
++{
++    struct polltask *const pt = *ppt;
++    struct pollqueue * pq;
++    enum polltask_state state;
++    bool prodme;
++
++    if (!pt)
++        return;
++
++    pq = pt->q;
++    pthread_mutex_lock(&pq->lock);
++    state = pt->state;  // State before the kill was requested
++    pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
++    prodme = !pq->no_prod;
++    pthread_mutex_unlock(&pq->lock);
++
++    if (state != POLLTASK_UNQUEUED) {  // Was queued or running: wait until poll_thread posts kill_sem
++        if (prodme)
++            pollqueue_prod(pq);
++        while (sem_wait(&pt->kill_sem) && errno == EINTR)
++            /* loop */;
++    }
++
++    // Leave zapping the ref until we have DQed the PT as might well be
++    // legitimately used in it
++    *ppt = NULL;
++    polltask_free(pt);
++    pollqueue_unref(&pq);
++}
++// CLOCK_MONOTONIC now + timeout, in ms. Returns 0 only if the clock cannot be read
++static uint64_t pollqueue_now(int timeout)
++{
++    struct timespec now;
++    uint64_t now_ms;
++
++    if (clock_gettime(CLOCK_MONOTONIC, &now))
++        return 0;
++    now_ms = (now.tv_nsec / 1000000) + (uint64_t)now.tv_sec * 1000 + timeout;
++    return now_ms ? now_ms : (uint64_t)1;  // A computed 0 becomes 1, as 0 means "never" in polltask.timeout
++}
++// Append pt to the tail of its queue with a timeout in ms (< 0 => none); ignored if pt is being killed
++void pollqueue_add_task(struct polltask *const pt, const int timeout)
++{
++    bool prodme = false;
++    struct pollqueue * const pq = pt->q;
++
++    pthread_mutex_lock(&pq->lock);
++    if (pt->state != POLLTASK_Q_KILL && pt->state != POLLTASK_RUN_KILL) {
++        if (pq->tail)
++            pq->tail->next = pt;
++        else
++            pq->head = pt;
++        pt->prev = pq->tail;
++        pt->next = NULL;
++        pt->state = POLLTASK_QUEUED;
++        pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
++        pq->tail = pt;
++        prodme = !pq->no_prod;  // No prod while poll_thread is running callbacks
++    }
++    pthread_mutex_unlock(&pq->lock);
++    if (prodme)  // The prod write is done after the lock has been dropped
++        pollqueue_prod(pq);
++}
++// Thread body: poll() every queued task, then run the callbacks of those with events or an expired timeout
++static void *poll_thread(void *v)
++{
++    struct pollqueue *const pq = v;
++    struct pollfd *a = NULL;
++    size_t asize = 0;
++
++    pthread_mutex_lock(&pq->lock);
++    do {
++        unsigned int i;
++        unsigned int n = 0;
++        struct polltask *pt;
++        struct polltask *pt_next;
++        uint64_t now = pollqueue_now(0);
++        int timeout = -1;
++        int rv;
++
++        for (pt = pq->head; pt; pt = pt_next) {  // Pass 1: build the pollfd array and find the nearest timeout
++            int64_t t;
++
++            pt_next = pt->next;
++
++            if (pt->state == POLLTASK_Q_KILL) {  // Kill requested: unlink and release the waiting polltask_delete
++                pollqueue_rem_task(pq, pt);
++                sem_post(&pt->kill_sem);
++                continue;
++            }
++
++            if (n >= asize) {
++                asize = asize ? asize * 2 : 4;
++                a = realloc(a, asize * sizeof(*a));  // NOTE(review): on failure the old array is lost as a becomes NULL
++                if (!a) {
++                    request_log("Failed to realloc poll array to %zd\n", asize);
++                    goto fail_locked;
++                }
++            }
++
++            a[n++] = (struct pollfd){
++                .fd = pt->fd,
++                .events = pt->events
++            };
++
++            t = (int64_t)(pt->timeout - now);  // ms until this task times out; negative if overdue
++            if (pt->timeout && t < INT_MAX &&
++                (timeout < 0 || (int)t < timeout))
++                timeout = (t < 0) ? 0 : (int)t;
++        }
++        pthread_mutex_unlock(&pq->lock);
++
++        if ((rv = poll(a, n, timeout)) == -1) {
++            if (errno != EINTR) {
++                request_log("Poll error: %s\n", strerror(errno));
++                goto fail_unlocked;
++            }
++        }
++
++        pthread_mutex_lock(&pq->lock);
++        now = pollqueue_now(0);
++
++        /* Prodding in this loop is pointless and might lead to
++         * infinite looping
++        */
++        pq->no_prod = true;
++        for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {  // Pass 2: a[i] is paired with the i'th task on the Q
++            pt_next = pt->next;
++
++            /* Pending? */
++            if (a[i].revents ||
++                (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
++                pollqueue_rem_task(pq, pt);
++                if (pt->state == POLLTASK_QUEUED)
++                    pt->state = POLLTASK_RUNNING;
++                if (pt->state == POLLTASK_Q_KILL)
++                    pt->state = POLLTASK_RUN_KILL;
++                pthread_mutex_unlock(&pq->lock);  // The callback is run without the lock held
++
++                /* This can add new entries to the Q but as
++                 * those are added to the tail our existing
++                 * chain remains intact
++                */
++                pt->fn(pt->v, a[i].revents);
++
++                pthread_mutex_lock(&pq->lock);
++                if (pt->state == POLLTASK_RUNNING)
++                    pt->state = POLLTASK_UNQUEUED;
++                if (pt->state == POLLTASK_RUN_KILL)  // polltask_delete is waiting on kill_sem for this
++                    sem_post(&pt->kill_sem);
++            }
++        }
++        pq->no_prod = false;
++
++    } while (!pq->kill);
++
++fail_locked:
++    pthread_mutex_unlock(&pq->lock);
++fail_unlocked:
++    free(a);
++    return NULL;
++}
++
++static void prod_fn(void *v, short revents)
++{
++    struct pollqueue *const pq = v;
++    char buf[8];
++    if (revents)
++        read(pq->prod_fd, buf, 8);
++    if (!pq->kill)
++        pollqueue_add_task(pq->prod_pt, -1);
++}
++
++struct pollqueue * pollqueue_new(void)
++{
++    struct pollqueue *pq = malloc(sizeof(*pq));
++    if (!pq)
++        return NULL;
++    *pq = (struct pollqueue){
++        .ref_count = ATOMIC_VAR_INIT(0),
++        .lock = PTHREAD_MUTEX_INITIALIZER,
++        .head = NULL,
++        .tail = NULL,
++        .kill = false,
++        .prod_fd = -1
++    };
++
++    pq->prod_fd = eventfd(0, EFD_NONBLOCK);
++    if (pq->prod_fd == -1)  /* eventfd() reports failure as -1; 1 is a valid fd */
++        goto fail1;
++    pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
++    if (!pq->prod_pt)
++        goto fail2;
++    pollqueue_add_task(pq->prod_pt, -1);
++    if (pthread_create(&pq->worker, NULL, poll_thread, pq))
++        goto fail3;
++    // Reset ref count which will have been inced by the add_task
++    atomic_store(&pq->ref_count, 0);
++    return pq;
++
++fail3:
++    polltask_free(pq->prod_pt);
++fail2:
++    close(pq->prod_fd);
++fail1:
++    free(pq);
++    return NULL;
++}
++
++static void pollqueue_free(struct pollqueue *const pq)
++{
++    void *rv;
++
++    pthread_mutex_lock(&pq->lock);
++    pq->kill = true;
++    pollqueue_prod(pq);
++    pthread_mutex_unlock(&pq->lock);
++
++    pthread_join(pq->worker, &rv);
++    polltask_free(pq->prod_pt);
++    pthread_mutex_destroy(&pq->lock);
++    close(pq->prod_fd);
++    free(pq);
++}
++
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
++{
++    atomic_fetch_add(&pq->ref_count, 1);
++    return pq;
++}
++
++void pollqueue_unref(struct pollqueue **const ppq)
++{
++    struct pollqueue * const pq = *ppq;
++
++    if (!pq)
++        return;
++    *ppq = NULL;
++
++    if (atomic_fetch_sub(&pq->ref_count, 1) != 0)
++        return;
++
++    pollqueue_free(pq);
++}
++
++
++
+diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h
+new file mode 100644
+index 0000000000..e1182cb2fc
+--- /dev/null
++++ b/libavcodec/v4l2_req_pollqueue.h
+@@ -0,0 +1,18 @@
++#ifndef POLLQUEUE_H_
++#define POLLQUEUE_H_
++
++struct polltask;
++struct pollqueue;
++
++struct polltask *polltask_new(struct pollqueue *const pq,
++			      const int fd, const short events,
++			      void (*const fn)(void *v, short revents),
++			      void *const v);
++void polltask_delete(struct polltask **const ppt);
++
++void pollqueue_add_task(struct polltask *const pt, const int timeout);
++struct pollqueue * pollqueue_new(void);
++void pollqueue_unref(struct pollqueue **const ppq);
++struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
++
++#endif /* POLLQUEUE_H_ */
+diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h
+new file mode 100644
+index 0000000000..a31cc1f4ec
+--- /dev/null
++++ b/libavcodec/v4l2_req_utils.h
+@@ -0,0 +1,27 @@
++#ifndef AVCODEC_V4L2_REQ_UTILS_H
++#define AVCODEC_V4L2_REQ_UTILS_H
++
++#include <stdint.h>
++#include "libavutil/log.h"
++
++#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
++
++#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
++#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
++#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
++#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
++
++static inline char safechar(char c) {
++    return c > 0x20 && c < 0x7f ? c : '.';
++}
++
++static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
++    tbuf[0] = safechar((fcc >>  0) & 0xff);
++    tbuf[1] = safechar((fcc >>  8) & 0xff);
++    tbuf[2] = safechar((fcc >> 16) & 0xff);
++    tbuf[3] = safechar((fcc >> 24) & 0xff);
++    tbuf[4] = '\0';
++    return tbuf;
++}
++
++#endif
+diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
+new file mode 100644
+index 0000000000..fbec16a93e
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.c
+@@ -0,0 +1,347 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "config.h"
++#include "decode.h"
++#include "hevcdec.h"
++#include "hwconfig.h"
++
++#include "v4l2_request_hevc.h"
++
++#include "libavutil/hwcontext_drm.h"
++#include "libavutil/pixdesc.h"
++
++#include "v4l2_req_devscan.h"
++#include "v4l2_req_dmabufs.h"
++#include "v4l2_req_pollqueue.h"
++#include "v4l2_req_media.h"
++#include "v4l2_req_utils.h"
++
++static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
++{
++    const size_t wxh = (size_t)w * h;  /* widen first so w * h is not computed in unsigned int */
++    size_t bits_alloc;
++
++    /* Annex A gives a min compression of 2 @ lvl 3.1
++     * (wxh <= 983040) and min 4 thereafter but avoid
++     * the oddity of 983041 having a lower limit than
++     * 983040.
++     * Multiply by 3/2 for 4:2:0
++     */
++    bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
++        wxh < 983040 * 2 ? 983040 * 3 / 4 :
++        wxh * 3 / 8;
++    /* Allow for bit depth */
++    bits_alloc += (bits_alloc * bits_minus8) / 8;
++    /* Add a few bytes (16k) for overhead */
++    bits_alloc += 0x4000;
++    return bits_alloc;
++}
++
++static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
++                                     av_unused const uint8_t *buffer,
++                                     av_unused uint32_t size)
++{
++    const V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    return ctx->fns->start_frame(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
++{
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    return ctx->fns->decode_slice(avctx, buffer, size);
++}
++
++static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
++{
++    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
++    return ctx->fns->end_frame(avctx);
++}
++
++static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
++{
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    ctx->fns->abort_frame(avctx);
++}
++
++static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
++{
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    return ctx->fns->frame_params(avctx, hw_frames_ctx);
++}
++
++static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
++{
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    return ctx->fns->alloc_frame(avctx, frame);
++}
++
++
++static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
++{
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++
++    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    decode_q_wait(&ctx->decode_q, NULL);  // Wait for all other threads to be out of decode
++
++    mediabufs_ctl_unref(&ctx->mbufs);
++    media_pool_delete(&ctx->mpool);
++    pollqueue_unref(&ctx->pq);
++    dmabufs_ctl_unref(&ctx->dbufs);
++    devscan_delete(&ctx->devscan);
++
++    decode_q_uninit(&ctx->decode_q);
++
++//    if (avctx->hw_frames_ctx) {
++//        AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
++//        av_buffer_pool_flush(hwfc->pool);
++//    }
++    return 0;
++}
++
++static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
++{
++    AVCodecContext *const avctx = v;
++    const HEVCContext *const h = avctx->priv_data;
++
++    if (h->ps.sps->bit_depth == 8) {
++        if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 ||
++            fmtdesc->pixelformat == V4L2_PIX_FMT_NV12) {
++            return 1;
++        }
++    }
++    else if (h->ps.sps->bit_depth == 10) {
++        if (fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
++            return 1;
++        }
++    }
++    return 0;
++}
++
++static int v4l2_request_hevc_init(AVCodecContext *avctx)
++{
++    const HEVCContext *h = avctx->priv_data;
++    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
++    const HEVCSPS * const sps = h->ps.sps;
++    int ret;
++    const struct decdev * decdev;
++    const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2;  // Assuming constant for all APIs but avoiding V4L2 includes
++    size_t src_size;
++    enum mediabufs_memory src_memtype;
++    enum mediabufs_memory dst_memtype;
++
++    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    // Give up immediately if this is something that we have no code to deal with
++    if (h->ps.sps->chroma_format_idc != 1) {
++        av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
++        return AVERROR_PATCHWELCOME;
++    }
++    if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
++        h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
++        av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
++        return AVERROR_PATCHWELCOME;
++    }
++
++    if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
++        av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
++        return (AVERROR(-ret));
++    }
++    ret = AVERROR(ENOMEM);  // Assume mem fail by default for these
++
++    if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
++    {
++        av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
++        ret = AVERROR(ENODEV);
++        goto fail0;
++    }
++    av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
++           decdev_media_path(decdev), decdev_video_path(decdev));
++
++    if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
++        av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n");
++        src_memtype = MEDIABUFS_MEMORY_MMAP;
++        dst_memtype = MEDIABUFS_MEMORY_MMAP;
++    }
++    else {
++        av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n");
++        src_memtype = MEDIABUFS_MEMORY_DMABUF;
++        dst_memtype = MEDIABUFS_MEMORY_DMABUF;
++    }
++
++    if ((ctx->pq = pollqueue_new()) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
++        goto fail1;
++    }
++
++    if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
++        goto fail2;
++    }
++
++    if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
++        av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
++        goto fail3;
++    }
++
++    // Ask for an initial bitbuf size of max size / 4
++    // We will realloc if we need more
++    // Must use sps->h/w as avctx contains cropped size
++retry_src_memtype:
++    src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
++    if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs))
++        src_size /= 4;
++    // Kludge for conformance tests which break Annex A limits
++    else if (src_size < 0x40000)
++        src_size = 0x40000;
++
++    if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
++                              sps->width, sps->height, src_size)) {
++        char tbuf1[5];
++        av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
++        goto fail4;
++    }
++
++    if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) {
++        if (src_memtype == MEDIABUFS_MEMORY_DMABUF) {
++            src_memtype = MEDIABUFS_MEMORY_MMAP;
++            goto retry_src_memtype;
++        }
++        av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n");
++        goto fail4;
++    }
++
++    if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
++        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
++        ctx->fns = &V2(ff_v4l2_req_hevc, 4);
++    }
++#if CONFIG_V4L2_REQ_HEVC_VX
++    else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
++        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
++        ctx->fns = &V2(ff_v4l2_req_hevc, 3);
++    }
++    else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
++        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
++        ctx->fns = &V2(ff_v4l2_req_hevc, 2);
++    }
++    else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
++        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
++        ctx->fns = &V2(ff_v4l2_req_hevc, 1);
++    }
++#endif
++    else {
++        av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
++        ret = AVERROR(EINVAL);
++        goto fail4;
++    }
++
++    if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
++        char tbuf1[5];
++        av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
++        goto fail4;
++    }
++
++    if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
++        goto fail4;
++    }
++
++    {
++        unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
++            avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
++        av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
++               sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
++               avctx->thread_count, avctx->extra_hw_frames);
++
++        if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) {
++            if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) {
++                av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n");
++                goto fail4;
++            }
++            av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n");
++            dst_memtype = MEDIABUFS_MEMORY_MMAP;
++        }
++
++        // extra_hw_frames is -1 if unset
++        if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) {
++            av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
++            goto fail4;
++        }
++    }
++
++    if (mediabufs_stream_on(ctx->mbufs)) {
++        av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
++        goto fail4;
++    }
++
++    if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
++        goto fail4;
++    }
++
++    if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
++        goto fail5;
++    }
++
++    decode_q_init(&ctx->decode_q);
++
++    // Set our s/w format
++    avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
++
++    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n",
++           ctx->fns->name,
++           decdev_media_path(decdev), decdev_video_path(decdev),
++           mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype),
++           av_get_pix_fmt_name(avctx->sw_pix_fmt));
++
++    return 0;
++
++fail5:
++    av_buffer_unref(&avctx->hw_frames_ctx);
++fail4:
++    mediabufs_ctl_unref(&ctx->mbufs);
++fail3:
++    media_pool_delete(&ctx->mpool);
++fail2:
++    pollqueue_unref(&ctx->pq);
++fail1:
++    dmabufs_ctl_unref(&ctx->dbufs);
++fail0:
++    devscan_delete(&ctx->devscan);
++    return ret;
++}
++
++const AVHWAccel ff_hevc_v4l2request_hwaccel = {
++    .name           = "hevc_v4l2request",
++    .type           = AVMEDIA_TYPE_VIDEO,
++    .id             = AV_CODEC_ID_HEVC,
++    .pix_fmt        = AV_PIX_FMT_DRM_PRIME,
++    .alloc_frame    = v4l2_req_hevc_alloc_frame,
++    .start_frame    = v4l2_req_hevc_start_frame,
++    .decode_slice   = v4l2_req_hevc_decode_slice,
++    .end_frame      = v4l2_req_hevc_end_frame,
++    .abort_frame    = v4l2_req_hevc_abort_frame,
++    .init           = v4l2_request_hevc_init,
++    .uninit         = v4l2_request_hevc_uninit,
++    .priv_data_size = sizeof(V4L2RequestContextHEVC),
++    .frame_params   = v4l2_req_hevc_frame_params,
++    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
++};
+diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
+new file mode 100644
+index 0000000000..99c90064ea
+--- /dev/null
++++ b/libavcodec/v4l2_request_hevc.h
+@@ -0,0 +1,102 @@
++#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
++#define AVCODEC_V4L2_REQUEST_HEVC_H
++
++#include <stdint.h>
++#include <drm_fourcc.h>
++#include "v4l2_req_decode_q.h"
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++// P030 should be defined in drm_fourcc.h and hopefully will be sometime
++// in the future but until then...
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
++#endif
++
++#ifndef DRM_FORMAT_NV15
++#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
++#endif
++
++#ifndef DRM_FORMAT_NV20
++#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
++#endif
++
++#include <linux/videodev2.h>
++#ifndef V4L2_CID_CODEC_BASE
++#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in videodev2.h and hopefully will be sometime in the future but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
++#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
++#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY	0x0800
++#endif
++
++#define VCAT(name, version) name##_v##version
++#define V2(n,v) VCAT(n, v)
++#define V(n) V2(n, HEVC_CTRLS_VERSION)
++
++#define S2(x) #x
++#define STR(x) S2(x)
++
++// 1 per decoder
++struct v4l2_req_decode_fns;
++
++typedef struct V4L2RequestContextHEVC {
++//    V4L2RequestContext base;
++    const struct v4l2_req_decode_fns * fns;
++
++    unsigned int timestamp;  // ?? maybe uint64_t
++
++    int decode_mode;
++    int start_code;
++    unsigned int max_slices;    // 0 => not wanted (frame mode)
++    unsigned int max_offsets;   // 0 => not wanted
++
++    req_decode_q decode_q;
++
++    struct devscan *devscan;
++    struct dmabufs_ctl *dbufs;
++    struct pollqueue *pq;
++    struct media_pool * mpool;
++    struct mediabufs_ctl *mbufs;
++} V4L2RequestContextHEVC;
++
++typedef struct v4l2_req_decode_fns {
++    int src_pix_fmt_v4l2;
++    const char * name;
++
++    // Init setup
++    int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
++    int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
++
++    // Passthrough of hwaccel fns
++    int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++    int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
++    int (*end_frame)(AVCodecContext *avctx);
++    void (*abort_frame)(AVCodecContext *avctx);
++    int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
++    int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
++} v4l2_req_decode_fns;
++
++
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
++extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
++
++#endif
+diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
+index d4ceb60791..fb7f839c5e 100644
+--- a/libavcodec/vc1dec.c
++++ b/libavcodec/vc1dec.c
+@@ -486,7 +486,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
+             size = next - start - 4;
+             if (size <= 0)
+                 continue;
+-            buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
++            buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+             init_get_bits(&gb, buf2, buf2_size * 8);
+             switch (AV_RB32(start)) {
+             case VC1_CODE_SEQHDR:
+@@ -678,7 +678,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
+                 case VC1_CODE_FRAME:
+                     if (avctx->hwaccel)
+                         buf_start = start;
+-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+                     break;
+                 case VC1_CODE_FIELD: {
+                     int buf_size3;
+@@ -695,8 +695,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
+                         ret = AVERROR(ENOMEM);
+                         goto err;
+                     }
+-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
+-                                                    slices[n_slices].buf);
++                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++                                                              slices[n_slices].buf);
+                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+                                   buf_size3 << 3);
+                     slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
+@@ -707,7 +707,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
+                     break;
+                 }
+                 case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
+-                    buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
++                    buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
+                     init_get_bits(&s->gb, buf2, buf_size2 * 8);
+                     ff_vc1_decode_entry_point(avctx, v, &s->gb);
+                     break;
+@@ -724,8 +724,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
+                         ret = AVERROR(ENOMEM);
+                         goto err;
+                     }
+-                    buf_size3 = vc1_unescape_buffer(start + 4, size,
+-                                                    slices[n_slices].buf);
++                    buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
++                                                              slices[n_slices].buf);
+                     init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+                                   buf_size3 << 3);
+                     slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
+@@ -759,7 +759,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
+                     ret = AVERROR(ENOMEM);
+                     goto err;
+                 }
+-                buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
++                buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
+                 init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
+                               buf_size3 << 3);
+                 slices[n_slices].mby_start = s->mb_height + 1 >> 1;
+@@ -768,9 +768,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
+                 n_slices1 = n_slices - 1;
+                 n_slices++;
+             }
+-            buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
++            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
+         } else {
+-            buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
++            buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
+         }
+         init_get_bits(&s->gb, buf2, buf_size2*8);
+     } else
+diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
+index c25a6f3adf..10182786b3 100644
+--- a/libavcodec/vc1dsp.c
++++ b/libavcodec/vc1dsp.c
+@@ -32,6 +32,7 @@
+ #include "rnd_avg.h"
+ #include "vc1dsp.h"
+ #include "startcode.h"
++#include "vc1_common.h"
+ 
+ /* Apply overlap transform to horizontal edge */
+ static void vc1_v_overlap_c(uint8_t *src, int stride)
+@@ -1028,6 +1029,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
+ #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
+ 
+     dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
++    dsp->vc1_unescape_buffer      = vc1_unescape_buffer;
+ 
+     if (ARCH_AARCH64)
+         ff_vc1dsp_init_aarch64(dsp);
+diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
+index 75db62b1b4..e192b431be 100644
+--- a/libavcodec/vc1dsp.h
++++ b/libavcodec/vc1dsp.h
+@@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
+      * one or more further zero bytes and a one byte.
+      */
+     int (*startcode_find_candidate)(const uint8_t *buf, int size);
++
++    /* Copy a buffer, removing startcode emulation escape bytes as we go */
++    int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
+ } VC1DSPContext;
+ 
+ void ff_vc1dsp_init(VC1DSPContext* c);
+diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c
+new file mode 100644
+index 0000000000..5a79e89ed7
+--- /dev/null
++++ b/libavcodec/weak_link.c
+@@ -0,0 +1,103 @@
++#include <stdlib.h>
++#include <pthread.h>
++#include <stdatomic.h>
++#include "weak_link.h"
++
++struct ff_weak_link_master {
++    atomic_int ref_count;    /* 0 is single ref for easier atomics */
++    pthread_rwlock_t lock;
++    void * ptr;
++};
++
++static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
++{
++    return (struct ff_weak_link_master *)c;
++}
++
++struct ff_weak_link_master * ff_weak_link_new(void * p)
++{
++    struct ff_weak_link_master * w = malloc(sizeof(*w));
++    if (!w)
++        return NULL;
++    atomic_init(&w->ref_count, 0);
++    w->ptr = p;
++    if (pthread_rwlock_init(&w->lock, NULL)) {
++        free(w);
++        return NULL;
++    }
++    return w;
++}
++
++static void weak_link_do_unref(struct ff_weak_link_master * const w)
++{
++    int n = atomic_fetch_sub(&w->ref_count, 1);
++    if (n)
++        return;
++
++    pthread_rwlock_destroy(&w->lock);
++    free(w);
++}
++
++// Unref & break link
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
++{
++    struct ff_weak_link_master * const w = *ppLink;
++    if (!w)
++        return;
++
++    *ppLink = NULL;
++    pthread_rwlock_wrlock(&w->lock);
++    w->ptr = NULL;
++    pthread_rwlock_unlock(&w->lock);
++
++    weak_link_do_unref(w);
++}
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
++{
++    if (!w)
++        return NULL;
++    atomic_fetch_add(&w->ref_count, 1);
++    return (struct ff_weak_link_client*)w;
++}
++
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
++{
++    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++    if (!w)
++        return;
++
++    *ppLink = NULL;
++    weak_link_do_unref(w);
++}
++
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
++{
++    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
++
++    if (!w)
++        return NULL;
++
++    if (pthread_rwlock_rdlock(&w->lock))
++        goto broken;
++
++    if (w->ptr)
++        return w->ptr;
++
++    pthread_rwlock_unlock(&w->lock);
++
++broken:
++    *ppLink = NULL;
++    weak_link_do_unref(w);
++    return NULL;
++}
++
++// Ignores a NULL c (so can be on the return path of both broken & live links)
++void ff_weak_link_unlock(struct ff_weak_link_client * c)
++{
++    struct ff_weak_link_master * const w = weak_link_x(c);
++    if (w)
++        pthread_rwlock_unlock(&w->lock);
++}
++
++
+diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h
+new file mode 100644
+index 0000000000..415b6a27a0
+--- /dev/null
++++ b/libavcodec/weak_link.h
+@@ -0,0 +1,23 @@
++struct ff_weak_link_master;
++struct ff_weak_link_client;
++
++struct ff_weak_link_master * ff_weak_link_new(void * p);
++void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
++
++struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
++void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
++
++// Returns NULL if link broken - in this case it will also zap
++//   *ppLink and unref the weak_link.
++// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
++//
++// The above does mean that there is a race if this is called simultaneously
++// by two threads using the same weak_link_client (so don't do that)
++void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
++void ff_weak_link_unlock(struct ff_weak_link_client * c);
++
++
++
++
++
++
+diff --git a/libavdevice/Makefile b/libavdevice/Makefile
+index 0dfe47a1f4..ec7c7b4147 100644
+--- a/libavdevice/Makefile
++++ b/libavdevice/Makefile
+@@ -47,6 +47,9 @@ OBJS-$(CONFIG_SNDIO_OUTDEV)              += sndio_enc.o sndio.o
+ OBJS-$(CONFIG_V4L2_INDEV)                += v4l2.o v4l2-common.o timefilter.o
+ OBJS-$(CONFIG_V4L2_OUTDEV)               += v4l2enc.o v4l2-common.o
+ OBJS-$(CONFIG_VFWCAP_INDEV)              += vfwcap.o
++OBJS-$(CONFIG_VOUT_DRM_OUTDEV)           += drm_vout.o
++OBJS-$(CONFIG_VOUT_EGL_OUTDEV)           += egl_vout.o
++OBJS-$(CONFIG_VOUT_RPI_OUTDEV)           += rpi_vout.o
+ OBJS-$(CONFIG_XCBGRAB_INDEV)             += xcbgrab.o
+ OBJS-$(CONFIG_XV_OUTDEV)                 += xv.o
+ 
+diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
+index 92b27a1d14..19d2a9de55 100644
+--- a/libavdevice/alldevices.c
++++ b/libavdevice/alldevices.c
+@@ -53,6 +53,9 @@ extern AVOutputFormat ff_sndio_muxer;
+ extern AVInputFormat  ff_v4l2_demuxer;
+ extern AVOutputFormat ff_v4l2_muxer;
+ extern AVInputFormat  ff_vfwcap_demuxer;
++extern AVOutputFormat ff_vout_drm_muxer;
++extern AVOutputFormat ff_vout_egl_muxer;
++extern AVOutputFormat ff_vout_rpi_muxer;
+ extern AVInputFormat  ff_xcbgrab_demuxer;
+ extern AVOutputFormat ff_xv_muxer;
+ 
+diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
+new file mode 100644
+index 0000000000..c7b90e6dd8
+--- /dev/null
++++ b/libavdevice/drm_vout.c
+@@ -0,0 +1,680 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++//     limited to testing.
++
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <unistd.h>
++
++#include <xf86drm.h>
++#include <xf86drmMode.h>
++#include <drm_fourcc.h>
++
++#define TRACE_ALL 0
++
++#define DRM_MODULE "vc4"
++
++#define ERRSTR strerror(errno)
++
++struct drm_setup {
++   int conId;                // connector id; 0 = not chosen yet, find_crtc() then picks one
++   uint32_t crtcId;          // CRTC id taken from the chosen connector's encoder
++   int crtcIdx;              // index of crtcId in the DRM resources' crtc list (-1 while unresolved)
++   uint32_t planeId;         // plane picked by find_plane() for out_fourcc
++   unsigned int out_fourcc;  // format planeId was last looked up for; a change triggers a new lookup
++   struct {
++       int x, y, width, height;
++   } compose;                // destination rectangle given to drmModeSetPlane(), copied from the CRTC
++};
++
++typedef struct drm_aux_s {
++    unsigned int fb_handle;                  // framebuffer id from drmModeAddFB2WithModifiers(); 0 = none
++    uint32_t bo_handles[AV_DRM_MAX_PLANES];  // GEM handles from drmPrimeFDToHandle(); 0 = unused entry
++    AVFrame * frame;                         // frame kept alive while this slot is in use
++} drm_aux_t;
++
++// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
++// we get initial flicker probably due to dodgy drm timing
++#define AUX_SIZE 3
++typedef struct drm_display_env_s
++{
++    AVClass *class;             // set up through the muxer's .priv_class (drm_vout_class)
++
++    int drm_fd;                 // fd returned by drmOpen(); negative when no device is open
++    uint32_t con_id;            // connector id reported back by find_crtc()
++    struct drm_setup setup;     // connector/CRTC/plane selection
++    enum AVPixelFormat avfmt;   // not referenced by the code in this file section
++
++    int show_all;               // option: 0 = drop a frame if the display thread is still busy
++    const char * drm_module;    // option: module name passed to drmOpen()
++
++    unsigned int ano;           // index into aux[] of the slot the next frame will use
++    drm_aux_t aux[AUX_SIZE];    // ring of per-frame DRM resources
++
++    pthread_t q_thread;         // runs display_thread()
++    sem_t q_sem_in;             // posted when q_next has been filled or q_terminate set
++    sem_t q_sem_out;            // posted by the display thread when it can accept a frame
++    int q_terminate;            // set by drm_vout_deinit() to make the thread exit
++    AVFrame * q_next;           // frame handed from write_packet to the display thread
++
++} drm_display_env_t;
++
++
++static int drm_vout_write_trailer(AVFormatContext *s)  // muxer write_trailer hook
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++    return 0;  // nothing to do here apart from the optional trace above
++}
++
++static int drm_vout_write_header(AVFormatContext *s)
++{
++    const AVCodecParameters * const codecpar = s->streams[0]->codecpar;
++    // Accept a single video stream carrying wrapped AVFrames
++    const int ok = s->nb_streams <= 1 &&
++        codecpar->codec_type == AVMEDIA_TYPE_VIDEO &&
++        codecpar->codec_id   == AV_CODEC_ID_WRAPPED_AVFRAME;
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++    if (!ok) {
++        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++        return AVERROR(EINVAL);
++    }
++    return 0;
++}
++
++// Find a plane that can be used on CRTC index crtcidx and supports format,
++// and store its id in *pplane_id (then try to set its "zpos" property).
++// Returns 0 on success, -1 if no plane matched or a DRM query failed.
++static int find_plane(struct AVFormatContext * const avctx,
++                      const int drmfd, const int crtcidx, const uint32_t format,
++                      uint32_t * const pplane_id)
++{
++   drmModePlaneResPtr planes;
++   drmModePlanePtr plane;
++   drmModeObjectPropertiesPtr props = NULL;
++   drmModePropertyPtr prop = NULL;
++   unsigned int i, j;
++   int ret = -1;
++
++   planes = drmModeGetPlaneResources(drmfd);
++   if (!planes) {
++       av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
++       return -1;
++   }
++
++   for (i = 0; i < planes->count_planes; ++i) {
++      plane = drmModeGetPlane(drmfd, planes->planes[i]);
++      if (!plane) {
++          // This used to test 'planes', so a failed lookup was dereferenced below
++          av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
++          goto fail;
++      }
++
++      if (!(plane->possible_crtcs & (1 << crtcidx))) {
++         drmModeFreePlane(plane);
++         continue;
++      }
++
++      for (j = 0; j < plane->count_formats; ++j) {
++         if (plane->formats[j] == format)
++            break;
++      }
++
++      if (j == plane->count_formats) {
++         drmModeFreePlane(plane);
++         continue;
++      }
++
++      *pplane_id = plane->plane_id;
++      drmModeFreePlane(plane);
++      break;
++   }
++
++   if (i == planes->count_planes)  // loop ran to the end: nothing suitable
++       goto fail;
++
++    // Set the plane's "zpos" to values[1] of that property; a failed set is only logged
++    props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE);
++    if (!props)
++        goto fail;
++    for (i = 0; i != props->count_props; ++i) {
++        if (prop)
++            drmModeFreeProperty(prop);
++        prop = drmModeGetProperty(drmfd, props->props[i]);
++        if (!prop)
++            goto fail;
++        if (strcmp("zpos", prop->name) == 0) {
++            if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0)
++                av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]);
++            else
++                av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n");
++            break;
++        }
++    }
++
++    ret = 0;
++fail:
++    if (props)
++        drmModeFreeObjectProperties(props);
++    if (prop)
++        drmModeFreeProperty(prop);
++    drmModeFreePlaneResources(planes);
++    return ret;
++}
++
++// Release everything one aux slot holds: framebuffer, GEM handles and frame
++static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
++{
++    if (da->fb_handle) {
++        drmModeRmFB(de->drm_fd, da->fb_handle);
++        da->fb_handle = 0;
++    }
++    for (uint32_t * h = da->bo_handles; h != da->bo_handles + AV_DRM_MAX_PLANES; ++h) {
++        struct drm_gem_close req = {.handle = *h};
++        if (*h == 0)
++            continue;
++        drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &req);
++        *h = 0;
++    }
++    av_frame_free(&da->frame);
++}
++
++static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)  // takes ownership of frame
++{
++    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
++    drm_aux_t * da = de->aux + de->ano;  // aux slot used for this frame
++    const uint32_t format = desc->layers[0].format;
++    int ret = 0;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
++#endif
++
++    if (de->setup.out_fourcc != format) {  // format differs from the one the current plane was chosen for
++        if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
++            av_frame_free(&frame);
++            av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
++            return -1;
++        }
++        de->setup.out_fourcc = format;
++    }
++
++    {
++        drmVBlank vbl = {
++            .request = {
++                .type = DRM_VBLANK_RELATIVE,
++                .sequence = 0
++            }
++        };
++
++        while (drmWaitVBlank(de->drm_fd, &vbl)) {  // retried only when interrupted; other errors are ignored
++            if (errno != EINTR) {
++//                av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
++                break;
++            }
++        }
++    }
++
++    da_uninit(de, da);  // release the FB, GEM handles and frame this slot held before
++
++    {
++        uint32_t pitches[4] = {0};
++        uint32_t offsets[4] = {0};
++        uint64_t modifiers[4] = {0};
++        uint32_t bo_handles[4] = {0};
++        int has_mods = 0;
++        int i, j, n;
++
++        da->frame = frame;  // the slot now owns the frame; a later da_uninit() frees it
++
++        for (i = 0; i < desc->nb_objects; ++i) {
++            if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
++                av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
++                return -1;
++            }
++            if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR &&
++                desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID)
++                has_mods = 1;
++        }
++
++        n = 0;  // running plane index across all layers
++        for (i = 0; i < desc->nb_layers; ++i) {
++            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
++                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
++                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
++                pitches[n] = p->pitch;
++                offsets[n] = p->offset;
++                modifiers[n] = obj->format_modifier;
++                bo_handles[n] = da->bo_handles[p->object_index];
++                ++n;  // NOTE(review): n is not bounded against the 4-entry arrays above
++            }
++        }
++
++#if 1 && TRACE_ALL
++        av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
++               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
++               av_frame_cropped_width(frame),
++               av_frame_cropped_height(frame),
++               desc->layers[0].format,
++               bo_handles[0],
++               bo_handles[1],
++               bo_handles[2],
++               bo_handles[3],
++               pitches[0],
++               pitches[1],
++               pitches[2],
++               pitches[3],
++               offsets[0],
++               offsets[1],
++               offsets[2],
++               offsets[3],
++               (long long)modifiers[0],
++               (long long)modifiers[1],
++               (long long)modifiers[2],
++               (long long)modifiers[3]
++               );
++#endif
++
++        if (drmModeAddFB2WithModifiers(de->drm_fd,
++                                       av_frame_cropped_width(frame),
++                                       av_frame_cropped_height(frame),
++                                       desc->layers[0].format, bo_handles,
++                                       pitches, offsets,
++                                       has_mods ? modifiers : NULL,  // only passed when some object has a non-linear modifier
++                                       &da->fb_handle,
++                                       has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) {
++            av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
++            return -1;
++        }
++    }
++
++    ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
++                              da->fb_handle, 0,
++                de->setup.compose.x, de->setup.compose.y,
++                de->setup.compose.width,
++                de->setup.compose.height,
++                0, 0,
++                av_frame_cropped_width(frame) << 16,
++                av_frame_cropped_height(frame) << 16);
++
++    if (ret != 0) {
++        av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
++    }
++
++    de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;  // advance the slot index, wrapping at AUX_SIZE
++
++    return ret;
++}
++
++static int do_sem_wait(sem_t * const sem, const int nowait)
++{
++    int (* const waitfn)(sem_t *) = nowait ? sem_trywait : sem_wait;
++    while (waitfn(sem) != 0)  // retry while interrupted; other failures return -errno
++        if (errno != EINTR)
++            return -errno;
++    return 0;
++}
++
++static void * display_thread(void * v)  // thread body started by drm_vout_init(); v is the AVFormatContext
++{
++    AVFormatContext * const s = v;
++    drm_display_env_t * const de = s->priv_data;
++    int i;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++#endif
++
++    sem_post(&de->q_sem_out);  // tell the writer that a first frame can be queued
++
++    for (;;) {
++        AVFrame * frame;
++
++        do_sem_wait(&de->q_sem_in, 0);  // block until a frame is queued or termination is requested
++
++        if (de->q_terminate)  // set by drm_vout_deinit()
++            break;
++
++        frame = de->q_next;  // take the queued frame ...
++        de->q_next = NULL;
++        sem_post(&de->q_sem_out);  // ... and let the writer queue the next one while this one is shown
++
++        do_display(s, de, frame);
++    }
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++#endif
++
++    for (i = 0; i != AUX_SIZE; ++i)
++        da_uninit(de, de->aux + i);
++
++    av_frame_free(&de->q_next);  // drop a frame that was queued but never displayed
++
++    return NULL;
++}
++
++// Queue one wrapped AVFrame for display. Corrupt frames, and frames that arrive
++// while the display thread is still busy (when show_all is 0), are dropped.
++static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    const AVFrame * const src_frame = (AVFrame *)pkt->data;
++    AVFrame * frame;
++    drm_display_env_t * const de = s->priv_data;
++    int ret;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
++#endif
++
++    if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
++        av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
++        return 0;
++    }
++    if (src_frame->format != AV_PIX_FMT_DRM_PRIME && src_frame->format != AV_PIX_FMT_VAAPI) {
++        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++        return AVERROR(EINVAL);
++    }
++
++    // The allocation used to go unchecked and was dereferenced straight away
++    if ((frame = av_frame_alloc()) == NULL)
++        return AVERROR(ENOMEM);
++    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++        ret = av_frame_ref(frame, src_frame);
++    } else {
++        frame->format = AV_PIX_FMT_DRM_PRIME;
++        ret = av_hwframe_map(frame, src_frame, 0) != 0 ? AVERROR(EINVAL) : 0;
++    }
++    if (ret != 0) {
++        av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++        av_frame_free(&frame);
++        return ret;
++    }
++
++    // Hand the frame to the display thread, or drop it if it cannot be queued
++    if (do_sem_wait(&de->q_sem_out, !de->show_all) != 0) {
++        av_frame_free(&frame);
++    } else {
++        de->q_next = frame;
++        sem_post(&de->q_sem_in);
++    }
++    return 0;
++}
++
++static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                          unsigned flags)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    /* drm_vout_write_header() should have accepted only supported formats */
++    if ((flags & AV_WRITE_UNCODED_FRAME_QUERY))
++        return 0;
++
++    return 0;  // NOTE(review): non-query calls also return 0 without using *ppframe
++}
++
++// Control message hook: a window repaint request is accepted as a no-op,
++// every other message type is reported as unsupported (ENOSYS).
++static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++    const int handled = (type == AV_APP_TO_DEV_WINDOW_REPAINT);
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
++#endif
++    if (handled)
++        return 0;
++    return AVERROR(ENOSYS);
++}
++
++// Pick the connector and CRTC to display on and record them in *s. If s->conId
++// is 0 the first connector that currently has a CRTC is chosen. On success fills
++// s->crtcIdx and s->compose, sets *pConId (if non-NULL) and returns 0; else -1.
++static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
++{
++   int ret = -1;
++   int i;
++   drmModeRes *res = drmModeGetResources(drmfd);
++   drmModeConnector *c;
++   drmModeCrtc *crtc;
++
++   if (!res) {
++      av_log(avctx, AV_LOG_WARNING, "drmModeGetResources failed: %s\n", ERRSTR);
++      return -1;
++   }
++
++   if (res->count_crtcs <= 0) {
++      av_log(avctx, AV_LOG_WARNING, "drm: no crts\n");
++      goto fail_res;
++   }
++
++   if (!s->conId) {
++      av_log(avctx, AV_LOG_DEBUG, "No connector ID specified.  Choosing default from list:\n");
++
++      for (i = 0; i < res->count_connectors; i++) {
++         drmModeConnector *con = drmModeGetConnector(drmfd, res->connectors[i]);
++         drmModeEncoder *enc = NULL;
++         if (!con)  // lookup failed: skip it rather than dereference NULL
++            continue;
++         crtc = NULL;
++         if (con->encoder_id) {
++            enc = drmModeGetEncoder(drmfd, con->encoder_id);
++            if (enc && enc->crtc_id)
++               crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
++         }
++
++         if (!s->conId && crtc) {
++            s->conId = con->connector_id;
++            s->crtcId = crtc->crtc_id;
++         }
++
++         av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
++                con->connector_id,
++                crtc ? crtc->crtc_id : 0,
++                con->connector_type,
++                crtc ? crtc->width : 0,
++                crtc ? crtc->height : 0,
++                (s->conId == (int)con->connector_id ?
++            " (chosen)" : ""));
++         // Release the per-connector objects, which used to be leaked
++         if (crtc) drmModeFreeCrtc(crtc);
++         if (enc) drmModeFreeEncoder(enc);
++         drmModeFreeConnector(con);
++      }
++
++      if (!s->conId) {
++         av_log(avctx, AV_LOG_ERROR, "No suitable enabled connector found.\n");
++         goto fail_res;  // was a plain return, which leaked res
++      }
++   }
++
++   s->crtcIdx = -1;
++
++   for (i = 0; i < res->count_crtcs; ++i) {
++      if (s->crtcId == res->crtcs[i]) {
++         s->crtcIdx = i;
++         break;
++      }
++   }
++
++   if (s->crtcIdx == -1) {
++       av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
++       goto fail_res;
++   }
++
++   if (res->count_connectors <= 0) {
++       av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
++       goto fail_res;
++   }
++
++   c = drmModeGetConnector(drmfd, s->conId);
++   if (!c) {
++       av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
++       goto fail_res;
++   }
++
++   if (!c->count_modes) {
++       av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
++       goto fail_conn;
++   }
++
++   crtc = drmModeGetCrtc(drmfd, s->crtcId);
++   if (!crtc) {
++       av_log(avctx, AV_LOG_WARNING, "drmModeGetCrtc failed: %s\n", ERRSTR);
++       goto fail_conn;
++   }
++   s->compose.x = crtc->x;
++   s->compose.y = crtc->y;
++   s->compose.width = crtc->width;
++   s->compose.height = crtc->height;
++   drmModeFreeCrtc(crtc);
++
++   if (pConId)
++      *pConId = c->connector_id;
++   ret = 0;
++
++fail_conn:
++   drmModeFreeConnector(c);
++fail_res:
++   drmModeFreeResources(res);
++
++   return ret;
++}
++
++// Muxer init hook: open the DRM device, pick a connector/CRTC and start the display thread.
++// deinit is called if init fails so no need to clean up explicitly here
++static int drm_vout_init(struct AVFormatContext * s)
++{
++    drm_display_env_t * const de = s->priv_data;
++    int rv;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    de->drm_fd = -1;
++    de->con_id = 0;
++    de->setup = (struct drm_setup){0};
++    de->q_terminate = 0;
++
++    if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0) {
++        rv = AVERROR(errno);
++        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv));
++        return rv;
++    }
++
++    if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0) {
++        av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
++        rv = AVERROR(EINVAL);
++        goto fail_close;
++    }
++
++    sem_init(&de->q_sem_in, 0, 0);
++    sem_init(&de->q_sem_out, 0, 0);
++    // pthread_create() reports failure through its return value, not errno
++    if ((rv = pthread_create(&de->q_thread, NULL, display_thread, s)) != 0) {
++        rv = AVERROR(rv);
++        av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv));
++        goto fail_close;
++    }
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++
++    return 0;
++
++fail_close:
++    close(de->drm_fd);
++    de->drm_fd = -1;
++    av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
++
++    return rv;
++}
++
++// Muxer deinit hook: stop the display thread and release all DRM resources.
++static void drm_vout_deinit(struct AVFormatContext * s)
++{
++    drm_display_env_t * const de = s->priv_data;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    // drm_vout_init() leaves drm_fd negative on every failure path; in that case
++    // there is no display thread to stop or join.
++    if (de->drm_fd < 0)
++        return;
++    de->q_terminate = 1;
++    sem_post(&de->q_sem_in);
++    pthread_join(de->q_thread, NULL);
++    sem_destroy(&de->q_sem_in);
++    sem_destroy(&de->q_sem_out);
++
++    for (unsigned int i = 0; i != AUX_SIZE; ++i)
++        da_uninit(de, de->aux + i);
++    av_frame_free(&de->q_next);
++    close(de->drm_fd);
++    de->drm_fd = -1;
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++
++#define OFFSET(x) offsetof(drm_display_env_t, x)
++static const AVOption options[] = {  // options are stored directly in drm_display_env_t fields
++    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++    { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++    { NULL }  // table terminator
++};
++
++static const AVClass drm_vout_class = {  // referenced by ff_vout_drm_muxer.priv_class
++    .class_name = "drm vid outdev",
++    .item_name  = av_default_item_name,
++    .option     = options,  // the show_all / drm_module table
++    .version    = LIBAVUTIL_VERSION_INT,
++    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++AVOutputFormat ff_vout_drm_muxer = {  // output device "vout_drm"
++    .name           = "vout_drm",
++    .long_name      = NULL_IF_CONFIG_SMALL("Drm video output device"),
++    .priv_data_size = sizeof(drm_display_env_t),
++    .audio_codec    = AV_CODEC_ID_NONE,
++    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,  // matches the check in drm_vout_write_header()
++    .write_header   = drm_vout_write_header,
++    .write_packet   = drm_vout_write_packet,
++    .write_uncoded_frame = drm_vout_write_frame,
++    .write_trailer  = drm_vout_write_trailer,
++    .control_message = drm_vout_control_message,
++    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++    .priv_class     = &drm_vout_class,
++    .init           = drm_vout_init,    // opens the DRM device and starts the display thread
++    .deinit         = drm_vout_deinit,  // stops the thread and closes the device
++};
++
+diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
+new file mode 100644
+index 0000000000..cc6e310551
+--- /dev/null
++++ b/libavdevice/egl_vout.c
+@@ -0,0 +1,788 @@
++/*
++ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++// *** This module is a work in progress and its utility is strictly
++//     limited to testing.
++//     Amongst other issues it doesn't wait for the pic to be displayed before
++//     returning the buffer so flickering does occur.
++
++#include <epoxy/gl.h>
++#include <epoxy/egl.h>
++
++#include "libavutil/opt.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/imgutils.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include "pthread.h"
++#include <semaphore.h>
++#include <stdatomic.h>
++#include <unistd.h>
++
++#include <X11/Xlib.h>
++#include <X11/Xutil.h>
++
++#include "libavutil/rpi_sand_fns.h"
++
++#define TRACE_ALL 0
++
++struct egl_setup {
++    int conId;           // NOTE(review): conId/crtcId/crtcIdx/planeId/compose share names with drm_setup; no use visible here
++
++    Display *dpy;        // X11 display connection
++    EGLDisplay egl_dpy;  // EGL display
++    EGLContext ctx;      // presumably the context returned by make_window() - confirm at the call site
++    EGLSurface surf;     // presumably the surface returned by make_window() - confirm at the call site
++    Window win;          // presumably the window returned by make_window() - confirm at the call site
++
++    uint32_t crtcId;
++    int crtcIdx;
++    uint32_t planeId;
++    struct {
++        int x, y, width, height;
++    } compose;
++};
++
++typedef struct egl_aux_s {
++    int fd;          // DRM object fd this slot is bound to; do_display() treats -1 as a free slot
++    GLuint texture;  // GL texture for that fd; 0 means not set up yet (checked in do_display())
++
++} egl_aux_t;
++
++typedef struct egl_display_env_s {
++    AVClass *class;
++
++    struct egl_setup setup;
++    enum AVPixelFormat avfmt;
++
++    int show_all;
++    int window_width, window_height;  // 0 makes make_window() fall back to 1280 / 720
++    int window_x, window_y;           // window position used by make_window()
++    int fullscreen;                   // non-zero: window covers the whole screen and loses its border
++
++    egl_aux_t aux[32];                // per-fd slots searched linearly by do_display()
++
++    pthread_t q_thread;
++    pthread_mutex_t q_lock;
++    sem_t display_start_sem;
++    sem_t q_sem;
++    int q_terminate;
++    AVFrame *q_this;
++    AVFrame *q_next;
++
++} egl_display_env_t;
++
++
++/**
++ * Remove window border/decorations.
++ */
++static void
++no_border(Display *dpy, Window w)
++{
++    static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
++    static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
++
++    typedef struct {
++        unsigned long       flags;
++        unsigned long       functions;
++        unsigned long       decorations;
++        long                inputMode;
++        unsigned long       status;
++    } PropMotifWmHints;
++
++    /* zeroed: functions, inputMode and status used to be sent uninitialised */
++    PropMotifWmHints motif_hints = {0};
++    Atom prop, proptype;
++
++    /* setup the property: decorations field is valid and has no bits set */
++    motif_hints.flags = MWM_HINTS_DECORATIONS;
++    motif_hints.decorations = 0;
++
++    /* get the atom for the property */
++    prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True);
++    if (!prop) {
++        /* something went wrong! */
++        return;
++    }
++
++    /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
++    proptype = prop;
++
++    XChangeProperty(dpy, w,                         /* display, window */
++                    prop, proptype,                 /* property, type */
++                    32,                             /* format: 32-bit datums */
++                    PropModeReplace,                /* mode */
++                    (unsigned char *)&motif_hints, /* data */
++                    PROP_MOTIF_WM_HINTS_ELEMENTS    /* nelements */
++                   );
++}
++
++
++/*
++ * Create an X window plus a matching EGL ES2 context and surface, make them
++ * current and return the handles. Returns 0 on success, -1 on failure.
++ */
++static int
++make_window(struct AVFormatContext *const s,
++            egl_display_env_t *const de,
++            Display *dpy, EGLDisplay egl_dpy, const char *name,
++            Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
++{
++    int scrnum = DefaultScreen(dpy);
++    XSetWindowAttributes attr;
++    unsigned long mask;
++    Window root = RootWindow(dpy, scrnum);
++    Window win;
++    EGLContext ctx;
++    const int fullscreen = de->fullscreen;
++    EGLConfig config;
++    int x = de->window_x;
++    int y = de->window_y;
++    int width = de->window_width ? de->window_width : 1280;
++    int height = de->window_height ? de->window_height : 720;
++
++    if (fullscreen) {
++        x = 0; y = 0;
++        width = DisplayWidth(dpy, scrnum);
++        height = DisplayHeight(dpy, scrnum);
++    }
++
++    {
++        EGLint num_configs;
++        static const EGLint attribs[] = {
++            EGL_RED_SIZE, 1,
++            EGL_GREEN_SIZE, 1,
++            EGL_BLUE_SIZE, 1,
++            EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
++            EGL_NONE
++        };
++
++        if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs) || num_configs < 1) {
++            av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
++            return -1;
++        }
++    }
++
++    {
++        EGLint vid;
++        if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
++            av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
++            return -1;
++        }
++
++        {
++            XVisualInfo visTemplate = {
++                .visualid = vid,
++            };
++            int num_visuals;
++            XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
++                                                  &visTemplate, &num_visuals);
++
++            if (!visinfo) {
++                av_log(s, AV_LOG_ERROR, "Error: couldn't get X visual for EGL config\n");
++                return -1;
++            }
++            /* window attributes */
++            attr.background_pixel = 0;
++            attr.border_pixel = 0;
++            attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone);
++            attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
++            /* XXX this is a bad way to get a borderless window! */
++            mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
++
++            win = XCreateWindow(dpy, root, x, y, width, height,
++                                0, visinfo->depth, InputOutput,
++                                visinfo->visual, mask, &attr);
++            XFree(visinfo);
++        }
++    }
++
++    if (fullscreen)
++        no_border(dpy, win);
++
++    /* set hints and properties */
++    {
++        XSizeHints sizehints;
++        sizehints.x = x;
++        sizehints.y = y;
++        sizehints.width  = width;
++        sizehints.height = height;
++        sizehints.flags = USSize | USPosition;
++        XSetNormalHints(dpy, win, &sizehints);
++        XSetStandardProperties(dpy, win, name, name,
++                               None, (char **)NULL, 0, &sizehints);
++    }
++
++    eglBindAPI(EGL_OPENGL_ES_API);
++
++    {
++        static const EGLint ctx_attribs[] = {
++            EGL_CONTEXT_CLIENT_VERSION, 2,
++            EGL_NONE
++        };
++        ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs);
++        if (!ctx) {
++            av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
++            return -1;
++        }
++    }
++
++    XMapWindow(dpy, win);
++
++    {
++        EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
++        if (!surf) {
++            av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
++            return -1;
++        }
++
++        if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
++            av_log(s, AV_LOG_ERROR, "Error: eglMakeCurrent failed\n");
++            return -1;
++        }
++
++        *winRet = win;
++        *ctxRet = ctx;
++        *surfRet = surf;
++    }
++
++    return 0;
++}
++
++// Compile one shader of type target. Returns the shader name, or 0 on failure.
++static GLint
++compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source)
++{
++    GLuint s = glCreateShader(target);
++    GLint ok = GL_FALSE;
++
++    if (s == 0) {
++        av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
++        return 0;
++    }
++
++    glShaderSource(s, 1, (const GLchar **)&source, NULL);
++    glCompileShader(s);
++    glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
++
++    if (!ok) {
++        GLchar *info = NULL;
++        GLint size = 0;
++
++        glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
++        // A size of 0 or 1 means there is no log text to fetch
++        if (size > 1 && (info = malloc(size)) != NULL)
++            glGetShaderInfoLog(s, size, NULL, info);
++        av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info ? info : "", source);
++        // The log buffer and the failed shader object used to be leaked
++        free(info);
++        glDeleteShader(s);
++        return 0;
++    }
++
++    return s;
++}
++
++// Link vertex shader vs and fragment shader fs into a new program.
++// Returns the program name, or 0 (after logging) on failure.
++static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs)
++{
++    GLuint prog = glCreateProgram();
++    GLint ok = GL_FALSE;
++
++    if (prog == 0) {
++        av_log(s, AV_LOG_ERROR, "Failed to create program\n");
++        return 0;
++    }
++
++    glAttachShader(prog, vs);
++    glAttachShader(prog, fs);
++    glLinkProgram(prog);
++    glGetProgramiv(prog, GL_LINK_STATUS, &ok);
++
++    if (!ok) {
++        /* Some drivers return a size of 1 for an empty log.  This is the size
++         * of a log that contains only a terminating NUL character.
++         */
++        GLint size = 0;
++        GLchar *info = NULL;
++        glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
++        if (size > 1 && (info = malloc(size)) != NULL)
++            glGetProgramInfoLog(prog, size, NULL, info);
++        av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
++               (info != NULL) ? info : "<empty log>");
++        // The log buffer and the unusable program used to be leaked
++        free(info);
++        glDeleteProgram(prog);
++        return 0;
++    }
++
++    return prog;
++}
++
++static int
++gl_setup(struct AVFormatContext *const s)  // builds the shader program and vertex data; 0 on success, -1 on failure
++{
++    const char *vs =
++        "attribute vec4 pos;\n"
++        "varying vec2 texcoord;\n"
++        "\n"
++        "void main() {\n"
++        "  gl_Position = pos;\n"
++        "  texcoord.x = (pos.x + 1.0) / 2.0;\n"
++        "  texcoord.y = (-pos.y + 1.0) / 2.0;\n"
++        "}\n";
++    const char *fs =
++        "#extension GL_OES_EGL_image_external : enable\n"
++        "precision mediump float;\n"
++        "uniform samplerExternalOES s;\n"
++        "varying vec2 texcoord;\n"
++        "void main() {\n"
++        "  gl_FragColor = texture2D(s, texcoord);\n"
++        "}\n";
++
++    GLuint vs_s;
++    GLuint fs_s;
++    GLuint prog;
++
++    if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
++        !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
++        !(prog = link_program(s, vs_s, fs_s)))
++        return -1;  // the helper that failed has already logged the reason
++
++    glUseProgram(prog);
++
++    {
++        static const float verts[] = {  // four corners of the -1..1 square, as x,y pairs
++            -1, -1,
++            1, -1,
++            1,  1,
++            -1,  1,
++        };
++        glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);  // NOTE(review): index 0 is assumed to be "pos"; no glBindAttribLocation here - confirm
++    }
++
++    glEnableVertexAttribArray(0);
++    return 0;
++}
++
++/*
++ * write_trailer callback: there is nothing to finalise here, so apart from
++ * the optional trace line this just reports success.
++ */
++static int egl_vout_write_trailer(AVFormatContext *s)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++    return 0;
++}
++
++/*
++ * write_header callback: accept the stream layout only if there is at most
++ * one stream and stream 0 is wrapped-AVFrame video.
++ *
++ * Returns 0 if acceptable, AVERROR(EINVAL) otherwise.
++ */
++static int egl_vout_write_header(AVFormatContext *s)
++{
++    const AVCodecParameters *const par = s->streams[0]->codecpar;
++    int acceptable;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++    acceptable = s->nb_streams <= 1
++        && par->codec_type == AVMEDIA_TYPE_VIDEO
++        && par->codec_id   == AV_CODEC_ID_WRAPPED_AVFRAME;
++
++    if (acceptable)
++        return 0;
++
++    av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++    return AVERROR(EINVAL);
++}
++
++
++/**
++ * Import a DRM PRIME frame as an external-OES texture, draw it on the quad
++ * and swap buffers.
++ *
++ * The texture is created on demand in one of the 32 aux slots and is deleted
++ * again (and the slot released) once the frame has been presented.
++ *
++ * @param s      logging context
++ * @param de     display environment holding the EGL display/surface and
++ *               the aux slots
++ * @param frame  frame whose data[0] is an AVDRMFrameDescriptor
++ * @return 0 on success, AVERROR(EINVAL) if no aux slot is available or the
++ *         descriptor has more planes than can be passed to EGL, -1 if the
++ *         dmabuf import fails
++ */
++static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame)
++{
++    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0];
++    egl_aux_t *da = NULL;
++    unsigned int i;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++
++    /* Pick the first slot that is free or already bound to this buffer fd */
++    for (i = 0; i != 32; ++i) {
++        if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
++            da = de->aux + i;
++            break;
++        }
++    }
++
++    if (da == NULL) {
++        av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
++        return AVERROR(EINVAL);
++    }
++
++    if (da->texture == 0) {
++        /* 3 size/format pairs + up to 3 planes * 5 pairs + EGL_NONE fits */
++        EGLint attribs[50];
++        EGLint *a = attribs;
++        int layer, plane;
++        /* Attribute names in the order they are consumed: 5 per plane */
++        static const EGLint anames[] = {
++            EGL_DMA_BUF_PLANE0_FD_EXT,
++            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE0_PITCH_EXT,
++            EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
++            EGL_DMA_BUF_PLANE1_FD_EXT,
++            EGL_DMA_BUF_PLANE1_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE1_PITCH_EXT,
++            EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
++            EGL_DMA_BUF_PLANE2_FD_EXT,
++            EGL_DMA_BUF_PLANE2_OFFSET_EXT,
++            EGL_DMA_BUF_PLANE2_PITCH_EXT,
++            EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
++            EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
++        };
++        const EGLint *b = anames;
++        const EGLint *const b_end = anames + sizeof(anames) / sizeof(anames[0]);
++
++        *a++ = EGL_WIDTH;
++        *a++ = av_frame_cropped_width(frame);
++        *a++ = EGL_HEIGHT;
++        *a++ = av_frame_cropped_height(frame);
++        *a++ = EGL_LINUX_DRM_FOURCC_EXT;
++        *a++ = desc->layers[0].format;
++
++        for (layer = 0; layer < desc->nb_layers; ++layer) {
++            for (plane = 0; plane < desc->layers[layer].nb_planes; ++plane) {
++                const AVDRMPlaneDescriptor *const p = desc->layers[layer].planes + plane;
++                const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index;
++
++                /* Every plane uses up five names.  The table only covers
++                 * three planes, so reject anything larger instead of
++                 * reading beyond anames[] and overfilling attribs[].
++                 */
++                if (b_end - b < 5) {
++                    av_log(s, AV_LOG_ERROR, "Too many planes in DRM frame\n");
++                    return AVERROR(EINVAL);
++                }
++
++                *a++ = *b++;
++                *a++ = obj->fd;
++                *a++ = *b++;
++                *a++ = p->offset;
++                *a++ = *b++;
++                *a++ = p->pitch;
++                if (obj->format_modifier == 0) {
++                    /* No modifier to pass: skip the LO/HI names */
++                    b += 2;
++                }
++                else {
++                    *a++ = *b++;
++                    *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
++                    *a++ = *b++;
++                    *a++ = (EGLint)(obj->format_modifier >> 32);
++                }
++            }
++        }
++
++        *a = EGL_NONE;
++
++#if TRACE_ALL
++        {
++            int n;
++            for (a = attribs, n = 0; *a != EGL_NONE; a += 2, ++n) {
++                av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", n, a[0], a[1]);
++            }
++        }
++#endif
++        {
++            const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
++                                                     EGL_NO_CONTEXT,
++                                                     EGL_LINUX_DMA_BUF_EXT,
++                                                     NULL, attribs);
++            if (!image) {
++                av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
++                return -1;
++            }
++
++            glGenTextures(1, &da->texture);
++            glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
++            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
++            glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
++
++            /* The image handle is destroyed once the texture has been
++             * associated with it */
++            eglDestroyImageKHR(de->setup.egl_dpy, image);
++        }
++
++        da->fd = desc->objects[0].fd;
++    }
++
++    glClearColor(0.5, 0.5, 0.5, 0.5);
++    glClear(GL_COLOR_BUFFER_BIT);
++
++    glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
++    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
++    eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
++
++    /* Textures are not cached across frames: drop it and free the slot */
++    glDeleteTextures(1, &da->texture);
++    da->texture = 0;
++    da->fd = -1;
++
++    return 0;
++}
++
++/*
++ * Display thread entry point.
++ *
++ * v is the AVFormatContext; its priv_data is the egl_display_env_t.
++ * The thread opens the X display, initialises EGL, creates the window and
++ * sets up GL, then posts display_start_sem to report that startup finished.
++ * After that it waits on q_sem and displays whatever frame is in q_next
++ * until q_terminate is set.
++ *
++ * On any startup failure it sets q_terminate and still posts
++ * display_start_sem so that the waiter is released.
++ * Always returns NULL.
++ */
++static void* display_thread(void *v)
++{
++    AVFormatContext *const s = v;
++    egl_display_env_t *const de = s->priv_data;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
++#endif
++    {
++        EGLint egl_major, egl_minor;
++
++        de->setup.dpy = XOpenDisplay(NULL);
++        if (!de->setup.dpy) {
++            av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
++            goto fail;
++        }
++
++        de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
++        if (!de->setup.egl_dpy) {
++            av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
++            goto fail;
++        }
++
++        if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
++            av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
++            goto fail;
++        }
++
++        av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
++
++        if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
++            av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
++            goto fail;
++        }
++    }
++
++    // Fall back to 1280x720 if either window dimension is unset (zero)
++    if (!de->window_width || !de->window_height) {
++        de->window_width = 1280;
++        de->window_height = 720;
++    }
++    if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
++                    &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
++        av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
++        goto fail;
++    }
++
++    if (gl_setup(s)) {
++        av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
++        goto fail;
++    }
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
++#endif
++    // Startup succeeded: release whoever is waiting on display_start_sem
++    sem_post(&de->display_start_sem);
++
++    for (;;) {
++        AVFrame *frame;
++
++        // Retry if the wait is interrupted; any other error is fatal
++        while (sem_wait(&de->q_sem) != 0) {
++            av_assert0(errno == EINTR);
++        }
++
++        if (de->q_terminate)
++            break;
++
++        // Take ownership of the queued frame under the queue lock
++        pthread_mutex_lock(&de->q_lock);
++        frame = de->q_next;
++        de->q_next = NULL;
++        pthread_mutex_unlock(&de->q_lock);
++
++        // NOTE(review): the return value of do_display() is ignored here
++        do_display(s, de, frame);
++
++        // Free the previously shown frame and keep the new one in q_this
++        av_frame_free(&de->q_this);
++        de->q_this = frame;
++    }
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
++#endif
++
++    return NULL;
++
++fail:
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
++#endif
++    // Flag the failure before waking the waiter so it sees q_terminate set
++    de->q_terminate = 1;
++    sem_post(&de->display_start_sem);
++
++    return NULL;
++}
++
++/**
++ * write_packet callback: queue the wrapped frame for the display thread.
++ *
++ * DRM PRIME frames are referenced directly; VAAPI frames are mapped to
++ * DRM PRIME first.  Any other format is rejected.  If a frame is still
++ * waiting in the queue it is replaced (and freed), unless show_all is set,
++ * in which case this call polls until the queue slot is empty.
++ *
++ * @param s    muxer context; priv_data is the egl_display_env_t
++ * @param pkt  packet whose data is an AVFrame
++ * @return 0 on success, AVERROR(EINVAL) for an unsupported or unmappable
++ *         frame, AVERROR(ENOMEM) or the av_frame_ref() error if the frame
++ *         could not be allocated or referenced
++ */
++static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    const AVFrame *const src_frame = (AVFrame *)pkt->data;
++    AVFrame *frame;
++    egl_display_env_t *const de = s->priv_data;
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++
++    if (src_frame->format != AV_PIX_FMT_DRM_PRIME &&
++        src_frame->format != AV_PIX_FMT_VAAPI) {
++        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
++        return AVERROR(EINVAL);
++    }
++
++    /* Both supported paths need a fresh frame; fail cleanly if we can't
++     * get one rather than dereferencing NULL below */
++    frame = av_frame_alloc();
++    if (frame == NULL)
++        return AVERROR(ENOMEM);
++
++    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
++        const int ret = av_frame_ref(frame, src_frame);
++        if (ret < 0) {
++            av_frame_free(&frame);
++            return ret;
++        }
++    }
++    else {
++        frame->format = AV_PIX_FMT_DRM_PRIME;
++        if (av_hwframe_map(frame, src_frame, 0) != 0) {
++            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
++            av_frame_free(&frame);
++            return AVERROR(EINVAL);
++        }
++    }
++
++    // Really hacky sync
++    while (de->show_all && de->q_next) {
++        usleep(3000);
++    }
++
++    /* Swap our frame into the queue slot; 'frame' then holds the old one */
++    pthread_mutex_lock(&de->q_lock);
++    {
++        AVFrame *const t = de->q_next;
++        de->q_next = frame;
++        frame = t;
++    }
++    pthread_mutex_unlock(&de->q_lock);
++
++    /* Slot was empty: wake the display thread.  Otherwise a wake-up is
++     * already pending for the slot and the displaced frame is dropped. */
++    if (frame == NULL)
++        sem_post(&de->q_sem);
++    else
++        av_frame_free(&frame);
++
++    return 0;
++}
++
++/*
++ * write_uncoded_frame callback.
++ *
++ * egl_vout_write_header() should have accepted only supported formats, so
++ * an AV_WRITE_UNCODED_FRAME_QUERY is answered with success.  A non-query
++ * call also just returns 0; the frame is not touched here.
++ */
++static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                                unsigned flags)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    /* Query and non-query calls have the same result */
++    return 0;
++}
++
++/*
++ * control_message callback: a window repaint request is acknowledged with 0
++ * (no action is taken); every other message yields AVERROR(ENOSYS).
++ */
++static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
++#endif
++    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
++        return 0;
++
++    return AVERROR(ENOSYS);
++}
++
++/*
++ * init callback: reset the display state, create the queue lock and
++ * semaphores, start the display thread and wait for it to report whether
++ * startup worked.
++ *
++ * Returns 0 on success, -1 if the display thread flagged a startup failure.
++ * deinit is called if init fails so no need to clean up explicitly here
++ */
++static int egl_vout_init(struct AVFormatContext *s)
++{
++    egl_display_env_t *const de = s->priv_data;
++    unsigned int i;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    de->setup = (struct egl_setup) { 0 };
++
++    /* Mark every aux slot as unused */
++    for (i = 0; i != 32; ++i) {
++        de->aux[i].fd = -1;
++    }
++
++    de->q_terminate = 0;
++    pthread_mutex_init(&de->q_lock, NULL);
++    sem_init(&de->q_sem, 0, 0);
++    sem_init(&de->display_start_sem, 0, 0);
++    av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
++
++    /* Wait for the thread to post its startup result.  A signal may
++     * interrupt sem_wait(); retry in that case, otherwise q_terminate
++     * would be read before the thread has decided its value. */
++    while (sem_wait(&de->display_start_sem) != 0) {
++        av_assert0(errno == EINTR);
++    }
++    if (de->q_terminate) {
++        av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
++        return -1;
++    }
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++
++    return 0;
++}
++
++/*
++ * deinit callback: ask the display thread to stop, wait for it to exit,
++ * then destroy the synchronisation objects created in egl_vout_init() and
++ * free any frames still held.
++ */
++static void egl_vout_deinit(struct AVFormatContext *s)
++{
++    egl_display_env_t *const de = s->priv_data;
++
++    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
++
++    /* Set the flag first, then wake the thread so it sees it */
++    de->q_terminate = 1;
++    sem_post(&de->q_sem);
++    pthread_join(de->q_thread, NULL);
++    sem_destroy(&de->q_sem);
++    /* Created alongside q_sem in init, so release it here as well */
++    sem_destroy(&de->display_start_sem);
++    pthread_mutex_destroy(&de->q_lock);
++
++    av_frame_free(&de->q_next);
++    av_frame_free(&de->q_this);
++
++    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
++}
++
++// Byte offset of an option's backing field within egl_display_env_t
++#define OFFSET(x) offsetof(egl_display_env_t, x)
++// User-settable options of the EGL output device.
++// NOTE(review): window_size is an IMAGE_SIZE option anchored at
++// window_width; presumably window_height directly follows it in
++// egl_display_env_t (declared outside this excerpt) - confirm.
++static const AVOption options[] = {
++    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++    { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++    { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++    { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++    { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++    { NULL }
++
++};
++
++// AVClass for the EGL output device: ties the options table above to the
++// muxer's private data and categorises it as a video output device.
++static const AVClass egl_vout_class = {
++    .class_name = "egl vid outdev",
++    .item_name  = av_default_item_name,
++    .option     = options,
++    .version    = LIBAVUTIL_VERSION_INT,
++    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++// Output device definition for "vout_egl".
++// It takes wrapped-AVFrame video only (no audio codec), needs no output
++// file (AVFMT_NOFILE) and wires up the egl_vout_* callbacks defined above.
++AVOutputFormat ff_vout_egl_muxer = {
++    .name           = "vout_egl",
++    .long_name      = NULL_IF_CONFIG_SMALL("Egl video output device"),
++    .priv_data_size = sizeof(egl_display_env_t),
++    .audio_codec    = AV_CODEC_ID_NONE,
++    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
++    .write_header   = egl_vout_write_header,
++    .write_packet   = egl_vout_write_packet,
++    .write_uncoded_frame = egl_vout_write_frame,
++    .write_trailer  = egl_vout_write_trailer,
++    .control_message = egl_vout_control_message,
++    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++    .priv_class     = &egl_vout_class,
++    .init           = egl_vout_init,
++    .deinit         = egl_vout_deinit,
++};
++
+diff --git a/libavdevice/rpi_vout.c b/libavdevice/rpi_vout.c
+new file mode 100644
+index 0000000000..84723a34ad
+--- /dev/null
++++ b/libavdevice/rpi_vout.c
+@@ -0,0 +1,534 @@
++/*
++ * Copyright (c) 2013 Jeff Moguillansky
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * Raspberry Pi (MMAL) video output device
++ *
++ * TODO:
++ * - add support to more formats
++ */
++
++#include "libavutil/opt.h"
++#include "libavutil/avassert.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/imgutils.h"
++#include "libavformat/internal.h"
++#include "avdevice.h"
++
++#include <stdatomic.h>
++#include <unistd.h>
++
++#pragma GCC diagnostic push
++// Many many redundant decls in the header files
++#pragma GCC diagnostic ignored "-Wredundant-decls"
++#include <bcm_host.h>
++#include <interface/mmal/mmal.h>
++#include <interface/mmal/mmal_parameters_camera.h>
++#include <interface/mmal/mmal_buffer.h>
++#include <interface/mmal/mmal_port.h>
++#include <interface/mmal/util/mmal_util.h>
++#include <interface/mmal/util/mmal_default_components.h>
++#include <interface/mmal/util/mmal_connection.h>
++#include <interface/mmal/util/mmal_util_params.h>
++#pragma GCC diagnostic pop
++#include "libavutil/rpi_sand_fns.h"
++#include "libavcodec/rpi_zc.h"
++
++#define TRACE_ALL 0
++
++#define DISPLAY_PORT_DEPTH 4
++
++/*
++ * Private context of the Rpi (MMAL) video output device.
++ */
++typedef struct rpi_display_env_s
++{
++    AVClass *class;
++
++    MMAL_COMPONENT_T* display;  // Video renderer component (created in xv_write_header)
++    MMAL_COMPONENT_T* isp;      // ISP component, only present while the current format needs one
++    MMAL_PORT_T * port_in;  // Input port of either isp or display depending on pipe setup
++    MMAL_CONNECTION_T * conn;   // Connection from isp output to display input (when isp is in use)
++
++    MMAL_POOL_T *rpi_pool;      // Pool of buffer headers used to send frames
++    // Number of buffers currently out on the port: incremented in
++    // display_frame(), decremented in buf_release_cb().  It is accessed with
++    // the <stdatomic.h> generic functions, which require an atomic type.
++    atomic_int rpi_display_count;
++
++    MMAL_FOURCC_T req_fmt;          // Encoding last committed to port_in
++    MMAL_VIDEO_FORMAT_T req_vfmt;   // Video format last committed to port_in
++
++    AVZcEnvPtr zc;              // Zero-copy environment allocated in rpi_vout_init()
++
++    // Values set via AVOptions
++    int window_width, window_height;
++    int window_x, window_y;
++    int layer, fullscreen;
++    int show_all;
++} rpi_display_env_t;
++
++
++/* Input port callback: just hand the buffer header back. */
++static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++    mmal_buffer_header_release(buffer);
++}
++
++/* Control port callback: the event content is ignored, the buffer is released. */
++static void display_cb_control(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer)
++{
++    mmal_buffer_header_release(buffer);
++}
++
++
++/*
++ * Translate an AVPixelFormat into the matching MMAL encoding.
++ * Returns 0 for any pixel format that has no mapping here.
++ */
++static MMAL_FOURCC_T mmfmt_from_avfmt(const enum AVPixelFormat fmt)
++{
++    MMAL_FOURCC_T encoding = 0;
++
++    switch (fmt) {
++    case AV_PIX_FMT_SAND128:
++    case AV_PIX_FMT_RPI4_8:
++        encoding = MMAL_ENCODING_YUVUV128;
++        break;
++    case AV_PIX_FMT_RPI4_10:
++        encoding = MMAL_ENCODING_YUV10_COL;
++        break;
++    case AV_PIX_FMT_SAND64_10:
++        encoding = MMAL_ENCODING_YUVUV64_10;
++        break;
++    case AV_PIX_FMT_SAND64_16:
++        encoding = MMAL_ENCODING_YUVUV64_16;
++        break;
++    case AV_PIX_FMT_YUV420P:
++        encoding = MMAL_ENCODING_I420;
++        break;
++    default:
++        break;
++    }
++
++    return encoding;
++}
++
++
++/*
++ * Fill in an MMAL elementary stream format (and its video sub-format) from
++ * a frame and the geometry of its zero-copy buffer.
++ *
++ * es_fmt  format to fill; es_fmt->es must already point at valid storage
++ * frame   source of cropping and sample aspect ratio
++ * fr_ref  zero-copy reference whose geometry gives size, stride and format
++ */
++static void video_format_from_zc_frame(MMAL_ES_FORMAT_T* const es_fmt,
++                                       const AVFrame * const frame, const AVRpiZcRefPtr fr_ref)
++{
++    MMAL_VIDEO_FORMAT_T *const vfmt = &es_fmt->es->video;
++    const AVRpiZcFrameGeometry * geom = av_rpi_zc_geometry(fr_ref);
++
++    if (!av_rpi_is_sand_format(geom->format)) {
++        // Ordinary layout: width is the luma stride expressed in pixels
++        vfmt->width = geom->stride_y / geom->bytes_per_pel;
++        vfmt->height = geom->height_y;
++        es_fmt->flags = 0;
++    }
++    else {
++        // Sand formats are a bit "special":
++        //   stride1 is implicit in the format
++        //   width carries stride2 (the column stride)
++        if (geom->stripe_is_yc)
++            vfmt->width = geom->height_y + geom->height_c;
++        else
++            vfmt->width = geom->height_y;
++        // Original author's note: height is expected to become the video
++        // height once a flag for that exists
++        vfmt->height = geom->height_y;
++        es_fmt->flags = MMAL_ES_FORMAT_FLAG_COL_FMTS_WIDTH_IS_COL_STRIDE;
++    }
++
++    es_fmt->type = MMAL_ES_TYPE_VIDEO;
++    es_fmt->encoding = mmfmt_from_avfmt(geom->format);
++    es_fmt->encoding_variant = 0;
++    es_fmt->bitrate = 0;
++
++    // Crop rectangle comes from the frame
++    vfmt->crop.x = frame->crop_left;
++    vfmt->crop.y = frame->crop_top;
++    vfmt->crop.width = av_frame_cropped_width(frame);
++    vfmt->crop.height = av_frame_cropped_height(frame);
++
++    // Frame rate is not known at this point
++    vfmt->frame_rate.num = 0;
++    vfmt->frame_rate.den = 0;
++
++    vfmt->par.num = frame->sample_aspect_ratio.num;
++    vfmt->par.den = frame->sample_aspect_ratio.den;
++
++    // Colour space is left as unknown for now
++    vfmt->color_space = 0;
++}
++
++/*
++ * Pre-release callback installed on each buffer header in display_frame().
++ * Drops the zero-copy reference attached to the header (if any) and
++ * decrements the count of buffers in flight.  Always returns MMAL_FALSE.
++ */
++static MMAL_BOOL_T buf_release_cb(MMAL_BUFFER_HEADER_T * buf, void *userdata)
++{
++    rpi_display_env_t * const env = userdata;
++
++    if (buf->user_data != NULL) {
++        av_rpi_zc_unref((AVRpiZcRefPtr)buf->user_data);
++        buf->user_data = NULL;
++    }
++
++    atomic_fetch_sub(&env->rpi_display_count, 1);
++
++    return MMAL_FALSE;
++}
++
++/* Non-zero only for AV_PIX_FMT_SAND64_10, the one format routed via the ISP. */
++static inline int avfmt_needs_isp(const enum AVPixelFormat avfmt)
++{
++    return (avfmt == AV_PIX_FMT_SAND64_10) ? 1 : 0;
++}
++
++/*
++ * Tear down the ISP stage if there is one: disable its input and control
++ * ports, destroy the connection, then destroy the component.  The conn and
++ * isp pointers in de are reset to NULL.  Safe to call when nothing is set up.
++ */
++static void isp_remove(AVFormatContext * const s, rpi_display_env_t * const de)
++{
++    MMAL_COMPONENT_T * const isp = de->isp;
++
++    // Ports first, while the component still exists
++    if (isp != NULL) {
++        if (isp->input[0]->is_enabled)
++            mmal_port_disable(isp->input[0]);
++        if (isp->control->is_enabled)
++            mmal_port_disable(isp->control);
++    }
++
++    // Then the connection
++    if (de->conn != NULL) {
++        mmal_connection_destroy(de->conn);
++        de->conn = NULL;
++    }
++
++    // And finally the component itself
++    if (isp != NULL) {
++        mmal_component_destroy(isp);
++        de->isp = NULL;
++    }
++}
++
++/*
++ * Send one frame to the display.
++ *
++ * The frame is wrapped in a zero-copy reference, attached to a buffer
++ * header from the pool and sent to port_in.  If the frame's encoding, size
++ * or crop differs from what was last committed, the input pipeline is
++ * reconfigured first (including creating or removing the ISP stage).
++ *
++ * s    logging context
++ * de   display environment; NULL is tolerated and makes this a no-op
++ * fr   frame to display
++ *
++ * Frames are silently dropped (logged at verbose level) when too many
++ * buffers are already in flight.  Errors are logged and the frame is
++ * discarded; nothing is returned to the caller.
++ */
++static void display_frame(AVFormatContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
++{
++    MMAL_BUFFER_HEADER_T* buf = NULL;
++    AVRpiZcRefPtr fr_buf = NULL;
++
++    if (de == NULL)
++        return;
++
++    if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
++        av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
++        return;
++    }
++
++    if ((fr_buf = av_rpi_zc_ref(s, de->zc, fr, fr->format, 1)) == NULL) {
++        return;
++    }
++
++    buf = mmal_queue_get(de->rpi_pool->queue);
++    if (!buf) {
++        // Running too fast so drop the frame (unexpected)
++        goto fail;
++    }
++
++    buf->cmd = 0;
++    buf->offset = 0;
++    buf->flags = 0;
++    mmal_buffer_header_reset(buf);
++
++    atomic_fetch_add(&de->rpi_display_count, 1);  // Deced on release
++    mmal_buffer_header_pre_release_cb_set(buf, buf_release_cb, de);
++
++    // From here on the header owns fr_buf: releasing buf unrefs it
++    buf->user_data = fr_buf;
++    buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf);  // Cast our handle to a pointer for mmal
++    buf->offset = av_rpi_zc_offset(fr_buf);
++    buf->length = av_rpi_zc_length(fr_buf);
++    buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
++
++    // With show_all set, wait for in-flight buffers to be released rather
++    // than letting the next frame be dropped
++    while (de->show_all && atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
++        usleep(5000);
++    }
++
++    {
++        MMAL_ES_SPECIFIC_FORMAT_T new_ess = {.video = {0}};
++        MMAL_ES_FORMAT_T new_es = {.es = &new_ess};
++        MMAL_VIDEO_FORMAT_T * const new_vfmt = &new_ess.video;
++
++        video_format_from_zc_frame(&new_es, fr, fr_buf);
++        if (de->req_fmt != new_es.encoding ||
++            de->req_vfmt.width       != new_vfmt->width ||
++            de->req_vfmt.height      != new_vfmt->height ||
++            de->req_vfmt.crop.x      != new_vfmt->crop.x ||
++            de->req_vfmt.crop.y      != new_vfmt->crop.y ||
++            de->req_vfmt.crop.width  != new_vfmt->crop.width ||
++            de->req_vfmt.crop.height != new_vfmt->crop.height) {
++            // Something has changed
++
++            // If we have an ISP tear it down
++            isp_remove(s, de);
++            de->port_in = de->display->input[0];
++
++            // If we still need an ISP create it now
++            if (avfmt_needs_isp(fr->format))
++            {
++                if (mmal_component_create("vc.ril.isp", &de->isp) != MMAL_SUCCESS)
++                {
++                    av_log(s, AV_LOG_ERROR, "ISP creation failed\n");
++                    goto fail;
++                }
++                de->port_in = de->isp->input[0];
++            }
++
++            mmal_format_copy(de->port_in->format, &new_es);
++
++            if (mmal_port_format_commit(de->port_in)) {
++                av_log(s, AV_LOG_ERROR, "Failed to commit input format\n");
++                goto fail;
++            }
++
++            // If we have an ISP then we must want to use it
++            if (de->isp != NULL) {
++                MMAL_PORT_T * const port_out = de->isp->output[0];
++                MMAL_VIDEO_FORMAT_T* vfmt_in = &de->port_in->format->es->video;
++                MMAL_VIDEO_FORMAT_T* vfmt_out = &port_out->format->es->video;
++
++                port_out->format->type = MMAL_ES_TYPE_VIDEO;
++                port_out->format->encoding  = MMAL_ENCODING_YUVUV128;
++                port_out->format->encoding_variant = 0;
++                port_out->format->bitrate = 0;
++                port_out->format->flags = 0;
++                port_out->format->extradata = NULL;
++                port_out->format->extradata_size = 0;
++
++                // Output size is the crop size rounded up to 32x16
++                vfmt_out->width       = (vfmt_in->crop.width + 31) & ~31;
++                vfmt_out->height      = (vfmt_in->crop.height + 15) & ~15;
++                vfmt_out->crop.x      = 0;
++                vfmt_out->crop.y      = 0;
++                vfmt_out->crop.width  = vfmt_in->crop.width;
++                vfmt_out->crop.height = vfmt_in->crop.height;
++                vfmt_out->frame_rate  = vfmt_in->frame_rate;
++                vfmt_out->par         = vfmt_in->par;
++                vfmt_out->color_space = vfmt_in->color_space;
++
++                if (mmal_port_format_commit(port_out)) {
++                    av_log(s, AV_LOG_ERROR, "Failed to commit output format\n");
++                    goto fail;
++                }
++
++                if (mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING) != MMAL_SUCCESS) {
++                    av_log(s, AV_LOG_ERROR, "Failed to create connection\n");
++                    goto fail;
++                }
++                if (mmal_connection_enable(de->conn) != MMAL_SUCCESS) {
++                    av_log(s, AV_LOG_ERROR, "Failed to enable connection\n");
++                    goto fail;
++                }
++                // Check these like the equivalent display calls in
++                // xv_write_header(): a half-enabled ISP must not be used.
++                // req_fmt is not updated on failure so the next frame
++                // rebuilds the pipeline from scratch.
++                if (mmal_port_enable(de->isp->control,display_cb_control) != MMAL_SUCCESS) {
++                    av_log(s, AV_LOG_ERROR, "Failed to enable ISP control port\n");
++                    goto fail;
++                }
++                if (mmal_component_enable(de->isp) != MMAL_SUCCESS) {
++                    av_log(s, AV_LOG_ERROR, "Failed to enable ISP component\n");
++                    goto fail;
++                }
++            }
++
++            // Number of slots in my port Q
++            de->port_in->buffer_num = DISPLAY_PORT_DEPTH;
++            // Size to keep it happy - isn't used for anything other than error checking
++            de->port_in->buffer_size = buf->alloc_size;
++            if (!de->port_in->is_enabled)
++            {
++                mmal_port_parameter_set_boolean(de->port_in, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle?  Would have expected a vc_image?
++                if (mmal_port_enable(de->port_in, display_cb_input) != MMAL_SUCCESS) {
++                    av_log(s, AV_LOG_ERROR, "Failed to enable input port\n");
++                    goto fail;
++                }
++            }
++
++            // Remember what was committed so unchanged frames skip all this
++            de->req_fmt  = new_es.encoding;
++            de->req_vfmt = *new_vfmt;
++        }
++    }
++
++    if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
++    {
++        av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
++        goto fail;
++    }
++    return;
++
++fail:
++    // If we have a buf then fr_buf is held by that
++    if (buf != NULL)
++        mmal_buffer_header_release(buf);
++    else if (fr_buf != NULL)
++        av_rpi_zc_unref(fr_buf);
++}
++
++
++/*
++ * write_trailer callback, also used as the error-path cleanup of
++ * xv_write_header(): disable the input port, remove any ISP stage and
++ * destroy the buffer pool and display component.  Always returns 0.
++ */
++static int xv_write_trailer(AVFormatContext *s)
++{
++    rpi_display_env_t * const de = s->priv_data;
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++    if (de->port_in != NULL) {
++        if (de->port_in->is_enabled)
++            mmal_port_disable(de->port_in);
++    }
++
++    // Disabling the port should have returned every buffer - warn if not
++    if (atomic_load(&de->rpi_display_count) != 0)
++        av_log(s, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
++
++    isp_remove(s, de);
++
++    if (de->rpi_pool != NULL) {
++        mmal_pool_destroy(de->rpi_pool);
++        de->rpi_pool = NULL;
++    }
++
++    if (de->display != NULL) {
++        mmal_component_destroy(de->display);
++        de->display = NULL;
++    }
++
++    return 0;
++}
++
++/*
++ * write_header callback: validate the stream layout, then create and
++ * enable the MMAL video renderer with the requested display region and
++ * create the buffer header pool.
++ *
++ * Window size falls back to the stream's coded size and the layer to 2
++ * when the corresponding option is zero.
++ *
++ * Returns 0 on success, AVERROR(EINVAL) for an unsupported stream layout,
++ * AVERROR_UNKNOWN (after cleaning up via xv_write_trailer) if any MMAL
++ * step fails.
++ */
++static int xv_write_header(AVFormatContext *s)
++{
++    rpi_display_env_t * const de = s->priv_data;
++    const AVCodecParameters * const par = s->streams[0]->codecpar;
++    const unsigned int dest_w = de->window_width ? de->window_width : par->width;
++    const unsigned int dest_h = de->window_height ? de->window_height : par->height;
++    const unsigned int dest_x = de->window_x;
++    const unsigned int dest_y = de->window_y;
++    const int layer = de->layer ? de->layer : 2;
++    const MMAL_BOOL_T fullscreen = de->fullscreen;
++    // Fullscreen additionally asks for lower layers to be discarded
++    MMAL_DISPLAYREGION_T region =
++    {
++        .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
++        .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN |
++            MMAL_DISPLAY_SET_DEST_RECT | MMAL_DISPLAY_SET_ALPHA,
++        .layer = layer,
++        .fullscreen = fullscreen,
++        .dest_rect = {dest_x, dest_y, dest_w, dest_h},
++        .alpha = fullscreen ? (0xff | MMAL_DISPLAY_ALPHA_FLAGS_DISCARD_LOWER_LAYERS) : 0xff,
++    };
++
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: %dx%d\n", __func__, dest_w, dest_h);
++#endif
++    if (!(s->nb_streams <= 1
++          && par->codec_type == AVMEDIA_TYPE_VIDEO
++          && par->codec_id   == AV_CODEC_ID_WRAPPED_AVFRAME)) {
++        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
++        return AVERROR(EINVAL);
++    }
++
++    bcm_host_init();  // Needs to be done by someone...
++
++    if (mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display) != MMAL_SUCCESS) {
++        av_log(s, AV_LOG_ERROR, "Failed to create display component\n");
++        goto fail;
++    }
++    de->port_in = de->display->input[0];
++
++    mmal_port_parameter_set(de->display->input[0], &region.hdr);
++
++    if (mmal_component_enable(de->display) != MMAL_SUCCESS) {
++        av_log(s, AV_LOG_ERROR, "Failed to enable display component\n");
++        goto fail;
++    }
++    if (mmal_port_enable(de->display->control,display_cb_control) != MMAL_SUCCESS) {
++        av_log(s, AV_LOG_ERROR, "Failed to enable display control port\n");
++        goto fail;
++    }
++
++    // Headers only: payload size is 0
++    de->rpi_pool = mmal_pool_create(DISPLAY_PORT_DEPTH, 0);
++    if (de->rpi_pool == NULL) {
++        av_log(s, AV_LOG_ERROR, "Failed to create pool\n");
++        goto fail;
++    }
++
++    return 0;
++
++fail:
++    xv_write_trailer(s);
++    return AVERROR_UNKNOWN;
++}
++
++/*
++ * write_packet callback: the packet data is an AVFrame, which is passed to
++ * display_frame().  Always returns 0; display errors are only logged.
++ */
++static int xv_write_packet(AVFormatContext *s, AVPacket *pkt)
++{
++    rpi_display_env_t * const de = s->priv_data;
++    AVFrame * const wrapped = (AVFrame *)pkt->data;
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s\n", __func__);
++#endif
++    display_frame(s, de, wrapped);
++    return 0;
++}
++
++/*
++ * write_uncoded_frame callback.
++ *
++ * xv_write_header() should have accepted only supported formats, so a
++ * query is answered with success without touching the frame.  Otherwise
++ * the frame is passed to display_frame().  Always returns 0.
++ */
++static int xv_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
++                          unsigned flags)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: idx=%d, flags=%#x\n", __func__, stream_index, flags);
++#endif
++
++    if (!(flags & AV_WRITE_UNCODED_FRAME_QUERY))
++        display_frame(s, s->priv_data, *ppframe);
++
++    return 0;
++}
++
++/*
++ * control_message callback: a window repaint request is acknowledged with 0
++ * (no action is taken); every other message yields AVERROR(ENOSYS).
++ */
++static int xv_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
++{
++#if TRACE_ALL
++    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
++#endif
++    if (type == AV_APP_TO_DEV_WINDOW_REPAINT)
++        return 0;
++
++    return AVERROR(ENOSYS);
++}
++
++/*
++ * init callback: allocate the zero-copy environment.
++ *
++ * Returns 0 on success or AVERROR(ENOMEM) if the allocation fails.  The
++ * failure code must be negative: a positive return from init is not
++ * treated as an error, so the device would carry on with zc == NULL.
++ * deinit is called if init fails so no need to clean up explicitly here
++ */
++static int rpi_vout_init(struct AVFormatContext * s)
++{
++    rpi_display_env_t * const de = s->priv_data;
++
++    // Get a ZC context in case we need one - has little overhead if unused
++    if ((de->zc = av_rpi_zc_int_env_alloc(s)) == NULL)
++        return AVERROR(ENOMEM);
++
++    return 0;
++}
++
++/* deinit callback: release the zero-copy environment obtained in init. */
++static void rpi_vout_deinit(struct AVFormatContext * s)
++{
++    rpi_display_env_t * const env = s->priv_data;
++
++    av_rpi_zc_int_env_freep(&env->zc);
++}
++
++
++// Byte offset of an option's backing field within rpi_display_env_t
++#define OFFSET(x) offsetof(rpi_display_env_t, x)
++// User-settable options of the Rpi output device.
++// window_size is an IMAGE_SIZE option anchored at window_width, with
++// window_height declared immediately after it in rpi_display_env_t.
++// A display_layer of 0 makes xv_write_header() use layer 2.
++static const AVOption options[] = {
++    { "show_all",     "show all frames",        OFFSET(show_all),     AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++    { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
++    { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++    { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++    { "display_layer","set display layer",      OFFSET(layer),        AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
++    { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
++    { NULL }
++
++};
++
++// AVClass for the Rpi output device: ties the options table above to the
++// muxer's private data and categorises it as a video output device.
++static const AVClass xv_class = {
++    .class_name = "rpi vid outdev",
++    .item_name  = av_default_item_name,
++    .option     = options,
++    .version    = LIBAVUTIL_VERSION_INT,
++    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
++};
++
++// Output device definition for "vout_rpi".
++// It takes wrapped-AVFrame video only (no audio codec), needs no output
++// file (AVFMT_NOFILE) and wires up the xv_* / rpi_vout_* callbacks above.
++AVOutputFormat ff_vout_rpi_muxer = {
++    .name           = "vout_rpi",
++    .long_name      = NULL_IF_CONFIG_SMALL("Rpi (mmal) video output device"),
++    .priv_data_size = sizeof(rpi_display_env_t),
++    .audio_codec    = AV_CODEC_ID_NONE,
++    .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
++    .write_header   = xv_write_header,
++    .write_packet   = xv_write_packet,
++    .write_uncoded_frame = xv_write_frame,
++    .write_trailer  = xv_write_trailer,
++    .control_message = xv_control_message,
++    .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
++    .priv_class     = &xv_class,
++    .init           = rpi_vout_init,
++    .deinit         = rpi_vout_deinit,
++};
+diff --git a/libavfilter/Makefile b/libavfilter/Makefile
+index b2c254ea67..144fbda652 100644
+--- a/libavfilter/Makefile
++++ b/libavfilter/Makefile
+@@ -233,6 +233,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER)                += vf_neighbor.o
+ OBJS-$(CONFIG_DEFLICKER_FILTER)              += vf_deflicker.o
+ OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER)        += vf_deinterlace_qsv.o
+ OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER)      += vf_deinterlace_vaapi.o vaapi_vpp.o
++OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER)    += vf_deinterlace_v4l2m2m.o
+ OBJS-$(CONFIG_DEJUDDER_FILTER)               += vf_dejudder.o
+ OBJS-$(CONFIG_DELOGO_FILTER)                 += vf_delogo.o
+ OBJS-$(CONFIG_DENOISE_VAAPI_FILTER)          += vf_misc_vaapi.o vaapi_vpp.o
+@@ -459,6 +460,7 @@ OBJS-$(CONFIG_TRANSPOSE_OPENCL_FILTER)       += vf_transpose_opencl.o opencl.o o
+ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER)        += vf_transpose_vaapi.o vaapi_vpp.o
+ OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
+ OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
++OBJS-$(CONFIG_UNSAND_FILTER)                 += vf_unsand.o
+ OBJS-$(CONFIG_UNSHARP_FILTER)                += vf_unsharp.o
+ OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER)         += vf_unsharp_opencl.o opencl.o \
+                                                 opencl/unsharp.o
+diff --git a/libavfilter/aarch64/Makefile b/libavfilter/aarch64/Makefile
+index b58daa3a3f..b68209bc94 100644
+--- a/libavfilter/aarch64/Makefile
++++ b/libavfilter/aarch64/Makefile
+@@ -1,3 +1,5 @@
++OBJS-$(CONFIG_BWDIF_FILTER)                  += aarch64/vf_bwdif_init_aarch64.o
+ OBJS-$(CONFIG_NLMEANS_FILTER)                += aarch64/vf_nlmeans_init.o
+ 
++NEON-OBJS-$(CONFIG_BWDIF_FILTER)             += aarch64/vf_bwdif_neon.o
+ NEON-OBJS-$(CONFIG_NLMEANS_FILTER)           += aarch64/vf_nlmeans_neon.o
+diff --git a/libavfilter/aarch64/vf_bwdif_init_aarch64.c b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+new file mode 100644
+index 0000000000..f52bc4b9b4
+--- /dev/null
++++ b/libavfilter/aarch64/vf_bwdif_init_aarch64.c
+@@ -0,0 +1,125 @@
++/*
++ * bwdif aarch64 NEON optimisations
++ *
++ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++#include "libavutil/common.h"
++#include "libavfilter/bwdif.h"
++#include "libavutil/aarch64/cpu.h"
++
++void ff_bwdif_filter_edge_neon(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int parity, int clip_max, int spat);
++
++void ff_bwdif_filter_intra_neon(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                                int prefs3, int mrefs3, int parity, int clip_max);
++
++void ff_bwdif_filter_line_neon(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int prefs3, int mrefs3, int prefs4, int mrefs4,
++                               int parity, int clip_max);
++
++void ff_bwdif_filter_line3_neon(void * dst1, int d_stride,
++                                const void * prev1, const void * cur1, const void * next1, int s_stride,
++                                int w, int parity, int clip_max);
++
++
++static void filter_line3_helper(void * dst1, int d_stride,
++                                const void * prev1, const void * cur1, const void * next1, int s_stride,
++                                int w, int parity, int clip_max)
++{
++    // The asm works on 16 byte chunks and is only used when clip_max == 255.
++    // If w is not a multiple of 16 the asm may still do the whole line, by
++    // writing over the padding bytes, but only when w rounded up to 16 fits
++    // inside both strides (w1 <= stride); otherwise it would run into the
++    // next line, so the C version is used to clean up the tail instead.
++    const int w1 = FFALIGN(w, 16);
++    const int w0 = clip_max != 255 ? 0 :
++                   w1 <= d_stride && w1 <= s_stride ? w : w & ~15;
++
++    ff_bwdif_filter_line3_neon(dst1, d_stride,
++                               prev1, cur1, next1, s_stride,
++                               w0, parity, clip_max);
++
++    if (w0 < w)
++        ff_bwdif_filter_line3_c((char *)dst1 + w0, d_stride,
++                                (const char *)prev1 + w0, (const char *)cur1 + w0, (const char *)next1 + w0, s_stride,
++                                w - w0, parity, clip_max);
++}
++
++static void filter_line_helper(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int prefs3, int mrefs3, int prefs4, int mrefs4,
++                               int parity, int clip_max)
++{
++    // Columns given to the asm: whole 16 byte chunks, or none unless clip_max is 255
++    const int simd_w = (clip_max == 255) ? (w & ~15) : 0;
++    ff_bwdif_filter_line_neon(dst1, prev1, cur1, next1,
++                              simd_w, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
++    // Whatever is left over is finished by the C implementation
++    if (w > simd_w)
++        ff_bwdif_filter_line_c((char *)dst1 + simd_w, (char *)prev1 + simd_w, (char *)cur1 + simd_w, (char *)next1 + simd_w,
++                               w - simd_w, prefs, mrefs, prefs2, mrefs2, prefs3, mrefs3, prefs4, mrefs4, parity, clip_max);
++}
++
++static void filter_edge_helper(void *dst1, void *prev1, void *cur1, void *next1,
++                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                               int parity, int clip_max, int spat)
++{
++    // Columns given to the asm: whole 16 byte chunks, or none unless clip_max is 255
++    const int simd_w = (clip_max == 255) ? (w & ~15) : 0;
++    ff_bwdif_filter_edge_neon(dst1, prev1, cur1, next1, simd_w, prefs, mrefs, prefs2, mrefs2,
++                              parity, clip_max, spat);
++    // Whatever is left over is finished by the C implementation
++    if (w > simd_w)
++        ff_bwdif_filter_edge_c((char *)dst1 + simd_w, (char *)prev1 + simd_w, (char *)cur1 + simd_w, (char *)next1 + simd_w,
++                               w - simd_w, prefs, mrefs, prefs2, mrefs2,
++                               parity, clip_max, spat);
++}
++
++static void filter_intra_helper(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                                int prefs3, int mrefs3, int parity, int clip_max)
++{
++    // Columns given to the asm: whole 16 byte chunks, or none unless clip_max is 255
++    const int simd_w = (clip_max == 255) ? (w & ~15) : 0;
++    ff_bwdif_filter_intra_neon(dst1, cur1, simd_w, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
++    // Whatever is left over is finished by the C implementation
++    if (w > simd_w)
++        ff_bwdif_filter_intra_c((char *)dst1 + simd_w, (char *)cur1 + simd_w,
++                                w - simd_w, prefs, mrefs, prefs3, mrefs3, parity, clip_max);
++}
++
++void
++ff_bwdif_init_aarch64(BWDIFContext *s, int bit_depth)
++{
++    const int cpu_flags = av_get_cpu_flags();
++
++    // Install the NEON-backed helpers only for 8 bit content on a CPU that
++    // reports NEON; in every other case the function pointers already set
++    // in the context are left untouched.
++    if (bit_depth != 8 || !have_neon(cpu_flags))
++        return;
++
++    s->filter_intra = filter_intra_helper;
++    s->filter_line  = filter_line_helper;
++    s->filter_edge  = filter_edge_helper;
++    s->filter_line3 = filter_line3_helper;
++}
++
+diff --git a/libavfilter/aarch64/vf_bwdif_neon.S b/libavfilter/aarch64/vf_bwdif_neon.S
+new file mode 100644
+index 0000000000..ae9aab20cd
+--- /dev/null
++++ b/libavfilter/aarch64/vf_bwdif_neon.S
+@@ -0,0 +1,788 @@
++/*
++ * bwdif aarch64 NEON optimisations
++ *
++ * Copyright (c) 2023 John Cox <jc@kynesim.co.uk>
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++
++#include "libavutil/aarch64/asm.S"
++
++// Space taken on the stack by an int (32-bit)
++#ifdef __APPLE__
++.set    SP_INT, 4
++#else
++.set    SP_INT, 8
++#endif
++
++.macro SQSHRUNN b, s0, s1, s2, s3, n
++        sqshrun         \s0\().4h, \s0\().4s, #\n - 8       // s32 >> (n - 8), saturated to u16; s0 and s1 are overwritten
++        sqshrun2        \s0\().8h, \s1\().4s, #\n - 8
++        sqshrun         \s1\().4h, \s2\().4s, #\n - 8
++        sqshrun2        \s1\().8h, \s3\().4s, #\n - 8
++        uzp2            \b\().16b, \s0\().16b, \s1\().16b   // keep the high byte of each u16: b = 16 x u8 of (x >> n), clipped to 0..255
++.endm
++
++.macro SMULL4K a0, a1, a2, a3, s0, s1, k
++        smull           \a0\().4s, \s0\().4h, \k            // a0..a3 (16 x s32) = signed 16 bit lanes of s0, s1 times scalar k
++        smull2          \a1\().4s, \s0\().8h, \k
++        smull           \a2\().4s, \s1\().4h, \k
++        smull2          \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMULL4K a0, a1, a2, a3, s0, s1, k
++        umull           \a0\().4s, \s0\().4h, \k            // a0..a3 (16 x u32) = unsigned 16 bit lanes of s0, s1 times scalar k
++        umull2          \a1\().4s, \s0\().8h, \k
++        umull           \a2\().4s, \s1\().4h, \k
++        umull2          \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMLAL4K a0, a1, a2, a3, s0, s1, k
++        umlal           \a0\().4s, \s0\().4h, \k            // a0..a3 += unsigned 16 bit lanes of s0, s1 times scalar k
++        umlal2          \a1\().4s, \s0\().8h, \k
++        umlal           \a2\().4s, \s1\().4h, \k
++        umlal2          \a3\().4s, \s1\().8h, \k
++.endm
++
++.macro UMLSL4K a0, a1, a2, a3, s0, s1, k
++        umlsl           \a0\().4s, \s0\().4h, \k            // a0..a3 -= unsigned 16 bit lanes of s0, s1 times scalar k
++        umlsl2          \a1\().4s, \s0\().8h, \k
++        umlsl           \a2\().4s, \s1\().4h, \k
++        umlsl2          \a3\().4s, \s1\().8h, \k
++.endm
++
++//      int b = m2s1 - m1;
++//      int f = p2s1 - p1;
++//      int dc = c0s1 - m1;
++//      int de = c0s1 - p1;
++//      int sp_max = FFMIN(p1 - c0s1, m1 - c0s1);
++//      sp_max = FFMIN(sp_max, FFMAX(-b,-f));
++//      int sp_min = FFMIN(c0s1 - p1, c0s1 - m1);
++//      sp_min = FFMIN(sp_min, FFMAX(b,f));
++//      diff = diff == 0 ? 0 : FFMAX3(diff, sp_min, sp_max);
++.macro SPAT_CHECK diff, m2s1, m1, c0s1, p1, p2s1, t0, t1, t2, t3
++        uqsub           \t0\().16b, \p1\().16b, \c0s1\().16b    // p1 - c0s1, saturating at 0
++        uqsub           \t2\().16b, \m1\().16b, \c0s1\().16b    // m1 - c0s1, saturating at 0
++        umin            \t2\().16b, \t0\().16b, \t2\().16b
++
++        uqsub           \t1\().16b, \m1\().16b, \m2s1\().16b    // -b, saturating at 0
++        uqsub           \t3\().16b, \p1\().16b, \p2s1\().16b    // -f, saturating at 0
++        umax            \t3\().16b, \t3\().16b, \t1\().16b
++        umin            \t3\().16b, \t3\().16b, \t2\().16b      // t3 = sp_max (never below 0)
++
++        uqsub           \t0\().16b, \c0s1\().16b, \p1\().16b    // c0s1 - p1, saturating at 0
++        uqsub           \t2\().16b, \c0s1\().16b, \m1\().16b    // c0s1 - m1, saturating at 0
++        umin            \t2\().16b, \t0\().16b, \t2\().16b
++
++        uqsub           \t1\().16b, \m2s1\().16b, \m1\().16b    // b, saturating at 0
++        uqsub           \t0\().16b, \p2s1\().16b, \p1\().16b    // f, saturating at 0
++        umax            \t0\().16b, \t0\().16b, \t1\().16b
++        umin            \t2\().16b, \t2\().16b, \t0\().16b      // t2 = sp_min (never below 0)
++
++        cmeq            \t1\().16b, \diff\().16b, #0            // t1 = mask of lanes where diff was 0
++        umax            \diff\().16b, \diff\().16b, \t3\().16b
++        umax            \diff\().16b, \diff\().16b, \t2\().16b
++        bic             \diff\().16b, \diff\().16b, \t1\().16b  // lanes that started at 0 stay 0
++.endm
++
++//      i0 = s0;
++//      if (i0 > d0 + diff0)
++//          i0 = d0 + diff0;
++//      else if (i0 < d0 - diff0)
++//          i0 = d0 - diff0;
++//
++// i0 = s0 is safe
++.macro DIFF_CLIP i0, s0, d0, diff, t0, t1
++        uqadd           \t0\().16b, \d0\().16b, \diff\().16b    // upper bound: d0 + diff, saturating at 255
++        uqsub           \t1\().16b, \d0\().16b, \diff\().16b    // lower bound: d0 - diff, saturating at 0
++        umin            \i0\().16b, \s0\().16b, \t0\().16b
++        umax            \i0\().16b, \i0\().16b, \t1\().16b
++.endm
++
++//      i0 = FFABS(m1 - p1) > td0 ? i1 : i2;
++//      DIFF_CLIP
++//
++// i0 = i1 is safe
++.macro INTERPOL i0, i1, i2, m1, d0, p1, td0, diff, t0, t1, t2
++        uabd            \t0\().16b, \m1\().16b, \p1\().16b      // FFABS(m1 - p1)
++        cmhi            \t0\().16b, \t0\().16b, \td0\().16b     // per lane mask: FFABS(m1 - p1) > td0
++        bsl             \t0\().16b, \i1\().16b, \i2\().16b      // t0 = i1 where the mask is set, else i2
++        DIFF_CLIP       \i0, \t0, \d0, \diff, \t1, \t2
++.endm
++
++.macro PUSH_VREGS
++        stp             d8,  d9,  [sp, #-64]!           // reserve 64 bytes of stack and save d8-d15 (must be paired with POP_VREGS)
++        stp             d10, d11, [sp, #16]
++        stp             d12, d13, [sp, #32]
++        stp             d14, d15, [sp, #48]
++.endm
++
++.macro POP_VREGS
++        ldp             d14, d15, [sp, #48]             // restore d8-d15 from the area written by PUSH_VREGS
++        ldp             d12, d13, [sp, #32]
++        ldp             d10, d11, [sp, #16]
++        ldp             d8,  d9,  [sp], #64             // last load also releases the 64 byte area
++.endm
++
++.macro LDR_COEFFS d, t0
++        movrel          \t0, coeffs, 0                  // t0 = address of the coeffs table (t0 is clobbered)
++        ld1             {\d\().8h}, [\t0]               // d = the 8 halfword coefficients
++.endm
++
++// static const uint16_t coef_lf[2] = { 4309, 213 };
++// static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
++// static const uint16_t coef_sp[2] = { 5077, 981 };
++
++const coeffs, align=4   // align 4 means align on a 2^4 byte boundary
++        .hword          4309 * 4, 213 * 4               // lf[0]*4 = v0.h[0], lf[1]*4 = v0.h[1]
++        .hword          5570, 3801, 1016, -3801         // hf[0] = v0.h[2], hf[1] = v0.h[3], hf[2] = v0.h[4], -hf[1] = v0.h[5]
++        .hword          5077, 981                       // sp[0] = v0.h[6], sp[1] = v0.h[7]
++endconst
++
++// ===========================================================================
++//
++// void ff_bwdif_filter_line3_neon(
++//         void * dst1,         // x0
++//         int d_stride,        // w1
++//         const void * prev1,  // x2
++//         const void * cur1,   // x3
++//         const void * next1,  // x4
++//         int s_stride,        // w5
++//         int w,               // w6
++//         int parity,          // w7
++//         int clip_max);       // [sp, #0] (Ignored)
++
++function ff_bwdif_filter_line3_neon, export=1
++        // Sanity check w
++        cmp             w6, #0
++        ble             99f                             // w <= 0: nothing to do
++
++        LDR_COEFFS      v0, x17
++
++// #define prev2 cur
++//        const uint8_t * restrict next2 = parity ? prev : next;
++        cmp             w7, #0
++        csel            x17, x2, x4, ne                 // x17 = next2
++
++        // We want all the V registers - save all the ones we must
++        PUSH_VREGS
++
++        // Some rearrangement of initial values for nice layout of refs in regs
++        mov             w10, w6                         // w10 = loop count
++        neg             w9,  w5                         // w9  = mref
++        lsl             w8,  w9,  #1                    // w8 =  mref2
++        add             w7,  w9,  w9, LSL #1            // w7  = mref3
++        lsl             w6,  w9,  #2                    // w6  = mref4
++        mov             w11, w5                         // w11 = pref
++        lsl             w12, w5,  #1                    // w12 = pref2
++        add             w13, w5,  w5, LSL #1            // w13 = pref3
++        lsl             w14, w5,  #2                    // w14 = pref4
++        add             w15, w5,  w5, LSL #2            // w15 = pref5
++        add             w16, w14, w12                   // w16 = pref6
++
++        lsl             w5,  w1,  #1                    // w5 = d_stride * 2
++
++//         for (x = 0; x < w; x++) {
++//             int diff0, diff2;
++//             int d0, d2;
++//             int temporal_diff0, temporal_diff2;
++//
++//             int i1, i2;
++//             int j1, j2;
++//             int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
++
++10:
++//             c0 = prev2[0] + next2[0];                // c0 = v20, v21
++//             d0  = c0 >> 1;                           // d0 = v10
++//             temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
++        ldr             q31, [x3]
++        ldr             q21, [x17]
++        uhadd           v10.16b, v31.16b, v21.16b
++        uabd            v11.16b, v31.16b, v21.16b
++        uaddl           v20.8h,  v21.8b,  v31.8b
++        uaddl2          v21.8h,  v21.16b, v31.16b
++
++        ldr             q31, [x3, w6, sxtw]
++        ldr             q23, [x17, w6, sxtw]
++
++//             i1 = coef_hf[0] * c0;                    // i1 = v2-v5
++        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]
++
++        ldr             q30, [x3, w14, sxtw]
++        ldr             q25, [x17, w14, sxtw]
++
++//             m4 = prev2[mrefs4] + next2[mrefs4];      // m4 = v22,v23
++        uaddl           v22.8h,  v23.8b,  v31.8b
++        uaddl2          v23.8h,  v23.16b, v31.16b
++
++//             p4 = prev2[prefs4] + next2[prefs4];      // p4 = v24,v25, (p4 >> 1) = v12
++        uhadd           v12.16b, v25.16b, v30.16b
++        uaddl           v24.8h,  v25.8b,  v30.8b
++        uaddl2          v25.8h,  v25.16b, v30.16b
++
++//             j1 = -coef_hf[1] * (c0 + p4);            // j1 = v6-v9  (-c0:v20,v21)
++        add             v20.8h,  v20.8h,  v24.8h
++        add             v21.8h,  v21.8h,  v25.8h
++        SMULL4K         v6, v7, v8, v9, v20, v21, v0.h[5]
++
++//             m3 = cur[mrefs3];                        // m3 = v20
++        ldr             q20, [x3, w7, sxtw]
++
++//             p3 = cur[prefs3];                        // p3 = v21
++        ldr             q21, [x3, w13, sxtw]
++
++//             i1 += coef_hf[2] * (m4 + p4);            // (-m4:v22,v23) (-p4:v24,v25)
++        add             v22.8h,  v22.8h,  v24.8h
++        add             v23.8h,  v23.8h,  v25.8h
++        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]
++
++        ldr             q29, [x3, w8, sxtw]
++        ldr             q23, [x17, w8, sxtw]
++
++//             i1 -= coef_lf[1] * 4 * (m3 + p3);        // -
++        uaddl           v30.8h,  v20.8b,  v21.8b
++        uaddl2          v31.8h,  v20.16b, v21.16b
++
++        ldr             q28, [x3, w16, sxtw]
++        ldr             q25, [x17, w16, sxtw]
++
++        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]
++
++//             m2 = prev2[mrefs2] + next2[mrefs2];      // m2 = v22,v23, (m2 >> 1) = v13
++        uhadd           v13.16b, v23.16b, v29.16b
++        uaddl           v22.8h,  v23.8b,  v29.8b
++        uaddl2          v23.8h,  v23.16b, v29.16b
++
++        ldr             q31, [x3, w12, sxtw]
++        ldr             q27, [x17, w12, sxtw]
++
++//             p6 = prev2[prefs6] + next2[prefs6];      // p6 = v24,v25
++        uaddl           v24.8h,  v25.8b,  v28.8b
++        uaddl2          v25.8h,  v25.16b, v28.16b
++
++//             j1 += coef_hf[2] * (m2 + p6);            // (-p6:v24,v25)
++        add             v24.8h,  v24.8h,  v22.8h
++        add             v25.8h,  v25.8h,  v23.8h
++        UMLAL4K         v6, v7, v8, v9, v24, v25, v0.h[4]
++
++//             m1 = cur[mrefs];                         // m1 = v24
++        ldr             q24, [x3, w9, sxtw]
++
++//             p5 = cur[prefs5];                        // p5 = v25
++        ldr             q25, [x3, w15, sxtw]
++
++//             p2 = prev2[prefs2] + next2[prefs2];      // p2 = v26, v27
++//             temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
++//             d2  = p2 >> 1;                           // d2 = v15
++        uabd            v14.16b, v31.16b, v27.16b
++        uhadd           v15.16b, v31.16b, v27.16b
++        uaddl           v26.8h,  v27.8b,  v31.8b
++        uaddl2          v27.8h,  v27.16b, v31.16b
++
++//             j1 += coef_hf[0] * p2;                   // -
++        UMLAL4K         v6, v7, v8, v9, v26, v27, v0.h[2]
++
++//             i1 -= coef_hf[1] * (m2 + p2);            // (-m2:v22,v23*) (-p2:v26*,v27*)
++        add             v22.8h,  v22.8h,  v26.8h
++        add             v23.8h,  v23.8h,  v27.8h
++        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]
++
++//             p1 = cur[prefs];                         // p1 = v22
++        ldr             q22, [x3, w11, sxtw]
++
++//             j1 -= coef_lf[1] * 4 * (m1 + p5);        // -
++        uaddl           v26.8h,  v24.8b,  v25.8b
++        uaddl2          v27.8h,  v24.16b, v25.16b
++        UMLSL4K         v6, v7, v8, v9, v26, v27, v0.h[1]
++
++//             j2 = (coef_sp[0] * (p1 + p3) - coef_sp[1]  * (m1 + p5)) >> 13; // (-p5:v25*) j2=v16
++        uaddl           v18.8h,  v22.8b,  v21.8b
++        uaddl2          v19.8h,  v22.16b, v21.16b
++        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]
++
++        uaddl           v18.8h,  v24.8b,  v25.8b
++        uaddl2          v19.8h,  v24.16b, v25.16b
++        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]
++
++        SQSHRUNN        v16, v28, v29, v30, v31, 13
++
++//             i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
++        uaddl           v18.8h,  v22.8b,  v24.8b
++        uaddl2          v19.8h,  v22.16b, v24.16b
++        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]
++
++        uaddl           v18.8h,  v20.8b,  v21.8b
++        uaddl2          v19.8h,  v20.16b, v21.16b
++        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]
++
++        SQSHRUNN        v17, v28, v29, v30, v31, 13
++
++//             i1 += coef_lf[0] * 4 * (m1 + p1);        // p1 = v22, m1 = v24
++        uaddl           v26.8h,  v24.8b,  v22.8b
++        uaddl2          v27.8h,  v24.16b, v22.16b
++        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]
++
++        ldr             q31, [x2, w9, sxtw]
++        ldr             q29, [x4, w9, sxtw]
++
++//             j1 += coef_lf[0] * 4 * (p1 + p3);        // p1 = v22, p3 = v21
++        uaddl           v26.8h,  v21.8b,  v22.8b
++        uaddl2          v27.8h,  v21.16b, v22.16b
++        UMLAL4K         v6, v7, v8, v9, v26, v27, v0.h[0]
++
++        ldr             q30, [x2, w11, sxtw]
++        ldr             q28, [x4, w11, sxtw]
++
++//             i1 >>= 15;                               // i1 = v2, -v3, -v4*, -v5*
++        SQSHRUNN        v2, v2, v3, v4, v5, 15
++
++//             j1 >>= 15;                               // j1 = v3, -v6*, -v7*, -v8*, -v9*
++        SQSHRUNN        v3, v6, v7, v8, v9, 15
++
++//             {
++//                 int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++//                 int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++        uabd            v30.16b, v22.16b, v30.16b
++        uabd            v31.16b, v24.16b, v31.16b
++        uabd            v28.16b, v22.16b, v28.16b
++        uabd            v29.16b, v24.16b, v29.16b
++        uhadd           v31.16b, v31.16b, v30.16b
++        uhadd           v29.16b, v29.16b, v28.16b
++
++        ldr             q27, [x2, w13, sxtw]
++        ldr             q26, [x4, w13, sxtw]
++
++//                 diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
++        ushr            v18.16b, v11.16b, #1
++        umax            v18.16b, v18.16b, v31.16b
++        umax            v18.16b, v18.16b, v29.16b
++//             }                                        // v28, v30 preserved for next block
++//             {  // tdiff2 = v14
++//                 int t1 =(FFABS(prev[prefs] - p1) + FFABS(prev[prefs3] - p3)) >> 1;
++//                 int t2 =(FFABS(next[prefs] - p1) + FFABS(next[prefs3] - p3)) >> 1;
++        uabd            v31.16b, v21.16b, v27.16b
++        uabd            v29.16b, v21.16b, v26.16b
++        uhadd           v31.16b, v31.16b, v30.16b
++        uhadd           v29.16b, v29.16b, v28.16b
++
++//                 diff2 = FFMAX3(temporal_diff2 >> 1, t1, t2); // diff2=v19
++        ushr            v19.16b, v14.16b, #1
++        umax            v19.16b, v19.16b, v31.16b
++        umax            v19.16b, v19.16b, v29.16b
++//             }
++
++        // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
++        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
++
++        //  diff2 = v19, d0 = v10, p1 = v22, d2 = v15, p3 = v21, (p4 >> 1) = v12
++        SPAT_CHECK      v19, v10, v22, v15, v21, v12, v31, v30, v29, v28
++
++        // j1 = v3, j2 = v16, p1 = v22, d2 = v15, p3 = v21, td2 = v14, diff2 = v19
++        INTERPOL        v3, v3, v16, v22, v15, v21, v14, v19, v31, v30, v29
++
++//                 dst[d_stride * 2] = av_clip_uint8(interpol);
++        str             q3,  [x0, w5, sxtw]
++
++//             dst[d_stride] = p1;
++        str             q22, [x0, w1, sxtw]
++
++        // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td0 = v11, diff0 = v18
++        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
++
++//                 dst[0] = av_clip_uint8(interpol);
++        str             q2,  [x0], #16
++//             }
++//
++//             dst++;
++//             cur++;
++//             prev++;
++//             prev2++;
++//             next++;
++//         }
++        subs            w10, w10, #16                   // 16 columns are produced per iteration
++        add             x2,  x2,  #16
++        add             x3,  x3,  #16
++        add             x4,  x4,  #16
++        add             x17, x17, #16
++        bgt             10b                             // loop while columns remain
++
++        POP_VREGS
++99:
++        ret
++endfunc
++
++// ===========================================================================
++//
++// void filter_line(
++//      void *dst1,     // x0
++//      void *prev1,    // x1
++//      void *cur1,     // x2
++//      void *next1,    // x3
++//      int w,          // w4
++//      int prefs,      // w5
++//      int mrefs,      // w6
++//      int prefs2,     // w7
++//      int mrefs2,     // [sp, #0]
++//      int prefs3,     // [sp, #SP_INT]
++//      int mrefs3,     // [sp, #SP_INT*2]
++//      int prefs4,     // [sp, #SP_INT*3]
++//      int mrefs4,     // [sp, #SP_INT*4]
++//      int parity,     // [sp, #SP_INT*5]
++//      int clip_max)   // [sp, #SP_INT*6]
++
++function ff_bwdif_filter_line_neon, export=1
++        // Sanity check w
++        cmp             w4, #0
++        ble             99f
++
++        // Rearrange regs to be the same as line3 for ease of debug!
++        mov             w10, w4                         // w10 = loop count
++        mov             w9,  w6                         // w9  = mref
++        mov             w12, w7                         // w12 = pref2
++        mov             w11, w5                         // w11 = pref
++        ldr             w8,  [sp, #0]                   // w8 =  mref2
++        ldr             w7,  [sp, #SP_INT*2]            // w7  = mref3
++        ldr             w6,  [sp, #SP_INT*4]            // w6  = mref4
++        ldr             w13, [sp, #SP_INT]              // w13 = pref3
++        ldr             w14, [sp, #SP_INT*3]            // w14 = pref4
++
++        mov             x4,  x3
++        mov             x3,  x2
++        mov             x2,  x1
++
++        LDR_COEFFS      v0, x17
++
++// #define prev2 cur
++//        const uint8_t * restrict next2 = parity ? prev : next;
++        ldr             w17, [sp, #SP_INT*5]            // parity
++        cmp             w17, #0
++        csel            x17, x2, x4, ne
++
++        PUSH_VREGS
++
++//         for (x = 0; x < w; x++) {
++//             int diff0, diff2;
++//             int d0, d2;
++//             int temporal_diff0, temporal_diff2;
++//
++//             int i1, i2;
++//             int j1, j2;
++//             int p6, p5, p4, p3, p2, p1, c0, m1, m2, m3, m4;
++
++10:
++//             c0 = prev2[0] + next2[0];            // c0 = v20, v21
++//             d0  = c0 >> 1;                       // d0 = v10
++//             temporal_diff0 = FFABS(prev2[0] - next2[0]); // td0 = v11
++        ldr             q31, [x3]
++        ldr             q21, [x17]
++        uhadd           v10.16b, v31.16b, v21.16b
++        uabd            v11.16b, v31.16b, v21.16b
++        uaddl           v20.8h,  v21.8b,  v31.8b
++        uaddl2          v21.8h,  v21.16b, v31.16b
++
++        ldr             q31, [x3, w6, sxtw]
++        ldr             q23, [x17, w6, sxtw]
++
++//             i1 = coef_hf[0] * c0;                // i1 = v2-v5
++        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[2]
++
++        ldr             q30, [x3, w14, sxtw]
++        ldr             q25, [x17, w14, sxtw]
++
++//             m4 = prev2[mrefs4] + next2[mrefs4];  // m4 = v22,v23
++        uaddl           v22.8h,  v23.8b,  v31.8b
++        uaddl2          v23.8h,  v23.16b, v31.16b
++
++//             p4 = prev2[prefs4] + next2[prefs4];  // p4 = v24,v25, (p4 >> 1) = v12
++        uhadd           v12.16b, v25.16b, v30.16b
++        uaddl           v24.8h,  v25.8b,  v30.8b
++        uaddl2          v25.8h,  v25.16b, v30.16b
++
++//             m3 = cur[mrefs3];                    // m3 = v20
++        ldr             q20, [x3, w7, sxtw]
++
++//             p3 = cur[prefs3];                    // p3 = v21
++        ldr             q21, [x3, w13, sxtw]
++
++//             i1 += coef_hf[2] * (m4 + p4);        // (-m4:v22,v23) (-p4:v24,v25)
++        add             v22.8h,  v22.8h,  v24.8h
++        add             v23.8h,  v23.8h,  v25.8h
++        UMLAL4K         v2, v3, v4, v5, v22, v23, v0.h[4]
++
++        ldr             q29, [x3, w8, sxtw]
++        ldr             q23, [x17, w8, sxtw]
++
++//             i1 -= coef_lf[1] * 4 * (m3 + p3);    // -
++        uaddl           v30.8h,  v20.8b,  v21.8b
++        uaddl2          v31.8h,  v20.16b, v21.16b
++
++        UMLSL4K         v2, v3, v4, v5, v30, v31, v0.h[1]
++
++        ldr             q31, [x3, w12, sxtw]
++        ldr             q27, [x17, w12, sxtw]
++
++//             m2 = prev2[mrefs2] + next2[mrefs2];  // m2 = v22,v23, (m2 >> 1) = v13
++        uhadd           v13.16b, v23.16b, v29.16b
++        uaddl           v22.8h,  v23.8b,  v29.8b
++        uaddl2          v23.8h,  v23.16b, v29.16b
++
++//             m1 = cur[mrefs];                     // m1 = v24
++        ldr             q24, [x3, w9, sxtw]
++
++//             p2 = prev2[prefs2] + next2[prefs2];  // p2 = v26, v27
++//             temporal_diff2 = FFABS(prev2[prefs2] - next2[prefs2]); // td2 = v14
++//             d2  = p2 >> 1;                       // d2 = v15
++        uabd            v14.16b, v31.16b, v27.16b
++        uhadd           v15.16b, v31.16b, v27.16b
++        uaddl           v26.8h,  v27.8b,  v31.8b
++        uaddl2          v27.8h,  v27.16b, v31.16b
++
++//             i1 -= coef_hf[1] * (m2 + p2);        // (-m2:v22,v23*) (-p2:v26*,v27*)
++        add             v22.8h,  v22.8h,  v26.8h
++        add             v23.8h,  v23.8h,  v27.8h
++        UMLSL4K         v2, v3, v4, v5, v22, v23, v0.h[3]
++
++//             p1 = cur[prefs];                     // p1 = v22
++        ldr             q22, [x3, w11, sxtw]
++
++//             i2 = (coef_sp[0] * (m1 + p1) - coef_sp[1] * (m3 + p3)) >> 13; // (-m3:v20*) i2=v17
++        uaddl           v18.8h,  v22.8b,  v24.8b
++        uaddl2          v19.8h,  v22.16b, v24.16b
++        UMULL4K         v28, v29, v30, v31, v18, v19, v0.h[6]
++
++        uaddl           v18.8h,  v20.8b,  v21.8b
++        uaddl2          v19.8h,  v20.16b, v21.16b
++        UMLSL4K         v28, v29, v30, v31, v18, v19, v0.h[7]
++
++        SQSHRUNN        v17, v28, v29, v30, v31, 13
++
++//             i1 += coef_lf[0] * 4 * (m1 + p1);    // p1 = v22, m1 = v24
++        uaddl           v26.8h,  v24.8b,  v22.8b
++        uaddl2          v27.8h,  v24.16b, v22.16b
++        UMLAL4K         v2, v3, v4, v5, v26, v27, v0.h[0]
++
++        ldr             q31, [x2, w9, sxtw]
++        ldr             q29, [x4, w9, sxtw]
++
++        ldr             q30, [x2, w11, sxtw]
++        ldr             q28, [x4, w11, sxtw]
++
++//             i1 >>= 15;                            // i1 = v2, -v3, -v4*, -v5*
++        SQSHRUNN        v2, v2, v3, v4, v5, 15
++
++//             {
++//                 int t1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++//                 int t2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++        uabd            v30.16b, v22.16b, v30.16b
++        uabd            v31.16b, v24.16b, v31.16b
++        uabd            v28.16b, v22.16b, v28.16b
++        uabd            v29.16b, v24.16b, v29.16b
++        uhadd           v31.16b, v31.16b, v30.16b
++        uhadd           v29.16b, v29.16b, v28.16b
++
++//                 diff0 = FFMAX3(temporal_diff0 >> 1, t1, t2); // diff0=v18
++        ushr            v18.16b, v11.16b, #1
++        umax            v18.16b, v18.16b, v31.16b
++        umax            v18.16b, v18.16b, v29.16b
++
++        // diff0 = v18, (m2 >> 1) = v13, m1 = v24, d0 = v10, p1 = v22, d2 = v15
++        SPAT_CHECK      v18, v13, v24, v10, v22, v15, v31, v30, v29, v28
++
++        // i1 = v2, i2 = v17, m1 = v24, d0 = v10, p1 = v22, td2 = v11, diff2 = v18
++        INTERPOL        v2, v2, v17, v24, v10, v22, v11, v18, v31, v30, v29
++
++//                 dst[0] = av_clip_uint8(interpol);
++        str             q2,  [x0], #16
++//             }
++//
++//             dst++;
++//             cur++;
++//             prev++;
++//             prev2++;
++//             next++;
++//         }
++
++        subs            w10, w10, #16
++        add             x2,  x2,  #16
++        add             x3,  x3,  #16
++        add             x4,  x4,  #16
++        add             x17, x17, #16
++        bgt             10b
++
++        POP_VREGS
++99:
++        ret
++endfunc
++
++// ============================================================================
++//
++// void ff_bwdif_filter_edge_neon(
++//      void *dst1,     // x0
++//      void *prev1,    // x1
++//      void *cur1,     // x2
++//      void *next1,    // x3
++//      int w,          // w4
++//      int prefs,      // w5
++//      int mrefs,      // w6
++//      int prefs2,     // w7
++//      int mrefs2,     // [sp, #0]
++//      int parity,     // [sp, #SP_INT]
++//      int clip_max,   // [sp, #SP_INT*2]  unused
++//      int spat);      // [sp, #SP_INT*3]
++
++function ff_bwdif_filter_edge_neon, export=1
++        // Sanity check w
++        cmp             w4, #0
++        ble             99f                             // w <= 0: nothing to do, return
++
++// #define prev2 cur
++//     const uint8_t * restrict next2 = parity ? prev : next;
++
++        ldr             w8,  [sp, #0]                   // mrefs2
++
++        ldr             w17, [sp, #SP_INT]              // parity
++        ldr             w16, [sp, #SP_INT*3]            // spat
++        cmp             w17, #0
++        csel            x17, x1, x3, ne                 // x17 = next2 = parity ? prev : next
++
++//     for (x = 0; x < w; x++) {
++// Each pass below handles 16 bytes; a w that is not a multiple of 16 is in effect rounded up.
++10:
++//        int m1 = cur[mrefs];
++//        int d = (prev2[0] + next2[0]) >> 1;
++//        int p1 = cur[prefs];
++//        int temporal_diff0 = FFABS(prev2[0] - next2[0]);
++//        int temporal_diff1 =(FFABS(prev[mrefs] - m1) + FFABS(prev[prefs] - p1)) >> 1;
++//        int temporal_diff2 =(FFABS(next[mrefs] - m1) + FFABS(next[prefs] - p1)) >> 1;
++//        int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
++        ldr             q31, [x2]                       // prev2[0] (prev2 is cur)
++        ldr             q21, [x17]                      // next2[0]
++        uhadd           v16.16b, v31.16b, v21.16b       // d0 = v16
++        uabd            v17.16b, v31.16b, v21.16b       // td0 = v17
++        ldr             q24, [x2, w6, sxtw]             // m1 = v24
++        ldr             q22, [x2, w5, sxtw]             // p1 = v22
++
++        ldr             q0,  [x1, w6, sxtw]             // prev[mrefs]
++        ldr             q2,  [x1, w5, sxtw]             // prev[prefs]
++        ldr             q1,  [x3, w6, sxtw]             // next[mrefs]
++        ldr             q3,  [x3, w5, sxtw]             // next[prefs]
++
++        ushr            v29.16b, v17.16b, #1            // v29 = td0 >> 1
++
++        uabd            v31.16b, v0.16b,  v24.16b       // |prev[mrefs] - m1|
++        uabd            v30.16b, v2.16b,  v22.16b       // |prev[prefs] - p1|
++        uhadd           v0.16b,  v31.16b, v30.16b       // td1 = q0
++
++        uabd            v31.16b, v1.16b,  v24.16b       // |next[mrefs] - m1|
++        uabd            v30.16b, v3.16b,  v22.16b       // |next[prefs] - p1|
++        uhadd           v1.16b,  v31.16b, v30.16b       // td2 = q1
++
++        umax            v0.16b,  v0.16b,  v29.16b       // max(td1, td0 >> 1)
++        umax            v0.16b,  v0.16b,  v1.16b        // diff = v0
++
++//        if (spat) {
++//            SPAT_CHECK()
++//        }
++//        i0 = (m1 + p1) >> 1;
++        cbz             w16, 1f                         // spat == 0: skip the spatial check
++
++        ldr             q31, [x2,  w8, sxtw]            // cur[mrefs2]
++        ldr             q18, [x17, w8, sxtw]            // next2[mrefs2]
++        ldr             q30, [x2,  w7, sxtw]            // cur[prefs2]
++        ldr             q19, [x17, w7, sxtw]            // next2[prefs2]
++        uhadd           v18.16b, v18.16b, v31.16b       // v18 = (cur[mrefs2] + next2[mrefs2]) >> 1
++        uhadd           v19.16b, v19.16b, v30.16b       // v19 = (cur[prefs2] + next2[prefs2]) >> 1
++
++        SPAT_CHECK      v0, v18, v24, v16, v22, v19, v31, v30, v29, v28
++
++1:
++        uhadd           v2.16b,  v22.16b, v24.16b       // i0 = v2 = (p1 + m1) >> 1
++
++        // i0 = v2, s0 = v2, d0 = v16, diff = v0, t0 = v31, t1 = v30
++        DIFF_CLIP       v2, v2, v16, v0, v31, v30
++
++//        dst[0] = av_clip(interpol, 0, clip_max);
++        str             q2, [x0], #16                   // store 16 bytes, dst += 16
++
++//        dst++;
++//        cur++;
++//    }
++        subs            w4,  w4,  #16                   // 16 bytes consumed; sets the flags for bgt
++        add             x1,  x1,  #16
++        add             x2,  x2,  #16
++        add             x3,  x3,  #16
++        add             x17, x17, #16
++        bgt             10b                             // loop while the remaining w is > 0
++
++99:
++        ret
++endfunc
++
++// ============================================================================
++//
++// void ff_bwdif_filter_intra_neon(
++//      void *dst1,     // x0
++//      void *cur1,     // x1
++//      int w,          // w2
++//      int prefs,      // w3
++//      int mrefs,      // w4
++//      int prefs3,     // w5
++//      int mrefs3,     // w6
++//      int parity,     // w7       unused
++//      int clip_max)   // [sp, #0] unused
++
++function ff_bwdif_filter_intra_neon, export=1
++        cmp             w2, #0
++        ble             99f                             // w <= 0: nothing to do, return
++
++        LDR_COEFFS      v0, x17
++
++//    for (x = 0; x < w; x++) {
++10:
++// Each pass below handles 16 bytes; a w that is not a multiple of 16 is in effect rounded up.
++//        interpol = (coef_sp[0] * (cur[mrefs] + cur[prefs]) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13;
++        ldr             q31, [x1, w4, sxtw]             // cur[mrefs]
++        ldr             q30, [x1, w3, sxtw]             // cur[prefs]
++        ldr             q29, [x1, w6, sxtw]             // cur[mrefs3]
++        ldr             q28, [x1, w5, sxtw]             // cur[prefs3]
++
++        uaddl           v20.8h,  v31.8b,  v30.8b        // cur[mrefs] + cur[prefs], widened to 16 bits
++        uaddl2          v21.8h,  v31.16b, v30.16b       // same for the upper 8 bytes
++// NOTE(review): v0.h[6] / v0.h[7] presumably hold coef_sp[0] / coef_sp[1] (set up by LDR_COEFFS) - confirm
++        UMULL4K         v2, v3, v4, v5, v20, v21, v0.h[6]
++
++        uaddl           v20.8h,  v29.8b,  v28.8b        // cur[mrefs3] + cur[prefs3], widened to 16 bits
++        uaddl2          v21.8h,  v29.16b, v28.16b       // same for the upper 8 bytes
++
++        UMLSL4K         v2, v3, v4, v5, v20, v21, v0.h[7]
++
++//        dst[0] = av_clip(interpol, 0, clip_max);
++        SQSHRUNN        v2, v2, v3, v4, v5, 13
++        str             q2, [x0], #16                   // store 16 bytes, dst += 16
++
++//        dst++;
++//        cur++;
++//    }
++
++        subs            w2,  w2,  #16                   // 16 bytes consumed; sets the flags for bgt
++        add             x1,  x1,  #16
++        bgt             10b                             // loop while the remaining w is > 0
++
++99:
++        ret
++endfunc
+diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
+index 0872c6e0f2..1dd05e4d75 100644
+--- a/libavfilter/allfilters.c
++++ b/libavfilter/allfilters.c
+@@ -218,6 +218,7 @@ extern AVFilter ff_vf_dedot;
+ extern AVFilter ff_vf_deflate;
+ extern AVFilter ff_vf_deflicker;
+ extern AVFilter ff_vf_deinterlace_qsv;
++extern AVFilter ff_vf_deinterlace_v4l2m2m;
+ extern AVFilter ff_vf_deinterlace_vaapi;
+ extern AVFilter ff_vf_dejudder;
+ extern AVFilter ff_vf_delogo;
+@@ -377,6 +378,7 @@ extern AVFilter ff_vf_scale;
+ extern AVFilter ff_vf_scale_cuda;
+ extern AVFilter ff_vf_scale_npp;
+ extern AVFilter ff_vf_scale_qsv;
++extern AVFilter ff_vf_scale_v4l2m2m;
+ extern AVFilter ff_vf_scale_vaapi;
+ extern AVFilter ff_vf_scale_vulkan;
+ extern AVFilter ff_vf_scale2ref;
+@@ -438,6 +440,7 @@ extern AVFilter ff_vf_transpose_opencl;
+ extern AVFilter ff_vf_transpose_vaapi;
+ extern AVFilter ff_vf_trim;
+ extern AVFilter ff_vf_unpremultiply;
++extern AVFilter ff_vf_unsand;
+ extern AVFilter ff_vf_unsharp;
+ extern AVFilter ff_vf_unsharp_opencl;
+ extern AVFilter ff_vf_untile;
+diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c
+index f6b572b3de..44fe8b679c 100644
+--- a/libavfilter/avfiltergraph.c
++++ b/libavfilter/avfiltergraph.c
+@@ -32,6 +32,9 @@
+ #include "libavutil/internal.h"
+ #include "libavutil/opt.h"
+ #include "libavutil/pixdesc.h"
++#if CONFIG_UNSAND_FILTER
++#include "libavutil/rpi_sand_fns.h"
++#endif
+ 
+ #define FF_INTERNAL_FIELDS 1
+ #include "framequeue.h"
+@@ -422,6 +425,19 @@ static int formats_declared(AVFilterContext *f)
+     return 1;
+ }
+ 
++#if CONFIG_UNSAND_FILTER
++/* Return 1 if any entry of the format list is a sand format, otherwise 0. */
++static int has_sand_format(const AVFilterFormats * const ff)
++{
++    unsigned int n = 0;
++    int found = 0;
++
++    while (!found && n != ff->nb_formats)
++        found = av_rpi_is_sand_format(ff->formats[n++]) != 0;
++    return found;
++}
++#endif
++
+ /**
+  * Perform one round of query_formats() and merging formats lists on the
+  * filter graph.
+@@ -462,6 +478,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+         for (j = 0; j < filter->nb_inputs; j++) {
+             AVFilterLink *link = filter->inputs[j];
+             int convert_needed = 0;
++            unsigned int extra_convert_tried = 0;
+ 
+             if (!link)
+                 continue;
+@@ -504,11 +521,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+                            link->outcfg.formats, link->type)
+ #undef MERGE_DISPATCH
+ 
+-            if (convert_needed) {
++            while (convert_needed) {
+                 AVFilterContext *convert;
+                 const AVFilter *filter;
+                 AVFilterLink *inlink, *outlink;
+                 char inst_name[30];
++                int can_retry = 0;
++
++                convert_needed = 0;
+ 
+                 if (graph->disable_auto_convert) {
+                     av_log(log_ctx, AV_LOG_ERROR,
+@@ -521,19 +541,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+                 /* couldn't merge format lists. auto-insert conversion filter */
+                 switch (link->type) {
+                 case AVMEDIA_TYPE_VIDEO:
+-                    if (!(filter = avfilter_get_by_name("scale"))) {
+-                        av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
+-                               "not present, cannot convert pixel formats.\n");
+-                        return AVERROR(EINVAL);
+-                    }
+-
+-                    snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
+-                             scaler_count++);
++#if CONFIG_UNSAND_FILTER
++                    // Only try each extra conversion once
++                    // The unsand output pad should never trigger has_sand_format
++                    // but it is better to be safe
++                    if ((extra_convert_tried & 1) == 0 && has_sand_format(link->incfg.formats)) {
++                        if (!(filter = avfilter_get_by_name("unsand"))) {
++                            av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
++                                   "not present, cannot convert pixel formats.\n");
++                            return AVERROR(EINVAL);
++                        }
++
++                        snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
++                                 scaler_count++);
++
++                        if ((ret = avfilter_graph_create_filter(&convert, filter,
++                                                                inst_name, "", NULL,
++                                                                graph)) < 0)
++                            return ret;
+ 
+-                    if ((ret = avfilter_graph_create_filter(&convert, filter,
+-                                                            inst_name, graph->scale_sws_opts, NULL,
+-                                                            graph)) < 0)
+-                        return ret;
++                        extra_convert_tried |= 1;
++                        can_retry = 1;
++                    }
++                    else
++#endif
++                    {
++                        if (!(filter = avfilter_get_by_name("scale"))) {
++                            av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
++                                   "not present, cannot convert pixel formats.\n");
++                            return AVERROR(EINVAL);
++                        }
++
++                        snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
++                                 scaler_count++);
++
++                        if ((ret = avfilter_graph_create_filter(&convert, filter,
++                                                                inst_name, graph->scale_sws_opts, NULL,
++                                                                graph)) < 0)
++                            return ret;
++                    }
+                     break;
+                 case AVMEDIA_TYPE_AUDIO:
+                     if (!(filter = avfilter_get_by_name("aresample"))) {
+@@ -589,6 +635,13 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
+                                                 outlink->outcfg.samplerates) ||
+                      CHECKED_MERGE(channel_layouts, outlink->incfg.channel_layouts,
+                                                     outlink->outcfg.channel_layouts))) {
++                    // Try adding an unsand filter & see if that helps
++                    if (ret < 0 && can_retry) {
++                        link = outlink;
++                        convert_needed = 1;
++                        continue;
++                    }
++
+                     if (ret < 0)
+                         return ret;
+                     av_log(log_ctx, AV_LOG_ERROR,
+diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c
+index 15d897cff6..c134759bbf 100644
+--- a/libavfilter/buffersink.c
++++ b/libavfilter/buffersink.c
+@@ -58,6 +58,11 @@ typedef struct BufferSinkContext {
+     int sample_rates_size;
+ 
+     AVFrame *peeked_frame;
++
++    union {
++        av_buffersink_alloc_video_frame * video;
++    } alloc_cb;
++    void * alloc_v;
+ } BufferSinkContext;
+ 
+ #define NB_ITEMS(list) (list ## _size / sizeof(*list))
+@@ -148,6 +153,22 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx,
+     return get_frame_internal(ctx, frame, 0, nb_samples);
+ }
+ 
++// get_video_buffer hook: use the user-installed allocator if set, else the default one
++static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h)
++{
++    BufferSinkContext * const sink = link->dst->priv;
++    return sink->alloc_cb.video == NULL ? ff_default_get_video_buffer(link, w, h) :
++        sink->alloc_cb.video(link->dst, sink->alloc_v, w, h);
++}
++
++int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v)
++{
++    BufferSinkContext * const sink = ctx->priv;
++    sink->alloc_v = v;          // opaque pointer passed back to cb on every call
++    sink->alloc_cb.video = cb;  // a NULL cb selects the default allocator again
++    return 0;
++}
++
+ #if FF_API_BUFFERSINK_ALLOC
+ AVBufferSinkParams *av_buffersink_params_alloc(void)
+ {
+@@ -331,6 +352,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = {
+     {
+         .name = "default",
+         .type = AVMEDIA_TYPE_VIDEO,
++        .get_video_buffer = alloc_video_buffer,
+     },
+     { NULL }
+ };
+diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h
+index 69ed0f29a8..a3aa6fcb3c 100644
+--- a/libavfilter/buffersink.h
++++ b/libavfilter/buffersink.h
+@@ -198,6 +198,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame);
+  */
+ int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples);
+ 
++typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, int h);
++int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v);
++
+ /**
+  * @}
+  */
+diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c
+index da1cf9941e..c588ed23cb 100644
+--- a/libavfilter/buffersrc.c
++++ b/libavfilter/buffersrc.c
+@@ -188,7 +188,7 @@ int attribute_align_arg av_buffersrc_add_frame_flags(AVFilterContext *ctx, AVFra
+ 
+         switch (ctx->outputs[0]->type) {
+         case AVMEDIA_TYPE_VIDEO:
+-            CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
++            CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
+                                      frame->format, frame->pts);
+             break;
+         case AVMEDIA_TYPE_AUDIO:
+diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
+index 889ff772ed..496cec72ef 100644
+--- a/libavfilter/bwdif.h
++++ b/libavfilter/bwdif.h
+@@ -35,8 +35,29 @@ typedef struct BWDIFContext {
+     void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
+                         int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                         int parity, int clip_max, int spat);
++    void (*filter_line3)(void *dst, int dstride,
++                         const void *prev, const void *cur, const void *next, int prefs,
++                         int w, int parity, int clip_max);
+ } BWDIFContext;
+ 
+-void ff_bwdif_init_x86(BWDIFContext *bwdif);
++void ff_bwdif_init_filter_line(BWDIFContext *bwdif, int bit_depth);
++void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth);
++void ff_bwdif_init_aarch64(BWDIFContext *bwdif, int bit_depth);
++
++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int parity, int clip_max, int spat);
++
++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                             int prefs3, int mrefs3, int parity, int clip_max);
++
++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int prefs3, int mrefs3, int prefs4, int mrefs4,
++                            int parity, int clip_max);
++
++void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
++                             const void * prev1, const void * cur1, const void * next1, int s_stride,
++                             int w, int parity, int clip_max);
+ 
+ #endif /* AVFILTER_BWDIF_H */
+diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
+index b6aed7a450..b268113271 100644
+--- a/libavfilter/vf_bwdif.c
++++ b/libavfilter/vf_bwdif.c
+@@ -123,8 +123,8 @@ typedef struct ThreadData {
+         next2++; \
+     }
+ 
+-static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
+-                         int prefs3, int mrefs3, int parity, int clip_max)
++void ff_bwdif_filter_intra_c(void *dst1, void *cur1, int w, int prefs, int mrefs,
++                             int prefs3, int mrefs3, int parity, int clip_max)
+ {
+     uint8_t *dst = dst1;
+     uint8_t *cur = cur1;
+@@ -133,10 +133,10 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
+     FILTER_INTRA()
+ }
+ 
+-static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+-                          int w, int prefs, int mrefs, int prefs2, int mrefs2,
+-                          int prefs3, int mrefs3, int prefs4, int mrefs4,
+-                          int parity, int clip_max)
++void ff_bwdif_filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int prefs3, int mrefs3, int prefs4, int mrefs4,
++                            int parity, int clip_max)
+ {
+     uint8_t *dst   = dst1;
+     uint8_t *prev  = prev1;
+@@ -151,9 +151,34 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+     FILTER2()
+ }
+ 
+-static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
+-                        int w, int prefs, int mrefs, int prefs2, int mrefs2,
+-                        int parity, int clip_max, int spat)
++#define NEXT_LINE()\
++    dst += d_stride; \
++    prev += prefs; \
++    cur  += prefs; \
++    next += prefs;
++// 8-bit C version: writes 3 consecutive output rows - filtered, copied from cur, filtered.
++void ff_bwdif_filter_line3_c(void * dst1, int d_stride,
++                             const void * prev1, const void * cur1, const void * next1, int s_stride,
++                             int w, int parity, int clip_max)
++{
++    const int prefs = s_stride;     // source stride in bytes; also the per-row step used by NEXT_LINE()
++    uint8_t * dst  = dst1;
++    const uint8_t * prev = prev1;
++    const uint8_t * cur  = cur1;
++    const uint8_t * next = next1;
++    // Row 0: filtered, with refs at +/-1, 2, 3 and 4 source strides
++    ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,
++                           prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
++    NEXT_LINE();
++    memcpy(dst, cur, w);            // row 1: w bytes copied unchanged from cur
++    NEXT_LINE();
++    ff_bwdif_filter_line_c(dst, (void*)prev, (void*)cur, (void*)next, w,   // row 2: filtered like row 0
++                           prefs, -prefs, prefs * 2, - prefs * 2, prefs * 3, -prefs * 3, prefs * 4, -prefs * 4, parity, clip_max);
++}
++
++void ff_bwdif_filter_edge_c(void *dst1, void *prev1, void *cur1, void *next1,
++                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                            int parity, int clip_max, int spat)
+ {
+     uint8_t *dst   = dst1;
+     uint8_t *prev  = prev1;
+@@ -213,6 +238,13 @@ static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+     FILTER2()
+ }
+ 
++// Slice start rows are rounded down to a multiple of 4 so that, if filter_line3
++// exists and the frame height is a multiple of 4, filter_line is never called
++static inline int job_start(const int jobnr, const int nb_jobs, const int h)
++{
++    return jobnr < nb_jobs ? (h * jobnr / nb_jobs) & ~3 : h;
++}
++
+ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+ {
+     BWDIFContext *s = ctx->priv;
+@@ -222,8 +254,8 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+     int clip_max = (1 << (yadif->csp->comp[td->plane].depth)) - 1;
+     int df = (yadif->csp->comp[td->plane].depth + 7) / 8;
+     int refs = linesize / df;
+-    int slice_start = (td->h *  jobnr   ) / nb_jobs;
+-    int slice_end   = (td->h * (jobnr+1)) / nb_jobs;
++    int slice_start = job_start(jobnr, nb_jobs, td->h);
++    int slice_end   = job_start(jobnr + 1, nb_jobs, td->h);
+     int y;
+ 
+     for (y = slice_start; y < slice_end; y++) {
+@@ -245,6 +277,11 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
+                                refs << 1, -(refs << 1),
+                                td->parity ^ td->tff, clip_max,
+                                (y < 2) || ((y + 3) > td->h) ? 0 : 1);
++            } else if (s->filter_line3 && y + 2 < slice_end && y + 6 < td->h) {
++                s->filter_line3(dst, td->frame->linesize[td->plane],
++                                prev, cur, next, linesize, td->w,
++                                td->parity ^ td->tff, clip_max);
++                y += 2;
+             } else {
+                 s->filter_line(dst, prev, cur, next, td->w,
+                                refs, -refs, refs << 1, -(refs << 1),
+@@ -280,7 +317,8 @@ static void filter(AVFilterContext *ctx, AVFrame *dstpic,
+         td.h     = h;
+         td.plane = i;
+ 
+-        ctx->internal->execute(ctx, filter_slice, &td, NULL, FFMIN(h, ff_filter_get_nb_threads(ctx)));
++        ctx->internal->execute(ctx, filter_slice, &td, NULL,
++                          FFMIN((h+3)/4, ff_filter_get_nb_threads(ctx)));
+     }
+     if (yadif->current_field == YADIF_FIELD_END) {
+         yadif->current_field = YADIF_FIELD_NORMAL;
+@@ -350,20 +388,29 @@ static int config_props(AVFilterLink *link)
+ 
+     yadif->csp = av_pix_fmt_desc_get(link->format);
+     yadif->filter = filter;
+-    if (yadif->csp->comp[0].depth > 8) {
++    ff_bwdif_init_filter_line(s, yadif->csp->comp[0].depth);
++
++    return 0;
++}
++
++av_cold void ff_bwdif_init_filter_line(BWDIFContext *s, int bit_depth)
++{
++    s->filter_line3 = 0;
++    if (bit_depth > 8) {
+         s->filter_intra = filter_intra_16bit;
+         s->filter_line  = filter_line_c_16bit;
+         s->filter_edge  = filter_edge_16bit;
+     } else {
+-        s->filter_intra = filter_intra;
+-        s->filter_line  = filter_line_c;
+-        s->filter_edge  = filter_edge;
++        s->filter_intra = ff_bwdif_filter_intra_c;
++        s->filter_line  = ff_bwdif_filter_line_c;
++        s->filter_edge  = ff_bwdif_filter_edge_c;
+     }
+ 
+-    if (ARCH_X86)
+-        ff_bwdif_init_x86(s);
+-
+-    return 0;
++#if ARCH_X86
++    ff_bwdif_init_x86(s, bit_depth);
++#elif ARCH_AARCH64
++    ff_bwdif_init_aarch64(s, bit_depth);
++#endif
+ }
+ 
+ 
+diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
+new file mode 100644
+index 0000000000..d4c11cfc51
+--- /dev/null
++++ b/libavfilter/vf_deinterlace_v4l2m2m.c
+@@ -0,0 +1,2115 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * deinterlace video filter - V4L2 M2M
++ */
++
++#include <drm_fourcc.h>
++
++#include <linux/videodev2.h>
++
++#include <dirent.h>
++#include <fcntl.h>
++#include <poll.h>
++#include <stdatomic.h>
++#include <stdio.h>
++#include <string.h>
++#include <sys/ioctl.h>
++#include <sys/mman.h>
++#include <unistd.h>
++
++#include "config.h"
++
++#include "libavutil/avassert.h"
++#include "libavutil/avstring.h"
++#include "libavutil/common.h"
++#include "libavutil/hwcontext.h"
++#include "libavutil/hwcontext_drm.h"
++#include "libavutil/internal.h"
++#include "libavutil/mathematics.h"
++#include "libavutil/opt.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/time.h"
++
++#define FF_INTERNAL_FIELDS 1
++#include "framequeue.h"
++#include "filters.h"
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "scale_eval.h"
++#include "video.h"
++
++#ifndef DRM_FORMAT_P030
++#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
++#endif
++
++// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
++// in drm_fourcc.h; hopefully they will be sometime in the future, but until then...
++#ifndef V4L2_PIX_FMT_NV12_10_COL128
++#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
++#endif
++
++#ifndef V4L2_PIX_FMT_NV12_COL128
++#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
++#endif
++
++typedef struct V4L2Queue V4L2Queue;
++typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
++
++typedef enum filter_type_v4l2_e
++{
++    FILTER_V4L2_DEINTERLACE = 1,
++    FILTER_V4L2_SCALE,
++} filter_type_v4l2_t;
++
++typedef struct V4L2Buffer {
++    int enqueued;
++    int reenqueue;
++    struct v4l2_buffer buffer;
++    AVFrame frame;
++    struct v4l2_plane planes[VIDEO_MAX_PLANES];
++    int num_planes;
++    AVDRMFrameDescriptor drm_frame;
++    V4L2Queue *q;
++} V4L2Buffer;
++
++typedef struct V4L2Queue {
++    struct v4l2_format format;
++    struct v4l2_selection sel;
++    int eos;
++    int num_buffers;
++    V4L2Buffer *buffers;
++    const char * name;
++    DeintV4L2M2MContextShared *ctx;
++} V4L2Queue;
++
++typedef struct pts_stats_s
++{
++    void * logctx;
++    const char * name;  // For debug
++    unsigned int last_count;
++    unsigned int last_interval;
++    int64_t last_pts;
++} pts_stats_t;
++
++#define PTS_TRACK_SIZE 32
++typedef struct pts_track_el_s
++{
++    uint32_t n;
++    unsigned int interval;
++    AVFrame * props;
++} pts_track_el_t;
++
++typedef struct pts_track_s
++{
++    uint32_t n;
++    uint32_t last_n;
++    int got_2;
++    void * logctx;
++    pts_stats_t stats;
++    pts_track_el_t a[PTS_TRACK_SIZE];
++} pts_track_t;
++
++typedef enum drain_state_e
++{
++    DRAIN_NONE = 0,     // Not draining
++    DRAIN_TIMEOUT,      // Drain until normal timeout setup yields no frame
++    DRAIN_LAST,         // Drain with long timeout; last_frame expected to be received on output
++    DRAIN_EOS,          // Drain with long timeout; EOS expected
++    DRAIN_DONE          // Drained
++} drain_state_t;
++
++typedef struct DeintV4L2M2MContextShared {
++    void * logctx;  // For logging - will be NULL when done
++    filter_type_v4l2_t filter_type;
++
++    int fd;
++    int done;   // fd closed - awaiting all refs dropped
++    int width;
++    int height;
++
++    int drain;          // EOS received (inlink status)
++    drain_state_t drain_state;
++    int64_t drain_pts;  // PTS associated with inlink status
++
++    unsigned int frames_rx;
++    unsigned int frames_tx;
++
++    // from options
++    int output_width;
++    int output_height;
++    enum AVPixelFormat output_format;
++
++    int has_enc_stop;
++    // We expect to get exactly the same number of frames out as we put in
++    // We can drain by matching input to output
++    int one_to_one;
++
++    int orig_width;
++    int orig_height;
++    atomic_uint refcount;
++
++    AVBufferRef *hw_frames_ctx;
++
++    unsigned int field_order;
++
++    pts_track_t track;
++
++    V4L2Queue output;
++    V4L2Queue capture;
++} DeintV4L2M2MContextShared;
++
++typedef struct DeintV4L2M2MContext {
++    const AVClass *class;
++    // NOTE(review): presumably kept separate because it is refcounted and may outlive this context - confirm
++    DeintV4L2M2MContextShared *shared;
++    // NOTE(review): the strings below look like user option values - confirm against the option table
++    char * w_expr;
++    char * h_expr;
++    char * output_format_string;
++
++    int force_original_aspect_ratio;
++    int force_divisible_by;
++    // String forms of the colour properties; typed counterparts follow further down
++    char *colour_primaries_string;
++    char *colour_transfer_string;
++    char *colour_matrix_string;
++    int   colour_range;
++    char *chroma_location_string;
++
++    enum AVColorPrimaries colour_primaries;
++    enum AVColorTransferCharacteristic colour_transfer;
++    enum AVColorSpace colour_matrix;
++    enum AVChromaLocation chroma_location;
++} DeintV4L2M2MContext;
++
++// True for the two drain states that wait for a final frame: DRAIN_LAST and DRAIN_EOS
++static inline int drain_frame_expected(const drain_state_t d)
++{
++    return !(d != DRAIN_LAST && d != DRAIN_EOS);
++}
++
++// Map an AVPixelFormat to a V4L2 fourcc. These just list the ones we know we
++// can cope with; anything else yields 0
++static uint32_t
++fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
++{
++    uint32_t fourcc = 0;
++
++    if (avfmt == AV_PIX_FMT_YUV420P)
++        fourcc = V4L2_PIX_FMT_YUV420;
++    else if (avfmt == AV_PIX_FMT_NV12)
++        fourcc = V4L2_PIX_FMT_NV12;
++#if CONFIG_SAND
++    // Both sand variants share the single 128-column fourcc
++    else if (avfmt == AV_PIX_FMT_RPI4_8 || avfmt == AV_PIX_FMT_SAND128)
++        fourcc = V4L2_PIX_FMT_NV12_COL128;
++#endif
++
++    return fourcc;
++}
++
++// Map a V4L2 fourcc to the AVPixelFormat reported for it
++// (AV_PIX_FMT_NONE if it is not one that is handled here)
++static enum AVPixelFormat fmt_v4l2_to_av(const uint32_t pixfmt)
++{
++    enum AVPixelFormat avfmt = AV_PIX_FMT_NONE;
++
++    if (pixfmt == V4L2_PIX_FMT_YUV420)
++        avfmt = AV_PIX_FMT_YUV420P;
++    else if (pixfmt == V4L2_PIX_FMT_NV12)
++        avfmt = AV_PIX_FMT_NV12;
++#if CONFIG_SAND
++    else if (pixfmt == V4L2_PIX_FMT_NV12_COL128)
++        avfmt = AV_PIX_FMT_RPI4_8;
++#endif
++
++    return avfmt;
++}
++
++static unsigned int pts_stats_interval(const pts_stats_t * const stats)
++{
++    return stats->last_interval; // current per-frame estimate; pts_stats_add() stores 0 when it has none
++}
++
++// Pick 64 for max last count - that is >1sec at 60fps
++#define STATS_LAST_COUNT_MAX 64
++#define STATS_INTERVAL_MAX (1 << 30) // PTS steps of this size or more are treated as bad
++static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
++{
++    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) { // no new timing information
++        if (stats->last_count < STATS_LAST_COUNT_MAX)
++            ++stats->last_count; // frames seen since last_pts changed (saturating)
++        return;
++    }
++    // PTS has moved on: work out the time per frame since the previous distinct PTS
++    if (stats->last_pts != AV_NOPTS_VALUE) {
++        const int64_t interval = pts - stats->last_pts;
++        // Backwards / over-large steps, or too many frames since last_pts, give no estimate
++        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
++            stats->last_count >= STATS_LAST_COUNT_MAX) {
++            if (stats->last_interval != 0)
++                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
++                       __func__, stats->name, interval, stats->last_count);
++            stats->last_interval = 0;
++        }
++        else {
++            const int64_t frame_time = interval / (int64_t)stats->last_count;
++            // Only log when the estimate actually changes
++            if (frame_time != stats->last_interval)
++                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
++                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
++            stats->last_interval = frame_time;
++        }
++    }
++    // Restart the frame count from this PTS
++    stats->last_pts = pts;
++    stats->last_count = 1;
++}
++
++static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
++{
++    const pts_stats_t fresh = {  // members not named here (incl. last_interval) start as zero
++        .last_pts   = AV_NOPTS_VALUE,
++        .last_count = 1,
++        .name       = name,
++        .logctx     = logctx
++    };
++    *stats = fresh;
++}
++
++static inline uint32_t pts_track_next_n(pts_track_t * const trk)
++{
++    trk->n += 1;
++    // 0 is never handed out (it marks unused slots / "no frame"), so skip it on wrap
++    return trk->n != 0 ? trk->n : (trk->n = 1);
++}
++
++static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
++{
++    uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
++    pts_track_el_t * t;
++    // n above undoes the n -> timeval encoding used by pts_track_add_frame()
++    // As a first guess assume that n==0 means last frame
++    if (n == 0) {
++        n = trk->last_n;
++        if (n == 0)
++            goto fail;
++    }
++    // Slots are selected by the low bits of n; the slot must still hold frame n
++    t = trk->a + (n & (PTS_TRACK_SIZE - 1));
++
++    if (t->n != n) {
++        av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
++        goto fail;
++    }
++
++    // 1st frame is simple - just believe it
++    if (n != trk->last_n) {
++        trk->last_n = n;
++        trk->got_2 = 0;
++        return av_frame_copy_props(dst, t->props);
++    }
++
++    // Only believe in a single interpolated frame
++    if (trk->got_2)
++        goto fail;
++    trk->got_2 = 1;
++
++    av_frame_copy_props(dst, t->props);
++    // Second frame for the same n: start from the same props, then move the
++    // timestamps on by half of the interval stored with the frame
++    // If we can't guess - don't
++    if (t->interval == 0) {
++        dst->best_effort_timestamp = AV_NOPTS_VALUE;
++        dst->pts = AV_NOPTS_VALUE;
++        dst->pkt_dts = AV_NOPTS_VALUE;
++    }
++    else {
++        if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
++            dst->best_effort_timestamp += t->interval / 2;
++        if (dst->pts != AV_NOPTS_VALUE)
++            dst->pts += t->interval / 2;
++        if (dst->pkt_dts != AV_NOPTS_VALUE)
++            dst->pkt_dts += t->interval / 2;
++    }
++
++    return 0;
++    // Failure path: tracking state is reset and dst is left without pts / pkt_dts
++fail:
++    trk->last_n = 0;
++    trk->got_2 = 0;
++    dst->pts = AV_NOPTS_VALUE;
++    dst->pkt_dts = AV_NOPTS_VALUE;
++    return 0; // NB: still 0 - this is not reported to the caller as an error
++}
++
++// We are only ever expecting in-order frames so nothing more clever is required
++static unsigned int pts_track_count(const pts_track_t * const trk)
++{
++    const unsigned int mask = PTS_TRACK_SIZE - 1;
++    return (trk->n - trk->last_n) & mask;  // distance from last frame got to last frame added
++}
++
++static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
++{
++    const uint32_t seq = pts_track_next_n(trk);
++    const uint32_t mask = PTS_TRACK_SIZE - 1;
++    pts_track_el_t * const cur = &trk->a[seq & mask];
++    // Update the running interval estimate first so that it covers this frame
++    pts_stats_add(&trk->stats, src->pts);
++
++    // Store the frame's properties under its sequence number; for now guess
++    // that the interval to the next frame is the same as the last one
++    cur->n = seq;
++    cur->interval = pts_stats_interval(&trk->stats);
++    av_frame_unref(cur->props);
++    av_frame_copy_props(cur->props, src);
++
++    // A non-zero estimate is really the previous frame's interval, now known
++    // rather than guessed, so store it there too - with luck before it is used
++    if (cur->interval != 0)
++        trk->a[(seq - 1) & mask].interval = cur->interval;
++
++    // In case deinterlace interpolates frames use every other usec
++    return (struct timeval){.tv_sec = seq / 500000, .tv_usec = (seq % 500000) * 2};
++}
++
++static void pts_track_uninit(pts_track_t * const trk)
++{
++    pts_track_el_t * el;
++    for (el = trk->a; el != trk->a + PTS_TRACK_SIZE; ++el) {
++        el->n = 0;                  // slot no longer holds a frame
++        av_frame_free(&el->props);  // frees the props frame
++    }
++}
++
++static int pts_track_init(pts_track_t * const trk, void *logctx)
++{
++    trk->logctx = logctx;  // pts_track_get_frame() logs through this; it was never set before
++    trk->n = 1;
++    pts_stats_init(&trk->stats, logctx, "track");
++    for (unsigned int i = 0; i != PTS_TRACK_SIZE; ++i) {
++        trk->a[i].n = 0;
++        if ((trk->a[i].props = av_frame_alloc()) == NULL) {
++            pts_track_uninit(trk);
++            return AVERROR(ENOMEM);
++        }
++    }
++    return 0;
++}
++
++// Bytes per line of plane plane_n (plane_n is only used for multiplanar formats)
++static inline uint32_t fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n)
++{
++    return !V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix.bytesperline : fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline;
++}
++
++// Height field of fmt, read from whichever of pix / pix_mp matches the buffer type
++static inline uint32_t fmt_height(const struct v4l2_format * const fmt)
++{
++    return !V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix.height : fmt->fmt.pix_mp.height;
++}
++
++// Width field of fmt, read from whichever of pix / pix_mp matches the buffer type
++static inline uint32_t fmt_width(const struct v4l2_format * const fmt)
++{
++    return !V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix.width : fmt->fmt.pix_mp.width;
++}
++
++// V4L2 fourcc of fmt, read from whichever of pix / pix_mp matches the buffer type
++static inline uint32_t fmt_pixelformat(const struct v4l2_format * const fmt)
++{
++    return !V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix.pixelformat : fmt->fmt.pix_mp.pixelformat;
++}
++
++// bytesused of buf: plane 0's value for multiplanar buffers, the buffer's own otherwise
++static inline uint32_t buf_bytesused0(const struct v4l2_buffer * const buf)
++{
++    return !V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? buf->bytesused : buf->m.planes[0].bytesused;
++}
++
++// Zero q's format and selection state, leaving just the buffer type set in each
++static void init_format(V4L2Queue * const q, const uint32_t format_type)
++{
++    memset(&q->format, 0, sizeof q->format);
++    q->format.type = format_type;
++    memset(&q->sel, 0, sizeof q->sel);
++    q->sel.type = format_type;
++}
++
++static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
++{
++    struct v4l2_capability cap;
++
++    memset(&cap, 0, sizeof(cap));
++    // The raw ioctl() result (-1) is not a meaningful AVERROR code - translate errno
++    if (ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap) < 0)
++        return AVERROR(errno);
++
++    // The scale filter only accepts the device whose card name is the ISP's
++    if (ctx->filter_type == FILTER_V4L2_SCALE &&
++        strcmp("bcm2835-codec-isp", (const char *)cap.card) != 0)
++    {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n");
++        return AVERROR(EINVAL);
++    }
++
++    if (!(cap.capabilities & V4L2_CAP_STREAMING)) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n");
++        return AVERROR(EINVAL);
++    }
++
++    if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
++        init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
++        init_format(&ctx->output,  V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE);
++    }
++    else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
++        init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE);
++        init_format(&ctx->output,  V4L2_BUF_TYPE_VIDEO_OUTPUT);
++    }
++    else {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n");
++        return AVERROR(EINVAL);
++    }
++
++    return 0;
++}
++
++// Just use for probe - doesn't modify q format
++static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt)
++{
++    struct v4l2_format fmt         = {.type = queue->format.type};
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    int ret, field;
++    // Pick YUV to test with if not otherwise specified
++    uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt);
++    enum AVPixelFormat r_avfmt;
++    // Start from the driver's current format. A failure is only logged (with the
++    // errno text - the ioctl result itself is always -1) and probing carries on
++    ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt);
++    if (ret)
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %s\n", av_err2str(AVERROR(errno)));
++
++    if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type))
++        field = V4L2_FIELD_INTERLACED_TB;
++    else
++        field = V4L2_FIELD_NONE;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
++        fmt.fmt.pix_mp.pixelformat = pixelformat;
++        fmt.fmt.pix_mp.field = field;
++        fmt.fmt.pix_mp.width = width;
++        fmt.fmt.pix_mp.height = height;
++    } else {
++        fmt.fmt.pix.pixelformat = pixelformat;
++        fmt.fmt.pix.field = field;
++        fmt.fmt.pix.width = width;
++        fmt.fmt.pix.height = height;
++    }
++
++    av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
++         fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
++         fmt.fmt.pix_mp.pixelformat,
++         fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
++
++    ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt);
++    if (ret)
++        return AVERROR(EINVAL);
++
++    av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
++         fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
++         fmt.fmt.pix_mp.pixelformat,
++         fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
++
++    r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt));
++    if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
++        return AVERROR(EINVAL);
++    }
++    if (r_avfmt == AV_PIX_FMT_NONE) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
++        return AVERROR(EINVAL);
++    }
++    // The driver must also have kept the field setting that was asked for
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
++        if (fmt.fmt.pix_mp.field != field) {
++            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
++
++            return AVERROR(EINVAL);
++        }
++    } else {
++        if (fmt.fmt.pix.field != field) {
++            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
++
++            return AVERROR(EINVAL);
++        }
++    }
++
++    return 0;
++}
++
++// Apply q->format (VIDIOC_S_FMT) and then q->sel (VIDIOC_S_SELECTION) to the device
++static int do_s_fmt(V4L2Queue * const q)
++{
++    DeintV4L2M2MContextShared * const ctx = q->ctx;
++    const uint32_t pixelformat = fmt_pixelformat(&q->format);
++    int ret;
++
++    ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret));
++        return ret;
++    }
++    // S_FMT wrote its result back into q->format: a changed fourcc means ours was refused
++    if (pixelformat != fmt_pixelformat(&q->format)) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format)));
++        return AVERROR(EINVAL);
++    }
++    // Output queue: crop rectangle with the LE flag; capture queue: compose with GE
++    q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE;
++    q->sel.flags  = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE;
++
++    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel);
++    if (ret) {
++        ret = AVERROR(errno);
++        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret));
++    }
++    // A selection failure is only warned about - it is not treated as fatal
++    return 0;
++}
++
++// Set fmt's colorspace / ycbcr_enc / xfer_func from FFmpeg colour properties
++static void set_fmt_color(struct v4l2_format *const fmt,
++                          const enum AVColorPrimaries avcp,
++                          const enum AVColorSpace avcs,
++                          const enum AVColorTransferCharacteristic avxc)
++{
++    enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
++    enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
++    enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
++
++    switch (avcp) {
++    case AVCOL_PRI_BT709:
++        cs = V4L2_COLORSPACE_REC709;
++        ycbcr = V4L2_YCBCR_ENC_709;
++        break;
++    case AVCOL_PRI_BT470M:
++        cs = V4L2_COLORSPACE_470_SYSTEM_M;
++        ycbcr = V4L2_YCBCR_ENC_601;
++        break;
++    case AVCOL_PRI_BT470BG:
++        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++        break;
++    case AVCOL_PRI_SMPTE170M:
++        cs = V4L2_COLORSPACE_SMPTE170M;
++        break;
++    case AVCOL_PRI_SMPTE240M:
++        cs = V4L2_COLORSPACE_SMPTE240M;
++        break;
++    case AVCOL_PRI_BT2020:
++        cs = V4L2_COLORSPACE_BT2020;
++        break;
++    case AVCOL_PRI_SMPTE428:
++    case AVCOL_PRI_SMPTE431:
++    case AVCOL_PRI_SMPTE432:
++    case AVCOL_PRI_EBU3213:
++    case AVCOL_PRI_RESERVED:
++    case AVCOL_PRI_FILM:
++    case AVCOL_PRI_UNSPECIFIED:
++    default:
++        break;
++    }
++    // The matrix is looked at second, so where it matches its cs replaces the one from primaries
++    switch (avcs) {
++    case AVCOL_SPC_RGB:
++        cs = V4L2_COLORSPACE_SRGB;
++        break;
++    case AVCOL_SPC_BT709:
++        cs = V4L2_COLORSPACE_REC709;
++        break;
++    case AVCOL_SPC_FCC:
++        cs = V4L2_COLORSPACE_470_SYSTEM_M;
++        break;
++    case AVCOL_SPC_BT470BG:
++        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
++        break;
++    case AVCOL_SPC_SMPTE170M:
++        cs = V4L2_COLORSPACE_SMPTE170M;
++        break;
++    case AVCOL_SPC_SMPTE240M:
++        cs = V4L2_COLORSPACE_SMPTE240M;
++        break;
++    case AVCOL_SPC_BT2020_CL:
++        cs = V4L2_COLORSPACE_BT2020;
++        ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
++        break;
++    case AVCOL_SPC_BT2020_NCL:
++        cs = V4L2_COLORSPACE_BT2020;
++        break;
++    default:
++        break;
++    }
++    // Transfer function: switch on the avxc argument (xfer is only where the result goes)
++    switch (avxc) {
++    case AVCOL_TRC_BT709:
++        xfer = V4L2_XFER_FUNC_709;
++        break;
++    case AVCOL_TRC_IEC61966_2_1:
++        xfer = V4L2_XFER_FUNC_SRGB;
++        break;
++    case AVCOL_TRC_SMPTE240M:
++        xfer = V4L2_XFER_FUNC_SMPTE240M;
++        break;
++    case AVCOL_TRC_SMPTE2084:
++        xfer = V4L2_XFER_FUNC_SMPTE2084;
++        break;
++    default:
++        break;
++    }
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        fmt->fmt.pix_mp.colorspace = cs;
++        fmt->fmt.pix_mp.ycbcr_enc = ycbcr;
++        fmt->fmt.pix_mp.xfer_func = xfer;
++    } else {
++        fmt->fmt.pix.colorspace = cs;
++        fmt->fmt.pix.ycbcr_enc = ycbcr;
++        fmt->fmt.pix.xfer_func = xfer;
++    }
++}
++
++// Set fmt's quantization from an AVColorRange (anything but MPEG / JPEG gives the default)
++static void set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr)
++{
++    enum v4l2_quantization quant = V4L2_QUANTIZATION_DEFAULT;
++    if (avcr == AVCOL_RANGE_JPEG)
++        quant = V4L2_QUANTIZATION_FULL_RANGE;
++    else if (avcr == AVCOL_RANGE_MPEG)
++        quant = V4L2_QUANTIZATION_LIM_RANGE;
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type))
++        fmt->fmt.pix_mp.quantization = quant;
++    else
++        fmt->fmt.pix.quantization = quant;
++}
++
++// Derive FFmpeg colour primaries from fmt's V4L2 colour fields
++// (AVCOL_PRI_UNSPECIFIED when nothing below matches)
++static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt)
++{
++    const int mplane = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++    const enum v4l2_colorspace cs =
++        mplane ? fmt->fmt.pix_mp.colorspace : fmt->fmt.pix.colorspace;
++    const enum v4l2_ycbcr_encoding ycbcr =
++        mplane ? fmt->fmt.pix_mp.ycbcr_enc : fmt->fmt.pix.ycbcr_enc;
++
++    // The YCbCr encoding is consulted first: a 709 or 601 style encoding
++    // settles the answer whatever the colourspace says
++    if (ycbcr == V4L2_YCBCR_ENC_709 || ycbcr == V4L2_YCBCR_ENC_XV709)
++        return AVCOL_PRI_BT709;
++    if (ycbcr == V4L2_YCBCR_ENC_601 || ycbcr == V4L2_YCBCR_ENC_XV601)
++        return AVCOL_PRI_BT470M;
++
++    // Otherwise go by the colourspace
++    switch (cs) {
++    case V4L2_COLORSPACE_470_SYSTEM_BG:
++        return AVCOL_PRI_BT470BG;
++    case V4L2_COLORSPACE_SMPTE170M:
++        return AVCOL_PRI_SMPTE170M;
++    case V4L2_COLORSPACE_SMPTE240M:
++        return AVCOL_PRI_SMPTE240M;
++    case V4L2_COLORSPACE_BT2020:
++        return AVCOL_PRI_BT2020;
++    default:
++        break;
++    }
++
++    return AVCOL_PRI_UNSPECIFIED;
++}
++
++// Derive the FFmpeg colour matrix from fmt (AVCOL_SPC_UNSPECIFIED if no case matches)
++static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt)
++{
++    const int mplane = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++    const enum v4l2_colorspace cs =
++        mplane ? fmt->fmt.pix_mp.colorspace : fmt->fmt.pix.colorspace;
++    const enum v4l2_ycbcr_encoding ycbcr =
++        mplane ? fmt->fmt.pix_mp.ycbcr_enc : fmt->fmt.pix.ycbcr_enc;
++
++    switch (cs) {
++    case V4L2_COLORSPACE_SRGB:
++        return AVCOL_SPC_RGB;
++    case V4L2_COLORSPACE_REC709:
++        return AVCOL_SPC_BT709;
++    case V4L2_COLORSPACE_470_SYSTEM_M:
++        return AVCOL_SPC_FCC;
++    case V4L2_COLORSPACE_470_SYSTEM_BG:
++        return AVCOL_SPC_BT470BG;
++    case V4L2_COLORSPACE_SMPTE170M:
++        return AVCOL_SPC_SMPTE170M;
++    case V4L2_COLORSPACE_SMPTE240M:
++        return AVCOL_SPC_SMPTE240M;
++    // For BT2020 the encoding picks between constant and non-constant luminance
++    case V4L2_COLORSPACE_BT2020:
++        return ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM ? AVCOL_SPC_BT2020_CL : AVCOL_SPC_BT2020_NCL;
++    default:
++        break;
++    }
++
++    return AVCOL_SPC_UNSPECIFIED;
++}
++
++// Derive the FFmpeg transfer characteristic from fmt's V4L2 colour fields
++// (AVCOL_TRC_UNSPECIFIED when nothing below matches)
++static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt)
++{
++    const int mplane = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++    const enum v4l2_colorspace cs =
++        mplane ? fmt->fmt.pix_mp.colorspace : fmt->fmt.pix.colorspace;
++    const enum v4l2_ycbcr_encoding ycbcr =
++        mplane ? fmt->fmt.pix_mp.ycbcr_enc : fmt->fmt.pix.ycbcr_enc;
++    const enum v4l2_xfer_func xfer =
++        mplane ? fmt->fmt.pix_mp.xfer_func : fmt->fmt.pix.xfer_func;
++
++    // An explicit transfer function is tried first
++    switch (xfer) {
++    case V4L2_XFER_FUNC_709:
++        return AVCOL_TRC_BT709;
++    case V4L2_XFER_FUNC_SRGB:
++        return AVCOL_TRC_IEC61966_2_1;
++    default:
++        break;
++    }
++
++    // Then the colourspace
++    switch (cs) {
++    case V4L2_COLORSPACE_470_SYSTEM_M:
++        return AVCOL_TRC_GAMMA22;
++    case V4L2_COLORSPACE_470_SYSTEM_BG:
++        return AVCOL_TRC_GAMMA28;
++    case V4L2_COLORSPACE_SMPTE170M:
++        return AVCOL_TRC_SMPTE170M;
++    case V4L2_COLORSPACE_SMPTE240M:
++        return AVCOL_TRC_SMPTE240M;
++    default:
++        break;
++    }
++
++    // Last resort: both extended-gamut (XV) encodings are reported
++    // as BT1361_ECG
++    if (ycbcr == V4L2_YCBCR_ENC_XV709 || ycbcr == V4L2_YCBCR_ENC_XV601)
++        return AVCOL_TRC_BT1361_ECG;
++
++    return AVCOL_TRC_UNSPECIFIED;
++}
++
++// Derive the FFmpeg colour range from fmt's quantization field:
++// limited -> MPEG, full -> JPEG
++static enum AVColorRange get_color_range(const struct v4l2_format *const fmt)
++{
++    const int mplane = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++    const enum v4l2_quantization qt =
++        mplane ? fmt->fmt.pix_mp.quantization : fmt->fmt.pix.quantization;
++
++    if (qt == V4L2_QUANTIZATION_FULL_RANGE)
++        return AVCOL_RANGE_JPEG;
++    if (qt == V4L2_QUANTIZATION_LIM_RANGE)
++        return AVCOL_RANGE_MPEG;
++
++    // Any other value (including the V4L2 default) is
++    // reported as unspecified
++    return AVCOL_RANGE_UNSPECIFIED;
++}
++
++static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
++{
++    struct v4l2_format *const format = &q->format;
++    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
++    // (frame->data[0] is taken to be the frame's AVDRMFrameDescriptor)
++    const uint32_t drm_fmt = src->layers[0].format;
++    // Treat INVALID as LINEAR
++    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
++        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
++    uint32_t pix_fmt = 0;
++    uint32_t w = 0;
++    uint32_t h = 0;
++    uint32_t bpl = src->layers[0].planes[0].pitch;
++
++    // We really don't expect multiple layers
++    // All formats that we currently cope with are single object
++    // A zero pitch is rejected as well: the linear cases below divide by it
++    if (src->nb_layers != 1 || src->nb_objects != 1 || bpl == 0)
++        return AVERROR(EINVAL);
++
++    switch (drm_fmt) {
++        case DRM_FORMAT_YUV420:
++            if (mod == DRM_FORMAT_MOD_LINEAR) {
++                if (src->layers[0].nb_planes != 3)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_YUV420;
++                h = src->layers[0].planes[1].offset / bpl;
++                w = bpl;
++            }
++            break;
++
++        case DRM_FORMAT_NV12:
++            if (mod == DRM_FORMAT_MOD_LINEAR) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_NV12;
++                h = src->layers[0].planes[1].offset / bpl;
++                w = bpl;
++            }
++#if CONFIG_SAND
++            else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_NV12_COL128;
++                w = bpl;
++                h = src->layers[0].planes[1].offset / 128;
++                bpl = fourcc_mod_broadcom_param(mod);
++            }
++#endif
++            break;
++
++        case DRM_FORMAT_P030:
++#if CONFIG_SAND
++            if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
++                if (src->layers[0].nb_planes != 2)
++                    break;
++                pix_fmt = V4L2_PIX_FMT_NV12_10_COL128;
++                w = bpl / 2;  // Matching lie to how we construct this
++                h = src->layers[0].planes[1].offset / 128;
++                bpl = fourcc_mod_broadcom_param(mod);
++            }
++#endif
++            break;
++
++        default:
++            break;
++    }
++    // pix_fmt is still 0 if the format / modifier / plane count combination was not recognised
++    if (!pix_fmt)
++        return AVERROR(EINVAL);
++
++    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
++        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->plane_fmt[0].bytesperline = bpl;
++        pix->num_planes = 1;
++    }
++    else {
++        struct v4l2_pix_format *const pix = &format->fmt.pix;
++
++        pix->width = w;
++        pix->height = h;
++        pix->pixelformat = pix_fmt;
++        pix->bytesperline = bpl;
++    }
++
++    set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc);
++    set_fmt_color_range(format, frame->color_range);
++    // The selection rectangle is the frame size less its crop margins
++    q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right);
++    q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom);
++    q->sel.r.left = frame->crop_left;
++    q->sel.r.top = frame->crop_top;
++
++    return 0;
++}
++
++
++static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height)
++{
++    struct v4l2_format * const fmt   = &queue->format;
++    struct v4l2_selection *const sel = &queue->sel;
++    // Align w/h to 16 here in case there are alignment requirements at the next
++    // stage of the filter chain (also RPi deinterlace setup is bust and this
++    // fixes it)
++    const uint32_t aligned_w = FFALIGN(width, 16);
++    const uint32_t aligned_h = FFALIGN(height, 16);
++    memset(&fmt->fmt, 0, sizeof(fmt->fmt));
++    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
++        fmt->fmt.pix_mp.width = aligned_w;
++        fmt->fmt.pix_mp.height = aligned_h;
++        fmt->fmt.pix_mp.pixelformat = pixelformat;
++        fmt->fmt.pix_mp.field = field;
++    } else {
++        fmt->fmt.pix.width = aligned_w;
++        fmt->fmt.pix.height = aligned_h;
++        fmt->fmt.pix.pixelformat = pixelformat;
++        fmt->fmt.pix.field = field;
++    }
++
++    set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer);
++    set_fmt_color_range(fmt, priv->colour_range);
++
++    sel->r.left = 0;         // selection keeps the requested (unaligned) size
++    sel->r.top = 0;
++    sel->r.width = width;
++    sel->r.height = height;
++
++    return do_s_fmt(queue);
++}
++
++static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
++{
++    int ret;
++
++    ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
++    if (ctx->fd < 0)
++        return AVERROR(errno);
++
++    // Each check below only runs if everything before it passed;
++    // ret ends up as the first failure (or 0)
++    ret = deint_v4l2m2m_prepare_context(ctx);
++    if (ret != 0)
++        av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n");
++
++    if (ret == 0) {
++        ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format);
++        if (ret != 0)
++            av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n");
++    }
++
++    if (ret == 0) {
++        ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE);
++        if (ret != 0)
++            av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n");
++    }
++
++    // A rejected device is closed again and the fd marked invalid
++    if (ret != 0) {
++        close(ctx->fd);
++        ctx->fd = -1;
++    }
++
++    return ret;
++}
++
++static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
++{
++    char node[PATH_MAX];
++    int ret = AVERROR(EINVAL);  // result if /dev holds no video* entries at all
++    DIR * const dirp = opendir("/dev");
++    const struct dirent *entry;
++
++    if (dirp == NULL)
++        return AVERROR(errno);
++
++    // Probe every /dev/video* node in directory order;
++    // the first one that passes the probe is kept (its fd stays open in ctx)
++    while ((entry = readdir(dirp)) != NULL) {
++        if (strncmp(entry->d_name, "video", 5) != 0)
++            continue;
++
++        snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
++        av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
++        ret = deint_v4l2m2m_probe_device(ctx, node);
++        if (ret == 0)
++            break;
++    }
++
++    closedir(dirp);
++
++    if (ret == 0) {
++        av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
++        return 0;
++    }
++
++    // Nothing usable: ret is the error from the last node probed
++    // (or the initial EINVAL if none was)
++    av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
++    ctx->fd = -1;
++    return ret;
++}
++
++static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
++{
++    DeintV4L2M2MContextShared * const ctx = buf->q->ctx;
++
++    // Hand the buffer to the driver
++    if (ioctl(ctx->fd, VIDIOC_QBUF, &buf->buffer) < 0)
++        return AVERROR(errno);
++
++    // Only flagged as queued once the driver has accepted it
++    buf->enqueued = 1;
++    return 0;
++}
++
++// Mark every object slot of d as holding no fd (-1), which is the value
++// drm_frame_uninit() checks for before closing
++static void drm_frame_init(AVDRMFrameDescriptor * const d)
++{
++    unsigned int i = AV_DRM_MAX_PLANES;
++    while (i-- != 0)
++        d->objects[i].fd = -1;
++}
++
++// Close the fds held by d's first nb_objects objects and mark them as closed
++static void drm_frame_uninit(AVDRMFrameDescriptor * const d)
++{
++    for (unsigned int i = 0; i != d->nb_objects; ++i) {
++        int * const pfd = &d->objects[i].fd;
++        if (*pfd == -1)
++            continue;
++        close(*pfd);
++        *pfd = -1;
++    }
++}
++
++// Free an array of n V4L2Buffers, releasing each buffer's DRM frame fds first.
++// *ppavbufs is set to NULL; a NULL array is a no-op.
++static void avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n)
++{
++    V4L2Buffer* const bufs = *ppavbufs;
++
++    if (bufs == NULL)
++        return;
++
++    // Detach the array from its owner before tearing it down
++    *ppavbufs = NULL;
++
++    for (unsigned int i = 0; i != n; ++i)
++        drm_frame_uninit(&bufs[i].drm_frame);
++
++    av_free(bufs);
++}
++
++static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
++{
++    struct v4l2_exportbuffer expbuf;
++    int i, ret;
++    uint64_t mod = DRM_FORMAT_MOD_LINEAR;
++
++    AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame;
++    AVDRMLayerDescriptor * const layer = &drm_desc->layers[0];
++    const struct v4l2_format *const fmt = &q->format;
++    const uint32_t height = fmt_height(fmt);
++    ptrdiff_t bpl0;
++
++    // planes[] and objects[] in the descriptor are indexed by plane number below
++    if (avbuf->num_planes < 1 || avbuf->num_planes > AV_DRM_MAX_PLANES)
++        return AVERROR(EINVAL);
++
++    /* fill the DRM frame descriptor */
++    drm_desc->nb_layers = 1;
++    layer->nb_planes = avbuf->num_planes;
++    for (i = 0; i < avbuf->num_planes; i++) {
++        layer->planes[i].object_index = i;
++        layer->planes[i].offset = 0;
++        layer->planes[i].pitch = fmt_bpl(fmt, i);
++    }
++    bpl0 = layer->planes[0].pitch;
++
++    switch (fmt_pixelformat(fmt)) {
++#if CONFIG_SAND
++        case V4L2_PIX_FMT_NV12_COL128:
++            mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0);
++            layer->format = DRM_FORMAT_NV12;
++            if (avbuf->num_planes > 1)
++                break;
++
++            layer->nb_planes = 2;
++            layer->planes[1].object_index = 0;
++            layer->planes[1].offset = height * 128;
++            layer->planes[0].pitch = fmt_width(fmt);
++            layer->planes[1].pitch = layer->planes[0].pitch;
++            break;
++#endif
++
++        case V4L2_PIX_FMT_NV12:
++            layer->format = DRM_FORMAT_NV12;
++            if (avbuf->num_planes > 1)
++                break;
++
++            layer->nb_planes = 2;
++            layer->planes[1].object_index = 0;
++            layer->planes[1].offset = bpl0 * height;
++            layer->planes[1].pitch = bpl0;
++            break;
++
++        case V4L2_PIX_FMT_YUV420:
++            layer->format = DRM_FORMAT_YUV420;
++            if (avbuf->num_planes > 1)
++                break;
++
++            layer->nb_planes = 3;
++            layer->planes[1].object_index = 0;
++            layer->planes[1].offset = bpl0 * height;
++            layer->planes[1].pitch = bpl0 / 2;
++            layer->planes[2].object_index = 0;
++            layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4);
++            layer->planes[2].pitch = bpl0 / 2;
++            break;
++
++        default:
++            drm_desc->nb_layers = 0;
++            return AVERROR(EINVAL);
++    }
++
++    drm_desc->nb_objects = 0;
++    for (i = 0; i < avbuf->num_planes; i++) {
++        memset(&expbuf, 0, sizeof(expbuf));
++
++        expbuf.index = avbuf->buffer.index;
++        expbuf.type = avbuf->buffer.type;
++        expbuf.plane = i;
++
++        ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
++        if (ret < 0)
++            return AVERROR(errno);
++
++        drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ?
++            avbuf->buffer.m.planes[i].length : avbuf->buffer.length;
++        drm_desc->objects[i].fd = expbuf.fd;
++        drm_desc->objects[i].format_modifier = mod;
++        drm_desc->nb_objects = i + 1;
++    }
++
++    return 0;
++}
++
++static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
++{
++    struct v4l2_format *fmt = &queue->format;
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    struct v4l2_requestbuffers req;
++    int ret, i, multiplanar;
++    uint32_t memory;
++    // Output queue buffers use DMABUF memory; capture queue buffers use MMAP
++    memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
++        V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
++
++    multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
++
++    memset(&req, 0, sizeof(req));
++    req.count = queue->num_buffers;
++    req.memory = memory;
++    req.type = fmt->type;
++
++    ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
++    if (ret < 0) {
++        const int err = errno;  // latch errno: the logging call may change it
++        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(err));
++        return AVERROR(err);
++    }
++    // The driver may have granted a different count; av_calloc checks the size multiply
++    queue->num_buffers = req.count;
++    queue->buffers = av_calloc(queue->num_buffers, sizeof(*queue->buffers));
++    if (!queue->buffers) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
++        queue->num_buffers = 0;  // keep the count in step with the (absent) array
++        return AVERROR(ENOMEM);
++    }
++
++    for (i = 0; i < queue->num_buffers; i++) {
++        V4L2Buffer * const buf = &queue->buffers[i];
++
++        buf->enqueued = 0;
++        buf->q = queue;
++
++        buf->buffer.type = fmt->type;
++        buf->buffer.memory = memory;
++        buf->buffer.index = i;
++
++        if (multiplanar) {
++            buf->buffer.length = VIDEO_MAX_PLANES;
++            buf->buffer.m.planes = buf->planes;
++        }
++
++        drm_frame_init(&buf->drm_frame);
++    }
++    // Query each buffer; capture buffers are queued straight away and exported as DRM objects
++    for (i = 0; i < queue->num_buffers; i++) {
++        V4L2Buffer * const buf = &queue->buffers[i];
++
++        ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
++        if (ret < 0) {
++            ret = AVERROR(errno);
++
++            goto fail;
++        }
++
++        buf->num_planes = multiplanar ? buf->buffer.length : 1;
++
++        if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
++            ret = deint_v4l2m2m_enqueue_buffer(buf);
++            if (ret)
++                goto fail;
++
++            ret = v4l2_buffer_export_drm(queue, buf);
++            if (ret)
++                goto fail;
++        }
++    }
++
++    return 0;
++
++fail:
++    avbufs_delete(&queue->buffers, queue->num_buffers);
++    queue->num_buffers = 0;
++    return ret;
++}
++
++static int deint_v4l2m2m_streamon(V4L2Queue *queue)
++{
++    DeintV4L2M2MContextShared * const ctx = queue->ctx;
++    int type = queue->format.type;
++    int ret, err;
++
++    // Start streaming on this queue; returns 0 or a negative AVERROR code
++    ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
++    err = ret < 0 ? AVERROR(errno) : 0;  // captured before av_log() can clobber errno
++    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, err);
++
++    return err;
++}
++
++static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
++{
++    DeintV4L2M2MContextShared * const ctx = queue->ctx;
++    int type = queue->format.type;
++    int ret, err;
++
++    // Stop streaming on this queue; returns 0 or a negative AVERROR code
++    ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
++    err = ret < 0 ? AVERROR(errno) : 0;  // captured before av_log() can clobber errno
++    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, err);
++
++    return err;
++}
++
++// Wait up to timeout ms for a buffer on this queue and dequeue it; NULL if none could be dequeued
++static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
++{
++    struct v4l2_plane planes[VIDEO_MAX_PLANES];
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    struct v4l2_buffer buf = { 0 };
++    V4L2Buffer* avbuf = NULL;
++    struct pollfd pfd;
++    short events;
++    int ret;
++
++    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++        events =  POLLOUT | POLLWRNORM;
++    else
++        events = POLLIN | POLLRDNORM;
++
++    pfd.events = events;
++    pfd.fd = ctx->fd;
++    // Retry only when poll() itself failed with EINTR; on ret == 0 (timeout) errno is stale
++    for (;;) {
++        ret = poll(&pfd, 1, timeout);
++        if (ret > 0)
++            break;
++        if (ret < 0 && errno == EINTR)
++            continue;
++        return NULL;
++    }
++
++    if (pfd.revents & POLLERR)
++        return NULL;
++
++    if (pfd.revents & events) {
++        memset(&buf, 0, sizeof(buf));
++        buf.memory = V4L2_MEMORY_MMAP;
++        buf.type = queue->format.type;
++        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++            memset(planes, 0, sizeof(planes));
++            buf.length = VIDEO_MAX_PLANES;
++            buf.m.planes = planes;
++        }
++
++        ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
++        if (ret) {
++            if (errno != EAGAIN)
++                av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
++                       av_err2str(AVERROR(errno)));
++            return NULL;
++        }
++        // Copy the dequeued state into our own record for that buffer index
++        avbuf = &queue->buffers[buf.index];
++        avbuf->enqueued = 0;
++        avbuf->buffer = buf;
++        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
++            memcpy(avbuf->planes, planes, sizeof(planes));
++            avbuf->buffer.m.planes = avbuf->planes;
++        }
++        return avbuf;
++    }
++
++    return NULL;
++}
++
++static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
++{
++    // Linear scan: return the first buffer whose enqueued flag is clear
++    int idx;
++
++    for (idx = 0; idx < queue->num_buffers; idx++) {
++        V4L2Buffer * const candidate = &queue->buffers[idx];
++        if (!candidate->enqueued)
++            return candidate;
++    }
++    return NULL;  // every buffer is flagged as enqueued
++}
++
++static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
++{
++    // Unref the frame held by every buffer still flagged as enqueued
++    int idx;
++
++    if (queue == NULL || queue->buffers == NULL)
++        return;
++    for (idx = 0; idx < queue->num_buffers; idx++) {
++        V4L2Buffer * const held = &queue->buffers[idx];
++        if (held->enqueued)
++            av_frame_unref(&held->frame);
++    }
++}
++
++static void recycle_q(V4L2Queue * const queue)
++{
++    V4L2Buffer *done;
++    // Non-blocking (timeout 0) drain: unref the frame of each buffer that can be dequeued
++    while ((done = deint_v4l2m2m_dequeue_buffer(queue, 0)) != NULL)
++        av_frame_unref(&done->frame);
++}
++
++static int count_enqueued(V4L2Queue *queue)
++{
++    // Number of buffers in this queue whose enqueued flag is set (0 when none are allocated)
++    int total = 0;
++    int idx;
++
++    if (!queue->buffers)
++        return total;
++
++    for (idx = queue->num_buffers - 1; idx >= 0; idx--)
++        total += queue->buffers[idx].enqueued ? 1 : 0;
++    return total;
++}
++
++static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
++{
++    DeintV4L2M2MContextShared *const ctx = queue->ctx;
++    AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
++    V4L2Buffer *buf;
++    int i;
++    // Queue one frame (frame->data[0] is read as an AVDRMFrameDescriptor); returns 0 or a negative AVERROR code
++    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
++        recycle_q(queue);
++    // Pick a buffer that is not currently flagged as enqueued
++    buf = deint_v4l2m2m_find_free_buf(queue);
++    if (!buf) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
++        return AVERROR(EAGAIN);
++    }
++    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
++        for (i = 0; i < drm_desc->nb_objects; i++)
++            buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
++    else
++        buf->buffer.m.fd = drm_desc->objects[0].fd;
++    // Translate the frame's interlace flags into a V4L2 field value
++    buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
++        frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
++            V4L2_FIELD_INTERLACED_BT;
++
++    if (ctx->field_order != buf->buffer.field) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
++        ctx->field_order = buf->buffer.field;
++    }
++    // This timestamp is what deint_v4l2m2m_dequeue_frame() later passes to pts_track_get_frame()
++    buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
++
++    buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
++    // buf->frame takes over the reference; the caller's frame is left empty
++    av_frame_move_ref(&buf->frame, frame);
++
++    return deint_v4l2m2m_enqueue_buffer(buf);
++}
++
++static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
++{
++    if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {  // true only when this call drops the last reference
++        V4L2Queue *capture = &ctx->capture;
++        V4L2Queue *output  = &ctx->output;
++
++        av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
++        // Stop both queues (only if a device is open) before tearing the buffers down
++        if (ctx->fd >= 0) {
++            deint_v4l2m2m_streamoff(capture);
++            deint_v4l2m2m_streamoff(output);
++        }
++
++        avbufs_delete(&capture->buffers, capture->num_buffers);
++        // Unref frames still held by OUTPUT buffers flagged as enqueued
++        deint_v4l2m2m_unref_queued(output);
++
++        av_buffer_unref(&ctx->hw_frames_ctx);
++
++        if (capture->buffers)
++            av_free(capture->buffers);
++
++        if (output->buffers)
++            av_free(output->buffers);
++
++        if (ctx->fd >= 0) {
++            close(ctx->fd);
++            ctx->fd = -1;
++        }
++
++        av_free(ctx);
++    }
++}
++
++static void v4l2_free_buffer(void *opaque, uint8_t *unused)
++{
++    V4L2Buffer *buf                = opaque;
++    DeintV4L2M2MContextShared *ctx = buf->q->ctx;
++    // Free callback passed to av_buffer_create() in deint_v4l2m2m_dequeue_frame(); opaque is the V4L2Buffer
++    if (!ctx->done)
++        deint_v4l2m2m_enqueue_buffer(buf);
++    // Drop the context reference that was added when the frame was handed out
++    deint_v4l2m2m_destroy_context(ctx);
++}
++
++// Dequeue one buffer and wrap it in frame; timeout in ms. Returns 0, AVERROR(EAGAIN), AVERROR_EOF or another AVERROR code
++static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
++{
++    DeintV4L2M2MContextShared *ctx = queue->ctx;
++    V4L2Buffer* avbuf;
++    enum AVColorPrimaries color_primaries;
++    enum AVColorSpace colorspace;
++    enum AVColorTransferCharacteristic color_trc;
++    enum AVColorRange color_range;
++
++    av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++    if (queue->eos) {
++        av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__);
++        return AVERROR_EOF;
++    }
++
++    avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
++    if (!avbuf) {
++        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
++        return AVERROR(EAGAIN);
++    }
++    // NOTE(review): the early returns below do not hand avbuf back to the driver - confirm this is intended
++    if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) {
++        if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0)
++            queue->eos = 1;
++        if (buf_bytesused0(&avbuf->buffer) == 0)
++            return queue->eos ? AVERROR_EOF : AVERROR(EINVAL);
++    }
++
++    // Fill in PTS and ancillary info from src frame
++    pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
++    // Wrap the buffer's DRM descriptor; v4l2_free_buffer() is the free callback for this AVBuffer
++    frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
++                            sizeof(avbuf->drm_frame), v4l2_free_buffer,
++                            avbuf, AV_BUFFER_FLAG_READONLY);
++    if (!frame->buf[0]) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
++        return AVERROR(ENOMEM);
++    }
++    // Matching decrement happens in v4l2_free_buffer() via deint_v4l2m2m_destroy_context()
++    atomic_fetch_add(&ctx->refcount, 1);
++
++    frame->data[0] = (uint8_t *)&avbuf->drm_frame;
++    frame->format = AV_PIX_FMT_DRM_PRIME;
++    if (ctx->hw_frames_ctx)
++        frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
++    frame->height = ctx->output_height;
++    frame->width = ctx->output_width;
++
++    color_primaries = get_color_primaries(&ctx->capture.format);
++    colorspace      = get_color_space(&ctx->capture.format);
++    color_trc       = get_color_trc(&ctx->capture.format);
++    color_range     = get_color_range(&ctx->capture.format);
++
++    // If the color parameters are unspecified by V4L2 then leave alone as they
++    // will have been copied from src
++    if (color_primaries != AVCOL_PRI_UNSPECIFIED)
++        frame->color_primaries = color_primaries;
++    if (colorspace != AVCOL_SPC_UNSPECIFIED)
++        frame->colorspace = colorspace;
++    if (color_trc != AVCOL_TRC_UNSPECIFIED)
++        frame->color_trc = color_trc;
++    if (color_range != AVCOL_RANGE_UNSPECIFIED)
++        frame->color_range = color_range;
++
++    if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) {
++        // Not interlaced now
++        frame->interlaced_frame = 0;   // *** Fill in from dst buffer?
++        frame->top_field_first = 0;
++        // Pkt duration halved
++        frame->pkt_duration /= 2;
++    }
++
++    if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
++        av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
++        frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
++    }
++
++    av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
++    return 0;
++}
++
++static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
++{
++    AVFilterLink *inlink           = outlink->src->inputs[0];
++    AVFilterContext *avctx         = outlink->src;
++    DeintV4L2M2MContext *priv      = avctx->priv;
++    DeintV4L2M2MContextShared *ctx = priv->shared;
++    int ret;
++    // Output link config: derive output size, frame rate and SAR, then locate a device
++    ctx->height = avctx->inputs[0]->h;
++    ctx->width = avctx->inputs[0]->w;
++    // Only the scaler evaluates the w/h expressions; otherwise output size equals input size
++    if (ctx->filter_type == FILTER_V4L2_SCALE) {
++        if ((ret = ff_scale_eval_dimensions(priv,
++                                            priv->w_expr, priv->h_expr,
++                                            inlink, outlink,
++                                            &ctx->output_width, &ctx->output_height)) < 0)
++            return ret;
++
++        ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height,
++                                   priv->force_original_aspect_ratio, priv->force_divisible_by);
++    }
++    else {
++        ctx->output_width  = ctx->width;
++        ctx->output_height = ctx->height;
++    }
++    // Note: outlink->frame_rate is logged here before the deinterlace case updates it below
++    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__,
++           ctx->width, ctx->height, ctx->output_width, ctx->output_height,
++           inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den);
++
++    outlink->time_base           = inlink->time_base;
++    outlink->w                   = ctx->output_width;
++    outlink->h                   = ctx->output_height;
++    outlink->format              = inlink->format;
++    if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0)
++        outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den};
++    // A set input SAR is multiplied by (out_h * in_w) / (out_w * in_h); an unset one is copied as is
++    if (inlink->sample_aspect_ratio.num)
++        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
++    else
++        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
++
++    ret = deint_v4l2m2m_find_device(ctx);
++    if (ret)
++        return ret;
++    // Keep our own reference to the input's hw frames context, if it has one
++    if (inlink->hw_frames_ctx) {
++        ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
++        if (!ctx->hw_frames_ctx)
++            return AVERROR(ENOMEM);
++    }
++    return 0;
++}
++
++static int deint_v4l2m2m_query_formats(AVFilterContext *avctx)
++{
++    static const enum AVPixelFormat pixel_formats[] = {
++        AV_PIX_FMT_DRM_PRIME,
++//        AV_PIX_FMT_YUV420P,
++        AV_PIX_FMT_NONE,
++    };
++    // DRM PRIME is the only format offered; the list is AV_PIX_FMT_NONE terminated
++    return ff_set_common_formats(avctx, ff_make_format_list(pixel_formats));
++}
++
++static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
++{
++    const uint64_t mod = drm_desc->objects[0].format_modifier;
++    const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID);
++    // Map a DRM frame descriptor to a V4L2 pixel format; 0 means unsupported
++    // Only currently support single object things
++    if (drm_desc->nb_objects != 1)
++        return 0;
++
++    switch (drm_desc->layers[0].format) {
++    case DRM_FORMAT_YUV420:
++        return is_linear ? V4L2_PIX_FMT_YUV420 : 0;
++    case DRM_FORMAT_NV12:
++        return is_linear ? V4L2_PIX_FMT_NV12 :
++#if CONFIG_SAND
++            fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 :
++#endif
++            0;
++    default:
++        break;
++    }
++    return 0;
++}
++
++static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
++{
++    AVFilterContext *avctx         = link->dst;
++    DeintV4L2M2MContext *priv      = avctx->priv;
++    DeintV4L2M2MContextShared *ctx = priv->shared;
++    V4L2Queue *capture             = &ctx->capture;
++    V4L2Queue *output              = &ctx->output;
++    int ret;
++    // Queue one input frame; while field_order is still V4L2_FIELD_ANY both V4L2 queues are set up first
++    av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n",
++           __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
++    av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
++           avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
++
++    if (ctx->field_order == V4L2_FIELD_ANY) {
++        const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
++        uint32_t pixelformat = desc_pixelformat(drm_desc);
++
++        if (pixelformat == 0) {
++            av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
++                   av_fourcc2str(drm_desc->layers[0].format),
++                   drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
++            return AVERROR(EINVAL);
++        }
++        // NOTE(review): geometry is taken from plane 0 pitch and plane 1 offset - assumes a second plane exists
++        ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
++        ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
++
++        av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
++           drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
++
++        if ((ret = set_src_fmt(output, in)) != 0) {
++            av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n",
++                   av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier);
++            return ret;
++        }
++
++        ret = do_s_fmt(output);
++        if (ret) {
++            av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n");
++            return ret;
++        }
++        // An explicitly requested output format overrides the format derived from the input
++        if (ctx->output_format != AV_PIX_FMT_NONE)
++           pixelformat = fmt_av_to_v4l2(ctx->output_format);
++        ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height);
++        if (ret) {
++            av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n");
++            return ret;
++        }
++
++        ret = deint_v4l2m2m_allocate_buffers(capture);
++        if (ret) {
++            av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n");
++            return ret;
++        }
++
++        ret = deint_v4l2m2m_streamon(capture);
++        if (ret) {
++            av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret));
++            return ret;
++        }
++
++        ret = deint_v4l2m2m_allocate_buffers(output);
++        if (ret) {
++            av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n");
++            return ret;
++        }
++
++        ret = deint_v4l2m2m_streamon(output);
++        if (ret) {
++            av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret));
++            return ret;
++        }
++        // From here on field_order != V4L2_FIELD_ANY, so this setup is not repeated
++        if (in->top_field_first)
++            ctx->field_order = V4L2_FIELD_INTERLACED_TB;
++        else
++            ctx->field_order = V4L2_FIELD_INTERLACED_BT;
++        // Probe whether the driver accepts V4L2_ENC_CMD_STOP; ack_inlink() uses has_enc_stop when draining
++        {
++            struct v4l2_encoder_cmd ecmd = {
++                .cmd = V4L2_ENC_CMD_STOP
++            };
++            ctx->has_enc_stop = 0;
++            if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) {
++                av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n");
++                ctx->has_enc_stop = 1;
++            }
++            else {
++                av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno)));
++            }
++
++        }
++    }
++
++    ret = deint_v4l2m2m_enqueue_frame(output, in);
++
++    av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
++    return ret;
++}
++
++static int
++ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s,
++           AVFilterLink * const inlink)
++{
++    int instatus;
++    int64_t inpts;
++    // Record an acknowledged input status and pick the drain state; returns 1 if a status was acknowledged, else 0
++    if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0)
++        return 0;
++
++    s->drain      = instatus;
++    s->drain_pts  = inpts;
++    s->drain_state = DRAIN_TIMEOUT;
++    // DRAIN_TIMEOUT remains the state when none of the cases below applies or the stop command fails
++    if (s->field_order == V4L2_FIELD_ANY) {  // Not yet started
++        s->drain_state = DRAIN_DONE;
++    }
++    else if (s->one_to_one) {
++        s->drain_state = DRAIN_LAST;
++    }
++    else if (s->has_enc_stop) {
++        struct v4l2_encoder_cmd ecmd = {
++            .cmd = V4L2_ENC_CMD_STOP
++        };
++        if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) {
++            av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n");
++            s->drain_state = DRAIN_EOS;
++        }
++        else {
++            av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno)));
++        }
++    }
++    return 1;
++}
++
++static int deint_v4l2m2m_activate(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext * const priv = avctx->priv;
++    DeintV4L2M2MContextShared *const s = priv->shared;
++    AVFilterLink * const outlink = avctx->outputs[0];
++    AVFilterLink * const inlink = avctx->inputs[0];
++    int n = 0;
++    int cn = 99;
++    int did_something = 0;
++    // Activate callback: try to dequeue one processed frame, then feed input frames to the OUTPUT queue
++    av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
++
++    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
++
++    ack_inlink(avctx, s, inlink);
++
++    if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
++    {
++        AVFrame * frame = av_frame_alloc();
++        int rv;
++
++        recycle_q(&s->output);
++        n = count_enqueued(&s->output);
++
++        if (frame == NULL) {
++            av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
++            return AVERROR(ENOMEM);
++        }
++        // Wait up to 300ms when a drain frame is expected or more than 4 source buffers are queued, else just poll
++        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame,
++                                         drain_frame_expected(s->drain_state) || n > 4 ? 300 : 0);
++        if (rv != 0) {
++            av_frame_free(&frame);
++            if (rv == AVERROR_EOF) {
++                av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__);
++                s->drain_state = DRAIN_DONE;
++            }
++            else if (rv == AVERROR(EAGAIN)) {
++                if (s->drain_state != DRAIN_NONE) {
++                    av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__);
++                    s->drain_state = DRAIN_DONE;
++                }
++            }
++            else {
++                av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
++                return rv;
++            }
++        }
++        else {
++            frame->interlaced_frame = 0;
++            // frame is always consumed by filter_frame - even on error despite
++            // a somewhat confusing comment in the header
++            rv = ff_filter_frame(outlink, frame);
++            ++s->frames_tx;
++            // NOTE(review): rv from ff_filter_frame() is only logged, not returned - confirm intended
++            av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
++            did_something = 1;
++
++            if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) {
++                av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__);
++                s->drain_state = DRAIN_DONE;
++            }
++        }
++
++        cn = count_enqueued(&s->capture);
++    }
++    // Drain finished: pass the recorded input status and pts on to the output link
++    if (s->drain_state == DRAIN_DONE) {
++        ff_outlink_set_status(outlink, s->drain, s->drain_pts);
++        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain));
++        return 0;
++    }
++
++    recycle_q(&s->output);
++    n = count_enqueued(&s->output);
++    // Feed input while fewer than 6 source buffers are enqueued and no input status has been recorded
++    while (n < 6 && !s->drain) {
++        AVFrame * frame;
++        int rv;
++
++        if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
++            av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
++            return rv;
++        }
++
++        if (frame == NULL) {
++            av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
++            if (!ack_inlink(avctx, s, inlink)) {
++                ff_inlink_request_frame(inlink);
++                av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
++            }
++            break;
++        }
++        ++s->frames_rx;
++
++        rv = deint_v4l2m2m_filter_frame(inlink, frame);
++        av_frame_free(&frame);
++
++        if (rv != 0)
++            return rv;
++
++        av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
++        did_something = 1;
++        ++n;
++    }
++    // Ask to be run again when output is wanted and either more than 4 buffers are queued or we are draining
++    if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) {
++        ff_filter_set_ready(avctx, 1);
++        did_something = 1;
++        av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
++    }
++
++    av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
++    return did_something ? 0 : FFERROR_NOT_READY;
++}
++
++static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type)
++{
++    DeintV4L2M2MContext * const priv = avctx->priv;
++    DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
++    // Shared init for both filters: allocate the ref-counted shared context and parse the string options
++    if (!ctx) {
++        av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
++        return AVERROR(ENOMEM);
++    }
++    priv->shared = ctx;
++    ctx->logctx = priv;
++    ctx->filter_type = filter_type;
++    ctx->fd = -1;
++    ctx->output.ctx = ctx;
++    ctx->output.num_buffers = 8;
++    ctx->output.name = "OUTPUT";
++    ctx->capture.ctx = ctx;
++    ctx->capture.num_buffers = 12;
++    ctx->capture.name = "CAPTURE";
++    ctx->done = 0;
++    ctx->field_order = V4L2_FIELD_ANY;
++
++    pts_track_init(&ctx->track, priv);
++    // Initial reference; deint_v4l2m2m_uninit() releases one via deint_v4l2m2m_destroy_context()
++    atomic_init(&ctx->refcount, 1);
++
++    if (priv->output_format_string) {
++        ctx->output_format = av_get_pix_fmt(priv->output_format_string);
++        if (ctx->output_format == AV_PIX_FMT_NONE) {
++            av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string);
++            return AVERROR(EINVAL);
++        }
++        if (fmt_av_to_v4l2(ctx->output_format) == 0) {
++            av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format));
++            return AVERROR(EINVAL);
++        }
++    } else {
++        // Use the input format once that is configured.
++        ctx->output_format = AV_PIX_FMT_NONE;
++    }
++    // STRING_OPTION: parse priv-><name>_string with av_<func>_from_name() into priv-><name>, else store the default
++#define STRING_OPTION(var_name, func_name, default_value) do { \
++        if (priv->var_name ## _string) { \
++            int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \
++            if (var < 0) { \
++                av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \
++                return AVERROR(EINVAL); \
++            } \
++            priv->var_name = var; \
++        } else { \
++            priv->var_name = default_value; \
++        } \
++    } while (0)
++
++    STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED);
++    STRING_OPTION(colour_transfer,  color_transfer,  AVCOL_TRC_UNSPECIFIED);
++    STRING_OPTION(colour_matrix,    color_space,     AVCOL_SPC_UNSPECIFIED);
++    STRING_OPTION(chroma_location,  chroma_location, AVCHROMA_LOC_UNSPECIFIED);
++
++    return 0;
++}
++
++static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
++{
++    return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE);  // shared init with the deinterlace filter type
++}
++
++static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx)
++{
++    // Shared init with the scale filter type; on success also sets one_to_one on the shared context
++    const int err = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE);
++    DeintV4L2M2MContext *scaler;
++
++    if (err != 0)
++        return err;
++
++    // priv->shared was assigned by common_v4l2m2m_init()
++    scaler = avctx->priv;
++    scaler->shared->one_to_one = 1;
++
++    return 0;
++}
++
++static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
++{
++    DeintV4L2M2MContext *priv = avctx->priv;
++    DeintV4L2M2MContextShared *ctx = priv->shared;
++    if (!ctx)  // uninit also runs when init failed before the shared context was allocated
++        return;
++    av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n", ctx->frames_rx, ctx->frames_tx);
++    ctx->done = 1;
++    ctx->logctx = NULL;  // Log to NULL works, log to missing crashes
++    pts_track_uninit(&ctx->track);
++    deint_v4l2m2m_destroy_context(ctx);
++}
++
++static const AVOption deinterlace_v4l2m2m_options[] = {  // the deinterlacer has no options of its own
++    { NULL },
++};
++
++AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
++
++#define OFFSET(x) offsetof(DeintV4L2M2MContext, x)
++#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
++// Options of the scale_v4l2m2m filter; the *_string ones are parsed in common_v4l2m2m_init()
++static const AVOption scale_v4l2m2m_options[] = {
++    { "w", "Output video width",
++      OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS },
++    { "h", "Output video height",
++      OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS },
++    { "format", "Output video format (software format of hardware frames)",
++      OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS },
++    // These colour properties match the ones of the same name in vf_scale.
++    { "out_color_matrix", "Output colour matrix coefficient set",
++      OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS },
++    { "out_range", "Output colour range",
++      OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED },
++      AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" },
++        { "full",    "Full range",
++          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
++        { "limited", "Limited range",
++          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
++        { "jpeg",    "Full range",
++          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
++        { "mpeg",    "Limited range",
++          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
++        { "tv",      "Limited range",
++          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
++        { "pc",      "Full range",
++          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
++    // These colour properties match the ones in the VAAPI scaler
++    { "out_color_primaries", "Output colour primaries",
++      OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING,
++      { .str = NULL }, .flags = FLAGS },
++    { "out_color_transfer", "Output colour transfer characteristics",
++      OFFSET(colour_transfer_string),  AV_OPT_TYPE_STRING,
++      { .str = NULL }, .flags = FLAGS },
++    { "out_chroma_location", "Output chroma sample location",
++      OFFSET(chroma_location_string),  AV_OPT_TYPE_STRING,
++      { .str = NULL }, .flags = FLAGS },
++    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
++    { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS },
++    { NULL },
++};
++
++AVFILTER_DEFINE_CLASS(scale_v4l2m2m);
++
++static const AVFilterPad deint_v4l2m2m_inputs[] = {  // single video input pad, used by both filters below
++    {
++        .name         = "default",
++        .type         = AVMEDIA_TYPE_VIDEO,
++    },
++    { NULL }
++};
++
++static const AVFilterPad deint_v4l2m2m_outputs[] = {  // single video output pad, used by both filters below
++    {
++        .name          = "default",
++        .type          = AVMEDIA_TYPE_VIDEO,
++        .config_props  = deint_v4l2m2m_config_props,
++    },
++    { NULL }
++};
++
++AVFilter ff_vf_deinterlace_v4l2m2m = {  // deinterlacer: differs from the scaler in name, init and option class
++    .name           = "deinterlace_v4l2m2m",
++    .description    = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
++    .priv_size      = sizeof(DeintV4L2M2MContext),
++    .init           = &deint_v4l2m2m_init,
++    .uninit         = &deint_v4l2m2m_uninit,
++    .query_formats  = &deint_v4l2m2m_query_formats,
++    .inputs         = deint_v4l2m2m_inputs,
++    .outputs        = deint_v4l2m2m_outputs,
++    .priv_class     = &deinterlace_v4l2m2m_class,
++    .activate       = deint_v4l2m2m_activate,
++};
++
++AVFilter ff_vf_scale_v4l2m2m = {  // scaler: shares uninit, query_formats, pads and activate with the deinterlacer
++    .name           = "scale_v4l2m2m",
++    .description    = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"),
++    .priv_size      = sizeof(DeintV4L2M2MContext),
++    .init           = &scale_v4l2m2m_init,
++    .uninit         = &deint_v4l2m2m_uninit,
++    .query_formats  = &deint_v4l2m2m_query_formats,
++    .inputs         = deint_v4l2m2m_inputs,
++    .outputs        = deint_v4l2m2m_outputs,
++    .priv_class     = &scale_v4l2m2m_class,
++    .activate       = deint_v4l2m2m_activate,
++};
++
+diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c
+new file mode 100644
+index 0000000000..61c03a385c
+--- /dev/null
++++ b/libavfilter/vf_unsand.c
+@@ -0,0 +1,229 @@
++/*
++ * Copyright (c) 2007 Bobby Bingham
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU Lesser General Public
++ * License as published by the Free Software Foundation; either
++ * version 2.1 of the License, or (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++ * Lesser General Public License for more details.
++ *
++ * You should have received a copy of the GNU Lesser General Public
++ * License along with FFmpeg; if not, write to the Free Software
++ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
++ */
++
++/**
++ * @file
++ * unsand video filter: convert sand pixel formats to planar yuv
++ */
++
++#include <string.h>
++
++#include "libavutil/internal.h"
++#include "libavutil/mem.h"
++#include "libavutil/pixdesc.h"
++#include "libavutil/opt.h"
++#include "libavutil/rpi_sand_fns.h"
++
++#include "avfilter.h"
++#include "formats.h"
++#include "internal.h"
++#include "video.h"
++
++typedef struct UnsandContext {
++    const AVClass *class;
++} UnsandContext;
++
++static av_cold void uninit(AVFilterContext *ctx)
++{
++//    UnsandContext *s = ctx->priv;
++}
++
++static av_cold int init(AVFilterContext *ctx)
++{
++//    UnsandContext *s = ctx->priv;
++
++    return 0;
++}
++
++
++static int filter_frame(AVFilterLink *link, AVFrame *in)
++{
++    AVFilterLink * const outlink = link->dst->outputs[0];
++    AVFrame *out = NULL;
++    int rv = 0;
++
++    if (outlink->format == in->format) {
++        // If nothing to do then do nothing
++        out = in;
++    }
++    else
++    {
++        if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
++        {
++            rv = AVERROR(ENOMEM);
++            goto fail;
++        }
++        if (av_rpi_sand_to_planar_frame(out, in) != 0)
++        {
++            rv = -1;
++            goto fail;
++        }
++
++        av_frame_free(&in);
++    }
++
++    return ff_filter_frame(outlink, out);
++
++fail:
++    av_frame_free(&out);
++    av_frame_free(&in);
++    return rv;
++}
++
++#if 0
++static void dump_fmts(const AVFilterFormats * fmts)
++{
++    int i;
++    if (fmts== NULL) {
++        printf("NULL\n");
++        return;
++    }
++    for (i = 0; i < fmts->nb_formats; ++i) {
++        printf(" %d", fmts->formats[i]);
++    }
++    printf("\n");
++}
++#endif
++
++static int query_formats(AVFilterContext *ctx)
++{
++    // Derive the output format list from the input's: sand/420 formats map to planar, others pass through
++    int ret;
++
++    // If we aren't connected at both ends then just do nothing
++    if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
++        return 0;
++
++    // Our output formats depend on our input formats and we can't/don't
++    // want to convert between bit depths so we need to wait for the source
++    // to have an opinion before we do
++    if (ctx->inputs[0]->incfg.formats == NULL)
++        return AVERROR(EAGAIN);
++
++    // Accept anything
++    if (ctx->inputs[0]->outcfg.formats == NULL &&
++        (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0)
++        return ret;
++
++    // Filter out sand formats
++
++    // Generate a container if we don't already have one
++    if (ctx->outputs[0]->incfg.formats == NULL)
++    {
++        // Somewhat rubbish way of ensuring we have a good structure
++        static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
++        AVFilterFormats *formats = ff_make_format_list(out_fmts);
++
++        if (formats == NULL)
++            return AVERROR(ENOMEM);
++        if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0)
++            return ret;
++    }
++
++    // Replace old format list with new filtered list derived from what our
++    // input says it can do
++    {
++        const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats;
++        AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats;
++        enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
++        int i, n = 0;
++        int seen_420p = 0, seen_420p10 = 0;
++
++        if (dst_fmts == NULL)
++            return AVERROR(ENOMEM);
++        for (i = 0; i < src_ff->nb_formats; ++i) {
++            const enum AVPixelFormat f = src_ff->formats[i];
++
++            switch (f){
++                case AV_PIX_FMT_YUV420P:
++                case AV_PIX_FMT_SAND128:
++                case AV_PIX_FMT_RPI4_8:
++                    if (!seen_420p) {
++                        seen_420p = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P;
++                    }
++                    break;
++                case AV_PIX_FMT_SAND64_10:
++                case AV_PIX_FMT_YUV420P10:
++                case AV_PIX_FMT_RPI4_10:
++                    if (!seen_420p10) {
++                        seen_420p10 = 1;
++                        dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
++                    }
++                    break;
++                default:
++                    dst_fmts[n++] = f;
++                    break;
++            }
++        }
++
++        av_freep(&dst_ff->formats);
++        dst_ff->formats = dst_fmts;
++        dst_ff->nb_formats = n;
++    }
++
++//    printf("Unsand: %s calc: ", __func__);
++//    dump_fmts(ctx->outputs[0]->incfg.formats);
++
++    return 0;
++}
++
++
++#define OFFSET(x) offsetof(UnsandContext, x)
++static const AVOption unsand_options[] = {
++    { NULL }
++};
++
++
++AVFILTER_DEFINE_CLASS(unsand);
++
++static const AVFilterPad avfilter_vf_unsand_inputs[] = {
++    {
++        .name             = "default",
++        .type             = AVMEDIA_TYPE_VIDEO,
++        .filter_frame = filter_frame,
++    },
++    { NULL }
++};
++
++static const AVFilterPad avfilter_vf_unsand_outputs[] = {
++    {
++        .name = "default",
++        .type = AVMEDIA_TYPE_VIDEO
++    },
++    { NULL }
++};
++
++AVFilter ff_vf_unsand = {
++    .name          = "unsand",
++    .description   = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
++
++    .init          = init,
++    .uninit        = uninit,
++
++    .query_formats = query_formats,
++
++    .priv_size     = sizeof(UnsandContext),
++    .priv_class    = &unsand_class,
++
++    .inputs        = avfilter_vf_unsand_inputs,
++    .outputs       = avfilter_vf_unsand_outputs,
++};
++
+diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
+index b1e70b3bc6..b9e3a25921 100644
+--- a/libavfilter/x86/vf_bwdif_init.c
++++ b/libavfilter/x86/vf_bwdif_init.c
+@@ -51,11 +51,9 @@ void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *ne
+                                       int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                       int mrefs4, int parity, int clip_max);
+ 
+-av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif)
++av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif, int bit_depth)
+ {
+-    YADIFContext *yadif = &bwdif->yadif;
+     int cpu_flags = av_get_cpu_flags();
+-    int bit_depth = (!yadif->csp) ? 8 : yadif->csp->comp[0].depth;
+ 
+     if (bit_depth <= 8) {
+ #if ARCH_X86_32
+diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
+index b4284a8778..692265593c 100644
+--- a/libavformat/matroskaenc.c
++++ b/libavformat/matroskaenc.c
+@@ -58,6 +58,9 @@
+  * Info, Tracks, Chapters, Attachments, Tags (potentially twice) and Cues */
+ #define MAX_SEEKHEAD_ENTRIES 7
+ 
++/* Reserved size for H264 headers if not extant at init time */
++#define MAX_H264_HEADER_SIZE 1024
++
+ #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \
+                               !(mkv)->is_live)
+ 
+@@ -721,8 +724,12 @@ static int mkv_write_native_codecprivate(AVFormatContext *s, AVIOContext *pb,
+     case AV_CODEC_ID_WAVPACK:
+         return put_wv_codecpriv(dyn_cp, par);
+     case AV_CODEC_ID_H264:
+-        return ff_isom_write_avcc(dyn_cp, par->extradata,
+-                                  par->extradata_size);
++        if (par->extradata_size)
++            return ff_isom_write_avcc(dyn_cp, par->extradata,
++                                      par->extradata_size);
++        else
++            put_ebml_void(pb, MAX_H264_HEADER_SIZE);
++        break;
+     case AV_CODEC_ID_HEVC:
+         return ff_isom_write_hvcc(dyn_cp, par->extradata,
+                                   par->extradata_size, 0);
+@@ -2259,7 +2266,9 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
+         break;
+     // FIXME: Remove the following once libaom starts propagating extradata during init()
+     //        See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012
++    // H264 V4L2 has a similar issue
+     case AV_CODEC_ID_AV1:
++    case AV_CODEC_ID_H264:
+         if (side_data_size && mkv->track.bc && !par->extradata_size) {
+             AVIOContext *dyn_cp;
+             uint8_t *codecpriv;
+@@ -2267,7 +2276,10 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
+             ret = avio_open_dyn_buf(&dyn_cp);
+             if (ret < 0)
+                 return ret;
+-            ff_isom_write_av1c(dyn_cp, side_data, side_data_size);
++            if (par->codec_id == AV_CODEC_ID_H264)
++                ff_isom_write_avcc(dyn_cp, side_data, side_data_size);
++            else
++                ff_isom_write_av1c(dyn_cp, side_data, side_data_size);
+             codecpriv_size = avio_get_dyn_buf(dyn_cp, &codecpriv);
+             if ((ret = dyn_cp->error) < 0 ||
+                 !codecpriv_size && (ret = AVERROR_INVALIDDATA)) {
+@@ -2275,8 +2287,25 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
+                 return ret;
+             }
+             avio_seek(mkv->track.bc, track->codecpriv_offset, SEEK_SET);
+-            // Do not write the OBUs as we don't have space saved for them
+-            put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4);
++            if (par->codec_id == AV_CODEC_ID_H264) {
++                int filler;
++                // Up to 6 bytes for header and the filler must be at least 2
++                if (codecpriv_size > MAX_H264_HEADER_SIZE - 8) {
++                    av_log(s, AV_LOG_ERROR, "H264 header size %d > %d bytes\n", codecpriv_size, MAX_H264_HEADER_SIZE - 8);
++                    return AVERROR_INVALIDDATA;
++                }
++                put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, codecpriv_size);
++                filler = MAX_H264_HEADER_SIZE - (avio_tell(mkv->track.bc) - track->codecpriv_offset);
++                if (filler < 2) {
++                    av_log(s, AV_LOG_ERROR, "Unexpected SPS/PPS filler length: %d\n", filler);
++                    return AVERROR_BUG;
++                }
++                put_ebml_void(mkv->track.bc, filler);
++            }
++            else {
++                // Do not write the OBUs as we don't have space saved for them
++                put_ebml_binary(mkv->track.bc, MATROSKA_ID_CODECPRIVATE, codecpriv, 4);
++            }
+             ffio_free_dyn_buf(&dyn_cp);
+             ret = ff_alloc_extradata(par, side_data_size);
+             if (ret < 0)
+diff --git a/libavformat/movenc.c b/libavformat/movenc.c
+index 2cd5773dc5..0cbbc094de 100644
+--- a/libavformat/movenc.c
++++ b/libavformat/movenc.c
+@@ -5926,6 +5926,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
+     if (trk->par->codec_id == AV_CODEC_ID_MP4ALS ||
+             trk->par->codec_id == AV_CODEC_ID_AAC ||
+             trk->par->codec_id == AV_CODEC_ID_AV1 ||
++            trk->par->codec_id == AV_CODEC_ID_H264 ||
+             trk->par->codec_id == AV_CODEC_ID_FLAC) {
+         buffer_size_t side_size;
+         uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
+diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
+index 38e4c65c4e..5e04c1df08 100644
+--- a/libavformat/rtpenc.c
++++ b/libavformat/rtpenc.c
+@@ -19,6 +19,7 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#include "avc.h"
+ #include "avformat.h"
+ #include "mpegts.h"
+ #include "internal.h"
+@@ -582,8 +583,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
+         ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0);
+         break;
+     case AV_CODEC_ID_H264:
++    {
++        uint8_t *side_data;
++        int side_data_size = 0;
++
++        side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
++                                            &side_data_size);
++
++        if (side_data_size != 0) {
++            int ps_size = side_data_size;
++            uint8_t * ps_buf = NULL;
++
++            ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size);
++            av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size);
++            ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size);
++            av_free(ps_buf);
++        }
+         ff_rtp_send_h264_hevc(s1, pkt->data, size);
+         break;
++    }
+     case AV_CODEC_ID_H261:
+         ff_rtp_send_h261(s1, pkt->data, size);
+         break;
+diff --git a/libavformat/utils.c b/libavformat/utils.c
+index 75e5350a27..e10b493dae 100644
+--- a/libavformat/utils.c
++++ b/libavformat/utils.c
+@@ -3013,6 +3013,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
+     return 1;
+ }
+ 
++#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
++// This should be quite general purpose but avoid possible conflicts
++// by limiting usage to cases where we know it works.
++static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
++{
++    // Only try fallback if we know it is supported (HEVC only)
++    const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
++        avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
++    int err;
++
++    // Failed to find fallback or we are already at the fallback
++    if (new_codec == NULL || new_codec == old_codec)
++    {
++        return AVERROR_DECODER_NOT_FOUND;
++    }
++
++    // * This may be dodgy - header says to not use this fn,
++    //   especially if we are going to reopen the context...
++    //   (but it does seem to work for our cases)
++    if (avcodec_is_open(avctx)) {
++        avcodec_close(avctx);
++    }
++
++    if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
++    {
++        return err;
++    }
++
++    return 0;
++}
++#else
++#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
++#endif
++
+ /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
+ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+                             const AVPacket *avpkt, AVDictionary **options)
+@@ -3051,7 +3085,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+         av_dict_set(options ? options : &thread_opt, "lowres", "0", 0);
+         if (s->codec_whitelist)
+             av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
+-        ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
++        if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
++        {
++            // Try fallback if it looks worth a try
++            ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
++        }
+         if (!options)
+             av_dict_free(&thread_opt);
+         if (ret < 0) {
+@@ -3082,6 +3120,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st,
+         if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
+             avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
+             ret = avcodec_send_packet(avctx, &pkt);
++
++            // If we are going to want to fall back we should know here
++            if (ret == AVERROR_DECODER_NOT_FOUND) {
++                if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
++                    break;
++                continue;
++            }
++
+             if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
+                 break;
+             if (ret >= 0)
+@@ -3710,9 +3756,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
+         // Try to just open decoders, in case this is enough to get parameters.
+         if (!has_codec_parameters(st, NULL) && st->internal->request_probe <= 0) {
+             if (codec && !avctx->codec)
+-                if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
+-                    av_log(ic, AV_LOG_WARNING,
+-                           "Failed to open codec in %s\n",__FUNCTION__);
++            {
++                int err;
++
++                if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
++                {
++                    if (err == AVERROR_DECODER_NOT_FOUND) {
++                        err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
++                    }
++                    if (err < 0) {
++                        av_log(ic, AV_LOG_WARNING,
++                               "Failed to open codec in %s\n",__FUNCTION__);
++                    }
++                }
++            }
+         }
+         if (!options)
+             av_dict_free(&thread_opt);
+diff --git a/libavutil/Makefile b/libavutil/Makefile
+index 27bafe9e12..c9075ddf8a 100644
+--- a/libavutil/Makefile
++++ b/libavutil/Makefile
+@@ -68,6 +68,7 @@ HEADERS = adler32.h                                                     \
+           rational.h                                                    \
+           replaygain.h                                                  \
+           ripemd.h                                                      \
++	  rpi_sand_fns.h                                                \
+           samplefmt.h                                                   \
+           sha.h                                                         \
+           sha512.h                                                      \
+@@ -87,6 +88,7 @@ HEADERS = adler32.h                                                     \
+           film_grain_params.h                                           \
+ 
+ HEADERS-$(CONFIG_LZO)                   += lzo.h
++HEADERS-$(CONFIG_RPI)                   += rpi_sand_fn_pw.h
+ 
+ ARCH_HEADERS = bswap.h                                                  \
+                intmath.h                                                \
+@@ -182,6 +184,7 @@ OBJS-$(CONFIG_LZO)                      += lzo.o
+ OBJS-$(CONFIG_MEDIACODEC)               += hwcontext_mediacodec.o
+ OBJS-$(CONFIG_OPENCL)                   += hwcontext_opencl.o
+ OBJS-$(CONFIG_QSV)                      += hwcontext_qsv.o
++OBJS-$(CONFIG_SAND)                     += rpi_sand_fns.o
+ OBJS-$(CONFIG_VAAPI)                    += hwcontext_vaapi.o
+ OBJS-$(CONFIG_VIDEOTOOLBOX)             += hwcontext_videotoolbox.o
+ OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
+diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
+index 5613813ba8..ab8bcfcf34 100644
+--- a/libavutil/aarch64/Makefile
++++ b/libavutil/aarch64/Makefile
+@@ -1,4 +1,6 @@
+ OBJS += aarch64/cpu.o                                                 \
+         aarch64/float_dsp_init.o                                      \
+ 
+-NEON-OBJS += aarch64/float_dsp_neon.o
++NEON-OBJS += aarch64/float_dsp_neon.o                                 \
++             aarch64/rpi_sand_neon.o                                  \
++
+diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
+new file mode 100644
+index 0000000000..11658de0c8
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.S
+@@ -0,0 +1,672 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#include "asm.S"
++
++// void ff_rpi_sand8_lines_to_planar_y8(
++//   uint8_t * dest,            : x0
++//   unsigned int dst_stride,   : w1
++//   const uint8_t * src,       : x2
++//   unsigned int src_stride1,  : w3, always 128
++//   unsigned int src_stride2,  : w4
++//   unsigned int _x,           : w5
++//   unsigned int y,            : w6
++//   unsigned int _w,           : w7
++//   unsigned int h);           : [sp, #0]
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++    // w15 contains the number of rows we need to process
++    ldr w15, [sp, #0]
++
++    // w8 will contain the number of blocks per row
++    // w8 = floor(w1/stride1); note w1 is dst_stride (not _w, which is w7)
++    // stride1 is assumed to always be 128
++    mov w8, w1
++    lsr w8, w8, #7
++
++    // in case the width of the image is not a multiple of 128, there will
++    // be an incomplete block at the end of every row
++    // w9 contains the number of pixels stored within this block
++    // w9 = _w - w8 * 128
++    lsl w9, w8, #7
++    sub w9, w7, w9
++
++    // this is the value we have to add to the src pointer after reading a complete block
++    // it will move the address to the start of the next block
++    // w10 = stride2 * stride1 - stride1 
++    mov w10, w4
++    lsl w10, w10, #7
++    sub w10, w10, #128
++
++    // w11 is the row offset, meaning the start offset of the first block of every column
++    // this will be increased with stride1 within every iteration of the row_loop
++    eor w11, w11, w11
++
++    // w12 = 0, processed row count
++    eor w12, w12, w12
++row_loop:
++    // start of the first block within the current row
++    // x13 = row offset + src
++    mov x13, x2
++    add x13, x13, x11
++
++    // w14 = 0, processed block count
++    eor w14, w14, w14
++
++    cmp w8, #0
++    beq no_main_y8
++
++block_loop:
++    // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
++    // fortunately these aren't callee saved ones, meaning we don't need to backup them
++    ld1 { v0.16b,  v1.16b,  v2.16b,  v3.16b}, [x13], #64
++    ld1 { v4.16b,  v5.16b,  v6.16b,  v7.16b}, [x13], #64 
++
++    // write these registers back to the destination vector and increase the dst address by 128
++    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
++    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x0], #64
++
++    // move the source register to the beginning of the next block (x13 = src + block offset)
++    add x13, x13, x10
++    // increase the block counter
++    add w14, w14, #1
++
++    // continue with the block_loop if we haven't copied all full blocks yet
++    cmp w8, w14
++    bgt block_loop
++
++    // handle the last block at the end of each row
++    // at most 127 byte values copied from src to dst
++no_main_y8:
++    eor w5, w5, w5 // i = 0
++incomplete_block_loop_y8:
++    cmp w5, w9
++    bge incomplete_block_loop_end_y8
++
++    ldrb w6, [x13]
++    strb w6, [x0]
++    add x13, x13, #1
++    add x0, x0, #1
++
++    add w5, w5, #1
++    b incomplete_block_loop_y8
++incomplete_block_loop_end_y8:
++    
++   
++    // increase the row offset by 128 (stride1) 
++    add w11, w11, #128
++    // increment the row counter
++    add w12, w12, #1
++    
++    // process the next row if we haven't finished yet
++    cmp w15, w12
++    bgt row_loop
++
++    ret
++endfunc
++
++
++
++// void ff_rpi_sand8_lines_to_planar_c8(
++//   uint8_t * dst_u,           : x0
++//   unsigned int dst_stride_u, : w1 == width
++//   uint8_t * dst_v,           : x2
++//   unsigned int dst_stride_v, : w3 == width
++//   const uint8_t * src,       : x4
++//   unsigned int stride1,      : w5 == 128
++//   unsigned int stride2,      : w6
++//   unsigned int _x,           : w7
++//   unsigned int y,            : [sp, #0]
++//   unsigned int _w,           : [sp, #8]
++//   unsigned int h);           : [sp, #16]
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++    // w7 = width
++    ldr w7, [sp, #8]
++
++    // w15 contains the number of rows we need to process
++    // counts down
++    ldr w15, [sp, #16]
++
++    // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
++    mov w8, w7
++    lsr w8, w8, #6
++
++    // number of pixels in block at the end of every row
++    // w9 = _w - (w8 * 64)
++    lsl w9, w8, #6
++    sub w9, w7, w9
++
++    // Skip at the end of the line to account for stride
++    sub w12, w1, w7
++
++    // address delta to the beginning of the next block
++    // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
++    lsl w10, w6, #7
++    sub w10, w10, #128
++
++    // w11 = row address start offset = 0
++    eor w11, w11, w11
++
++row_loop_c8:
++    // start of the first block within the current row
++    // x13 = row offset + src
++    mov x13, x4
++    add x13, x13, x11
++
++    // w14 = 0, processed block count
++    eor w14, w14, w14
++
++    cmp w8, #0
++    beq no_main_c8
++
++block_loop_c8:
++    // load the full block -> 128 bytes, the block contains 64 interleaved U and V values 
++    ld2 { v0.16b,  v1.16b }, [x13], #32
++    ld2 { v2.16b,  v3.16b }, [x13], #32
++    ld2 { v4.16b,  v5.16b }, [x13], #32
++    ld2 { v6.16b,  v7.16b }, [x13], #32
++
++    // swap register so that we can write them out with a single instruction
++    mov v16.16b, v1.16b
++    mov v17.16b, v3.16b
++    mov v18.16b, v5.16b
++    mov v1.16b, v2.16b
++    mov v2.16b, v4.16b
++    mov v3.16b, v6.16b
++    mov v4.16b, v16.16b
++    mov v5.16b, v17.16b
++    mov v6.16b, v18.16b
++
++    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
++    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x2], #64
++
++    // increment block counter and move src to the beginning of the next block
++    add w14, w14, #1
++    add x13, x13, x10
++    
++    // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
++    cmp w8, w14
++    bgt block_loop_c8
++
++no_main_c8:
++    // handle incomplete block at the end of every row
++    eor w5, w5, w5 // w5 = 0, counts U/V pairs copied from the partial block
++incomplete_block_loop_c8:
++    cmp w5, w9
++    bge incomplete_block_loop_end_c8
++
++    ldrb w1, [x13]
++    strb w1, [x0]
++    add x13, x13, #1
++
++    ldrb w1, [x13]
++    strb w1, [x2]
++    add x13, x13, #1
++
++    add x0, x0, #1
++    add x2, x2, #1
++
++    add w5, w5, #1
++    b incomplete_block_loop_c8
++incomplete_block_loop_end_c8:
++
++    // increase row_offset by stride1
++    add w11, w11, #128
++    add x0, x0, w12, sxtw
++    add x2, x2, w12, sxtw
++
++    // jump to row_loop_c8 while rows remain (w15 counts down from h)
++    subs w15, w15, #1
++    bgt row_loop_c8
++
++    ret
++endfunc
++
++// Unzip chroma
++//
++// On entry:
++// a0 = V0, U2,  ...
++// a1 = U0, V1,  ...
++// a2 = U1, V2,  ...
++// b0 = V8, U10, ...
++// b1 = U8, V9,  ...
++// b2 = U9, V10, ...
++//
++// On exit:
++// d0 = U0, U3, ...
++// ...
++// a0 = V0, V3, ..
++// ...
++//
++// Reg order for USAND is a1, a0, a2 (i.e. swap natural order of 1st 2 dest regs)
++
++.macro UZPH_C d0, d1, d2, a0, a1, a2, b0, b1, b2
++                uzp1            \d0\().8h, \a1\().8h, \b1\().8h
++                uzp1            \d1\().8h, \a2\().8h, \b2\().8h
++                uzp2            \d2\().8h, \a0\().8h, \b0\().8h
++
++                uzp1            \a0\().8h, \a0\().8h, \b0\().8h
++                uzp2            \a1\().8h, \a1\().8h, \b1\().8h
++                uzp2            \a2\().8h, \a2\().8h, \b2\().8h
++.endm
++
++// SAND30 -> 10bit
++.macro USAND10 d0, d1, d2, a0, a1
++                shrn            \d2\().4h, \a0\().4s, #14
++                shrn            \d1\().4h, \a0\().4s, #10
++
++                shrn2           \d2\().8h, \a1\().4s, #14
++                shrn2           \d1\().8h, \a1\().4s, #10
++                uzp1            \d0\().8h, \a0\().8h, \a1\().8h
++
++                ushr            \d2\().8h, \d2\().8h, #6
++                bic             \d0\().8h, #0xfc,     lsl #8
++                bic             \d1\().8h, #0xfc,     lsl #8
++.endm
++
++// SAND30 -> 8bit
++.macro USAND8 d0, d1, d2, a0, a1, a2, a3, t0, t1, t2
++                shrn            \d1\().4h,  \a0\().4s,  #12
++                shrn2           \d1\().8h,  \a1\().4s,  #12
++                uzp1            \d0\().8h,  \a0\().8h,  \a1\().8h
++                uzp2            \d2\().8h,  \a0\().8h,  \a1\().8h
++
++                shrn            \t1\().4h,  \a2\().4s,  #12
++                shrn2           \t1\().8h,  \a3\().4s,  #12
++                uzp1            \t0\().8h,  \a2\().8h,  \a3\().8h
++                uzp2            \t2\().8h,  \a2\().8h,  \a3\().8h
++
++                shrn            \d0\().8b,  \d0\().8h,  #2
++                shrn2           \d0\().16b, \t0\().8h,  #2
++                shrn            \d2\().8b,  \d2\().8h,  #6
++                shrn2           \d2\().16b, \t2\().8h,  #6
++                uzp1            \d1\().16b, \d1\().16b, \t1\().16b
++.endm
++
++
++// void ff_rpi_sand30_lines_to_planar_c16(
++//   uint8_t * dst_u,            // [x0]
++//   unsigned int dst_stride_u,  // [w1]
++//   uint8_t * dst_v,            // [x2]
++//   unsigned int dst_stride_v,  // [w3]
++//   const uint8_t * src,        // [x4]
++//   unsigned int stride1,       // [w5]      128
++//   unsigned int stride2,       // [w6]
++//   unsigned int _x,            // [w7]      0
++//   unsigned int y,             // [sp, #0]  -> w7
++//   unsigned int _w,            // [sp, #8]  -> w8 (w9 = count left in row)
++//   unsigned int h);            // [sp, #16] -> w10
++// Each inner-loop pass reads 128 bytes of src and stores 48 16-bit els to each of dst_u and dst_v
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++                ldr             w7,  [sp, #0]                   // y
++                ldr             w8,  [sp, #8]                   // _w
++                ldr             w10, [sp, #16]                  // h
++                lsl             w6,  w6,  #7                    // Fixup stride2
++                sub             w6,  w6,  #64                   // 1st ld1 of each pair already advances by 64
++                uxtw            x6,  w6
++                sub             w1,  w1,  w8,  LSL #1           // Fixup chroma strides
++                sub             w3,  w3,  w8,  LSL #1           // (dst pointers advance 2 bytes per el in a row)
++                lsl             w7,  w7,  #7                    // Add y to src
++                add             x4,  x4,  w7,  UXTW             // src += y * 128
++10:
++                mov             w13, #0                         // NOTE(review): w13 is not read again in this function body
++                mov             x5,  x4                         // x5 = read pointer for this row
++                mov             w9,  w8                         // w9 = els left in this row
++1:
++                ld1             {v0.4s-v3.4s}, [x5], #64
++                ld1             {v4.4s-v7.4s}, [x5], x6
++                subs            w9,  w9,  #48                   // Flags are consumed by the blt / bne below
++
++                USAND10         v17, v16, v18, v0, v1
++                USAND10         v20, v19, v21, v2, v3
++                UZPH_C          v0, v1, v2, v16, v17, v18, v19, v20, v21
++                USAND10         v23, v22, v24, v4, v5
++                USAND10         v26, v25, v27, v6, v7
++                UZPH_C          v4, v5, v6, v22, v23, v24, v25, v26, v27
++
++                blt             2f                              // Fewer than 48 left: partial write
++
++                st3             {v0.8h-v2.8h},   [x0], #48
++                st3             {v4.8h-v6.8h},   [x0], #48
++                st3             {v16.8h-v18.8h}, [x2], #48
++                st3             {v22.8h-v24.8h}, [x2], #48
++
++                bne             1b                              // Row not finished yet
++11:
++                subs            w10, w10, #1                    // One row done
++                add             x4,  x4,  #128                  // Row start in src advances by 128 bytes
++                add             x0,  x0,  w1,  UXTW             // Apply the fixed-up dst strides
++                add             x2,  x2,  w3,  UXTW
++                bne             10b
++99:
++                ret
++
++// Partial final write
++2:
++                cmp             w9,  #24-48                     // w9 holds (left - 48) here, hence the #N-48 constants
++                blt             1f
++                st3             {v0.8h  - v2.8h},  [x0], #48
++                st3             {v16.8h - v18.8h}, [x2], #48
++                beq             11b                             // Exactly 24 were left: row done
++                mov             v0.16b,  v4.16b                 // Bring the next 24 els down into the low regs
++                mov             v1.16b,  v5.16b
++                sub             w9,  w9,  #24
++                mov             v2.16b,  v6.16b
++                mov             v16.16b, v22.16b
++                mov             v17.16b, v23.16b
++                mov             v18.16b, v24.16b
++1:
++                cmp             w9,  #12-48
++                blt             1f
++                st3             {v0.4h  - v2.4h},  [x0], #24
++                st3             {v16.4h - v18.4h}, [x2], #24
++                beq             11b
++                mov             v0.2d[0],  v0.2d[1]             // High 64 bits -> low 64 bits
++                sub             w9,  w9,  #12
++                mov             v1.2d[0],  v1.2d[1]
++                mov             v2.2d[0],  v2.2d[1]
++                mov             v16.2d[0], v16.2d[1]
++                mov             v17.2d[0], v17.2d[1]
++                mov             v18.2d[0], v18.2d[1]
++1:
++                cmp             w9,  #6-48
++                blt             1f
++                st3             {v0.h  - v2.h}[0],  [x0], #6
++                st3             {v0.h  - v2.h}[1],  [x0], #6
++                st3             {v16.h - v18.h}[0], [x2], #6
++                st3             {v16.h - v18.h}[1], [x2], #6
++                beq             11b
++                mov             v0.s[0],  v0.s[1]               // Halfword lanes 2,3 -> lanes 0,1
++                sub             w9,  w9,  #6
++                mov             v1.s[0],  v1.s[1]
++                mov             v2.s[0],  v2.s[1]
++                mov             v16.s[0], v16.s[1]
++                mov             v17.s[0], v17.s[1]
++                mov             v18.s[0], v18.s[1]
++1:
++                cmp             w9,  #3-48
++                blt             1f
++                st3             {v0.h  - v2.h}[0],  [x0], #6
++                st3             {v16.h - v18.h}[0], [x2], #6
++                beq             11b
++                mov             v0.h[0],  v0.h[1]               // Only 1 or 2 els remain, so v2 / v18 are not needed
++                sub             w9,  w9,  #3
++                mov             v1.h[0],  v1.h[1]
++                mov             v16.h[0], v16.h[1]
++                mov             v17.h[0], v17.h[1]
++1:
++                cmp             w9,  #2-48
++                blt             1f
++                st2             {v0.h  - v1.h}[0],  [x0], #4    // Last 2 els
++                st2             {v16.h - v17.h}[0], [x2], #4
++                b               11b
++1:
++                st1             {v0.h}[0],  [x0], #2            // Last single el
++                st1             {v16.h}[0], [x2], #2
++                b               11b
++endfunc
++
++
++//void ff_rpi_sand30_lines_to_planar_p010(
++//  uint8_t * dest,
++//  unsigned int dst_stride,
++//  const uint8_t * src,
++//  unsigned int src_stride1,
++//  unsigned int src_stride2,
++//  unsigned int _x,
++//  unsigned int y,
++//  unsigned int _w,
++//  unsigned int h);
++
++// void ff_rpi_sand30_lines_to_planar_y16(
++//   uint8_t * dest,            : x0
++//   unsigned int dst_stride,   : w1
++//   const uint8_t * src,       : x2
++//   unsigned int src_stride1,  : w3, always 128
++//   unsigned int src_stride2,  : w4
++//   unsigned int _x,           : w5
++//   unsigned int y,            : w6
++//   unsigned int _w,           : w7
++//   unsigned int h);           : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++                lsl             w4,  w4,  #7                    // stride2 * 128
++                sub             w4,  w4,  #64                   // 1st ld1 of each pair already advances by 64
++                uxtw            x4,  w4
++                sub             w1,  w1,  w7, lsl #1            // dst_stride - 2 * _w (dest advances 2 bytes per el)
++                uxtw            x6,  w6
++                add             x8,  x2,  x6, lsl #7            // x8 = src + y * 128 (start of current row)
++                ldr             w6,  [sp, #0]                   // w6 = h (row counter from here on)
++
++10:
++                mov             x2,  x8                         // x2 = read pointer for this row
++                mov             w5,  w7                         // w5 = els left in this row
++1:
++                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++                subs            w5,  w5,  #96                   // Flags are consumed by the blt / bne below
++
++                USAND10         v16, v17, v18, v0, v1
++                USAND10         v19, v20, v21, v2, v3
++                USAND10         v22, v23, v24, v4, v5
++                USAND10         v25, v26, v27, v6, v7
++
++                blt             2f                              // Fewer than 96 left: partial write
++
++                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
++                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
++                st3             {v22.8h, v23.8h, v24.8h}, [x0], #48
++                st3             {v25.8h, v26.8h, v27.8h}, [x0], #48
++
++                bne             1b                              // Row not finished yet
++
++11:
++                subs            w6,  w6,  #1                    // One row done
++                add             x0,  x0,  w1,  uxtw             // Apply the fixed-up dst stride
++                add             x8,  x8,  #128                  // Row start in src advances by 128 bytes
++                bne             10b
++
++                ret
++
++// Partial final write
++2:
++                cmp             w5,  #48-96                     // w5 holds (left - 96) here, hence the #N-96 constants
++                blt             1f
++                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
++                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
++                beq             11b                             // Exactly 48 were left: row done
++                mov             v16.16b, v22.16b                // Bring the 2nd 48 els down into v16-v21
++                mov             v17.16b, v23.16b
++                sub             w5,  w5,  #48
++                mov             v18.16b, v24.16b
++                mov             v19.16b, v25.16b
++                mov             v20.16b, v26.16b
++                mov             v21.16b, v27.16b
++1:
++                cmp             w5,  #24-96
++                blt             1f
++                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
++                beq             11b
++                mov             v16.16b, v19.16b                // Next 24 els
++                mov             v17.16b, v20.16b
++                sub             w5,  w5,  #24
++                mov             v18.16b, v21.16b
++1:
++                cmp             w5,  #12-96
++                blt             1f
++                st3             {v16.4h, v17.4h, v18.4h}, [x0], #24
++                beq             11b
++                mov             v16.2d[0], v16.2d[1]            // High 64 bits -> low 64 bits
++                sub             w5,  w5,  #12
++                mov             v17.2d[0], v17.2d[1]
++                mov             v18.2d[0], v18.2d[1]
++1:
++                cmp             w5,  #6-96
++                blt             1f
++                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
++                st3             {v16.h, v17.h, v18.h}[1], [x0], #6
++                beq             11b
++                mov             v16.2s[0], v16.2s[1]            // Halfword lanes 2,3 -> lanes 0,1
++                sub             w5,  w5,  #6
++                mov             v17.2s[0], v17.2s[1]
++                mov             v18.2s[0], v18.2s[1]
++1:
++                cmp             w5,  #3-96
++                blt             1f
++                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
++                beq             11b
++                mov             v16.4h[0], v16.4h[1]            // Only 1 or 2 els remain, so v18 is not needed
++                sub             w5,  w5,  #3
++                mov             v17.4h[0], v17.4h[1]
++1:
++                cmp             w5,  #2-96
++                blt             1f
++                st2             {v16.h, v17.h}[0], [x0], #4     // Last 2 els
++                b               11b
++1:
++                st1             {v16.h}[0], [x0], #2            // Last single el
++                b               11b
++
++endfunc
++
++// void ff_rpi_sand30_lines_to_planar_y8(
++//   uint8_t * dest,            : x0
++//   unsigned int dst_stride,   : w1
++//   const uint8_t * src,       : x2
++//   unsigned int src_stride1,  : w3, always 128
++//   unsigned int src_stride2,  : w4
++//   unsigned int _x,           : w5
++//   unsigned int y,            : w6
++//   unsigned int _w,           : w7
++//   unsigned int h);           : [sp, #0]
++//
++// Assumes that we are starting on a stripe boundary and that overreading
++// within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++                lsl             w4,  w4,  #7                    // stride2 * 128
++                sub             w4,  w4,  #64                   // 1st ld1 of each pair already advances by 64
++                uxtw            x4,  w4
++                sub             w1,  w1,  w7                    // dst_stride - _w (dest advances 1 byte per el)
++                uxtw            x6,  w6
++                add             x8,  x2,  x6, lsl #7            // x8 = src + y * 128 (start of current row)
++                ldr             w6,  [sp, #0]                   // w6 = h (row counter from here on)
++
++10:
++                mov             x2,  x8                         // x2 = read pointer for this row
++                mov             w5,  w7                         // w5 = els left in this row
++1:
++                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
++                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4
++
++                subs            w5,  w5,  #96                   // Flags are consumed by the blt / bne below
++
++                // Results: v16-v18 then v19-v21 (as stored below); v22-v24 presumably scratch
++                USAND8          v16, v17, v18, v0, v1, v2, v3, v22, v23, v24
++                USAND8          v19, v20, v21, v4, v5, v6, v7, v22, v23, v24
++
++                blt             2f                              // Fewer than 96 left: partial write
++
++                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
++                st3             {v19.16b, v20.16b, v21.16b}, [x0], #48
++
++                bne             1b                              // Row not finished yet
++
++11:
++                subs            w6,  w6,  #1                    // One row done
++                add             x0,  x0,  w1,  uxtw             // Apply the fixed-up dst stride
++                add             x8,  x8,  #128                  // Row start in src advances by 128 bytes
++                bne             10b
++
++                ret
++
++// Partial final write
++2:
++                cmp             w5,  #48-96                     // w5 holds (left - 96) here, hence the #N-96 constants
++                blt             1f
++                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
++                beq             11b                             // Exactly 48 were left: row done
++                mov             v16.16b, v19.16b                // 2nd 48 els live in v19-v21 (see the full-width stores)
++                mov             v17.16b, v20.16b
++                sub             w5,  w5,  #48
++                mov             v18.16b, v21.16b
++1:
++                cmp             w5,  #24-96
++                blt             1f
++                st3             {v16.8b, v17.8b, v18.8b}, [x0], #24
++                beq             11b
++                mov             v16.2d[0], v16.2d[1]            // High 64 bits -> low 64 bits
++                sub             w5,  w5,  #24
++                mov             v17.2d[0], v17.2d[1]
++                mov             v18.2d[0], v18.2d[1]
++1:
++                cmp             w5,  #12-96
++                blt             1f
++                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[2], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[3], [x0], #3
++                beq             11b
++                mov             v16.2s[0], v16.2s[1]            // Byte lanes 4-7 -> lanes 0-3
++                sub             w5,  w5,  #12
++                mov             v17.2s[0], v17.2s[1]
++                mov             v18.2s[0], v18.2s[1]
++1:
++                cmp             w5,  #6-96
++                blt             1f
++                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
++                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
++                beq             11b
++                mov             v16.4h[0], v16.4h[1]            // Byte lanes 2,3 -> lanes 0,1
++                sub             w5,  w5,  #6
++                mov             v17.4h[0], v17.4h[1]
++                mov             v18.4h[0], v18.4h[1]
++1:
++                cmp             w5,  #3-96
++                blt             1f
++                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
++                beq             11b
++                mov             v16.8b[0], v16.8b[1]            // Only 1 or 2 els remain, so v18 is not needed
++                sub             w5,  w5,  #3
++                mov             v17.8b[0], v17.8b[1]
++1:
++                cmp             w5,  #2-96
++                blt             1f
++                st2             {v16.b, v17.b}[0], [x0], #2     // Last 2 els
++                b               11b
++1:
++                st1             {v16.b}[0], [x0], #1            // Last single el
++                b               11b
++
++endfunc
++
+diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
+new file mode 100644
+index 0000000000..2a56135bc3
+--- /dev/null
++++ b/libavutil/aarch64/rpi_sand_neon.h
+@@ -0,0 +1,59 @@
++/*
++Copyright (c) 2021 Michael Eiler
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: Michael Eiler <eiler.mike@gmail.com>
++*/
++
++#pragma once
++#include <stdint.h> /* uint8_t, used by every prototype below */
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
++  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
++  unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
++  unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
++  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
++  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
++  unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
++  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
++  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
++
++#ifdef __cplusplus
++}
++#endif /* __cplusplus */
++
+diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
+index 5da44b0542..b74b7c4e2f 100644
+--- a/libavutil/arm/Makefile
++++ b/libavutil/arm/Makefile
+@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o                                    \
+ 
+ NEON-OBJS += arm/float_dsp_init_neon.o                                  \
+              arm/float_dsp_neon.o                                       \
++             arm/rpi_sand_neon.o                                        \
+diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
+new file mode 100644
+index 0000000000..60e697f681
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.S
+@@ -0,0 +1,925 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "libavutil/arm/asm.S"
++
++
++@ General notes:
++@ Having done some timing on this in sand8->y8 (Pi4)
++@  vst1 (680fps) is a bit faster than vstm (660fps)
++@  vldm (680fps) is noticeably faster than vld1 (480fps)
++@  (or it might be that a mix is what is required)
++@
++@ At least on a Pi4 it is no more expensive to have a single auto-inc register
++@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
++@ the latter was better)
++@
++@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
++@ the memory is uncached.
++@ As these are Sand -> planar we can assume that src is going to be aligned but
++@ it is possible that dest isn't (converting to .yuv or other packed format).
++@ Luckily vst1 is faster than vstm :-) so all is well
++@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
++@ .8 stores would let us do non-word aligned stores into uncached but it
++@ probably isn't worth it.
++
++
++
++
++@ void ff_rpi_sand128b_stripe_to_8_10(
++@   uint8_t * dest,             // [r0]
++@   const uint8_t * src1,       // [r1]
++@   const uint8_t * src2,       // [r2]
++@   unsigned int lines);        // [r3]
++
++.macro  stripe2_to_8, bit_depth
++        vpush    {q4-q7}                @ q4-q7 (d8-d15) are overwritten below; restored by the vpop
++1:
++        vldm     r1!, {q0-q7}           @ 128 bytes from src1
++        subs     r3, #1                 @ Flags are consumed by the bne at the end of the loop
++        vldm     r2!, {q8-q15}          @ 128 bytes from src2
++        vqrshrn.u16 d0,  q0,  #\bit_depth - 8
++        vqrshrn.u16 d1,  q1,  #\bit_depth - 8
++        vqrshrn.u16 d2,  q2,  #\bit_depth - 8
++        vqrshrn.u16 d3,  q3,  #\bit_depth - 8
++        vqrshrn.u16 d4,  q4,  #\bit_depth - 8
++        vqrshrn.u16 d5,  q5,  #\bit_depth - 8
++        vqrshrn.u16 d6,  q6,  #\bit_depth - 8
++        vqrshrn.u16 d7,  q7,  #\bit_depth - 8
++        vqrshrn.u16 d8,  q8,  #\bit_depth - 8
++        vqrshrn.u16 d9,  q9,  #\bit_depth - 8
++        vqrshrn.u16 d10, q10, #\bit_depth - 8
++        vqrshrn.u16 d11, q11, #\bit_depth - 8
++        vqrshrn.u16 d12, q12, #\bit_depth - 8
++        vqrshrn.u16 d13, q13, #\bit_depth - 8
++        vqrshrn.u16 d14, q14, #\bit_depth - 8
++        vqrshrn.u16 d15, q15, #\bit_depth - 8
++        vstm     r0!, {q0-q7}           @ 128 narrowed bytes to dest (src1 part in d0-d7, src2 part in d8-d15)
++        bne      1b
++        vpop     {q4-q7}
++        bx       lr
++.endm
++
++function ff_rpi_sand128b_stripe_to_8_10, export=1
++        stripe2_to_8     10
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_y8(
++@   uint8_t * dest,             // [r0]
++@   unsigned int dst_stride,    // [r1]
++@   const uint8_t * src,        // [r2]
++@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++@   unsigned int src_stride2,   // [sp, #0]  -> r3
++@   unsigned int _x,            // [sp, #4]  Ignored - 0
++@   unsigned int y,             // [sp, #8]  (r7 in prefix)
++@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++@   unsigned int h);            // [sp, #16] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_y8, export=1
++                push            {r4-r8, lr}     @ +24: caller's stack args are now at [sp, #24 + n]
++                ldr             r3,  [sp, #24]  @ src_stride2
++                ldr             r6,  [sp, #36]  @ _w
++                ldr             r7,  [sp, #32]  @ y
++                lsl             r3,  #7         @ src_stride2 * 128
++                sub             r1,  r6         @ dst_stride - _w (dest advances 1 byte per el)
++                add             r8,  r2,  r7,  lsl #7   @ r8 = src + y * 128 (start of current row)
++                ldr             r7,  [sp, #40]  @ h (row counter from here on)
++
++10:
++                mov             r2,  r8         @ r2 = read pointer for this row
++                add             r4,  r0,  #24   @ NOTE(review): r4 is not read anywhere in this function
++                mov             r5,  r6         @ r5 = bytes left in this row
++                mov             lr,  #0         @ NOTE(review): lr is not read anywhere in this function
++1:
++                vldm            r2,  {q8-q15}   @ 128 bytes
++                add             r2,  r3
++                subs            r5,  #128       @ Flags are consumed by the blt / bne below
++                blt             2f              @ Fewer than 128 left: partial write
++                vst1.8          {d16, d17, d18, d19}, [r0]!
++                vst1.8          {d20, d21, d22, d23}, [r0]!
++                vst1.8          {d24, d25, d26, d27}, [r0]!
++                vst1.8          {d28, d29, d30, d31}, [r0]!
++                bne             1b              @ Row not finished yet
++11:
++                subs            r7,  #1         @ One row done
++                add             r0,  r1         @ Apply the fixed-up dst stride
++                add             r8,  #128       @ Row start in src advances by 128 bytes
++                bne             10b
++
++                pop             {r4-r8, pc}
++
++@ Partial final write
++2:
++                cmp             r5,  #64-128    @ r5 holds (left - 128) here, hence the #N-128 constants
++                blt             1f
++                vst1.8          {d16, d17, d18, d19}, [r0]!
++                vst1.8          {d20, d21, d22, d23}, [r0]!
++                beq             11b             @ Exactly 64 were left: row done
++                vmov            q8,  q12        @ Bring the 2nd 64 bytes down into q8-q11
++                vmov            q9,  q13
++                sub             r5,  #64
++                vmov            q10, q14
++                vmov            q11, q15
++1:
++                cmp             r5,  #32-128
++                blt             1f
++                vst1.8          {d16, d17, d18, d19}, [r0]!
++                beq             11b
++                vmov            q8,  q10        @ Next 32 bytes
++                sub             r5,  #32
++                vmov            q9,  q11
++1:
++                cmp             r5,  #16-128
++                blt             1f
++                vst1.8          {d16, d17}, [r0]!
++                beq             11b
++                sub             r5,  #16
++                vmov            q8,  q9         @ Next 16 bytes
++1:
++                cmp             r5,  #8-128
++                blt             1f
++                vst1.8          {d16}, [r0]!
++                beq             11b
++                sub             r5,  #8
++                vmov            d16, d17        @ Next 8 bytes
++1:
++                cmp             r5,  #4-128
++                blt             1f
++                vst1.32         {d16[0]}, [r0]!
++                beq             11b
++                sub             r5,  #4
++                vshr.u64        d16, #32        @ High 32 bits -> low 32 bits
++1:
++                cmp             r5,  #2-128
++                blt             1f
++                vst1.16         {d16[0]}, [r0]!
++                beq             11b             @ Exactly 2 were left
++                vst1.8          {d16[2]}, [r0]! @ Otherwise 3 were left: store the 3rd byte
++                b               11b
++1:
++                vst1.8          {d16[0]}, [r0]! @ Last single byte
++                b               11b
++endfunc
++
++@ void ff_rpi_sand8_lines_to_planar_c8(
++@   uint8_t * dst_u,            // [r0]
++@   unsigned int dst_stride_u,  // [r1]
++@   uint8_t * dst_v,            // [r2]
++@   unsigned int dst_stride_v,  // [r3]
++@   const uint8_t * src,        // [sp, #0]  -> r4, r5
++@   unsigned int stride1,       // [sp, #4]  128
++@   unsigned int stride2,       // [sp, #8]  -> r8
++@   unsigned int _x,            // [sp, #12] 0
++@   unsigned int y,             // [sp, #16] (r7 in prefix)
++@   unsigned int _w,            // [sp, #20] -> r12, r6
++@   unsigned int h);            // [sp, #24] -> r7
++@
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand8_lines_to_planar_c8, export=1
++                push            {r4-r8, lr}     @ +24
++
++                ldr             r5,  [sp, #24]  @ src
++                ldr             r8,  [sp, #32]  @ stride2
++                ldr             r7,  [sp, #40]  @ y
++                ldr             r6,  [sp, #44]  @ _w
++                lsl             r8,  #7         @ stride2 * 128
++                add             r5,  r5,  r7,  lsl #7   @ r5 = src + y * 128 (start of current row)
++                sub             r1,  r1,  r6    @ dst_stride_u - _w
++                sub             r3,  r3,  r6    @ dst_stride_v - _w
++                ldr             r7,  [sp, #48]  @ h (row counter from here on)
++                vpush           {q4-q7}         @ After the stack-arg loads, so the offsets above stay +24
++
++10:
++                mov             r4,  r5         @ r4 = read pointer for this row
++                mov             r12, r6         @ r12 = bytes left per plane in this row
++1:
++                subs            r12, #64
++                vldm            r4,  {q0-q7}    @ 128 bytes
++                add             r4,  r8
++                it              gt              @ Flags from the subs above: 2nd block only if > 64 left
++                vldmgt          r4,  {q8-q15}
++                add             r4,  r8
++
++                vuzp.8          q0,  q1         @ De-interleave even / odd bytes
++                vuzp.8          q2,  q3
++                vuzp.8          q4,  q5
++                vuzp.8          q6,  q7
++
++                vuzp.8          q8,  q9
++                vuzp.8          q10, q11
++                vuzp.8          q12, q13
++                vuzp.8          q14, q15
++                subs            r12, #64        @ Flags are consumed by the blt / bne below
++
++                @ Rearrange regs so we can use vst1 with 4 regs
++                vswp            q1,  q2
++                vswp            q5,  q6
++                vswp            q9,  q10
++                vswp            q13, q14
++                blt             2f              @ Fewer than 128 left: partial write
++
++                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
++                vst1.8          {d8,  d9,  d10, d11}, [r0]!
++                vst1.8          {d16, d17, d18, d19}, [r0]!
++                vst1.8          {d24, d25, d26, d27}, [r0]!
++
++                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
++                vst1.8          {d12, d13, d14, d15}, [r2]!
++                vst1.8          {d20, d21, d22, d23}, [r2]!
++                vst1.8          {d28, d29, d30, d31}, [r2]!
++                bne             1b              @ Row not finished yet
++11:
++                subs            r7,  #1         @ One row done
++                add             r5,  #128       @ Row start in src advances by 128 bytes
++                add             r0,  r1         @ Apply the fixed-up dst strides
++                add             r2,  r3
++                bne             10b
++                vpop            {q4-q7}
++                pop             {r4-r8,pc}
++
++2:
++                cmp             r12, #64-128    @ r12 holds (left - 128) here, hence the #N-128 constants
++                blt             1f
++                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
++                vst1.8          {d8,  d9,  d10, d11}, [r0]!
++                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
++                vst1.8          {d12, d13, d14, d15}, [r2]!
++                beq             11b             @ Exactly 64 were left: row done
++                sub             r12, #64
++                vmov            q0,  q8         @ Bring the 2nd block's data down into q0-q7
++                vmov            q1,  q9
++                vmov            q2,  q10
++                vmov            q3,  q11
++                vmov            q4,  q12
++                vmov            q5,  q13
++                vmov            q6,  q14
++                vmov            q7,  q15
++1:
++                cmp             r12, #32-128
++                blt             1f
++                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
++                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
++                beq             11b
++                sub             r12, #32
++                vmov            q0,  q4         @ Next 32 bytes per plane
++                vmov            q1,  q5
++                vmov            q2,  q6
++                vmov            q3,  q7
++1:
++                cmp             r12, #16-128
++                blt             1f
++                vst1.8          {d0,  d1 }, [r0]!
++                vst1.8          {d4,  d5 }, [r2]!
++                beq             11b
++                sub             r12, #16
++                vmov            q0,  q1         @ Next 16 bytes per plane
++                vmov            q2,  q3
++1:
++                cmp             r12, #8-128
++                blt             1f
++                vst1.8          {d0}, [r0]!
++                vst1.8          {d4}, [r2]!
++                beq             11b
++                sub             r12, #8
++                vmov            d0,  d1         @ Next 8 bytes per plane
++                vmov            d4,  d5
++1:
++                cmp             r12, #4-128
++                blt             1f
++                vst1.32         {d0[0]}, [r0]!
++                vst1.32         {d4[0]}, [r2]!
++                beq             11b
++                sub             r12, #4
++                vmov            s0,  s1         @ High word of d0 -> low word
++                vmov            s8,  s9         @ High word of d4 -> low word
++1:
++                cmp             r12, #2-128
++                blt             1f
++                vst1.16         {d0[0]}, [r0]!
++                vst1.16         {d4[0]}, [r2]!
++                beq             11b             @ Exactly 2 were left
++                vst1.8          {d0[2]}, [r0]!  @ Otherwise 3 were left: store the 3rd byte
++                vst1.8          {d4[2]}, [r2]!
++                b               11b
++1:
++                vst1.8          {d0[0]}, [r0]!  @ Last single byte per plane
++                vst1.8          {d4[0]}, [r2]!
++                b               11b
++endfunc
++
++
++
++@ void ff_rpi_sand30_lines_to_planar_y16(
++@   uint8_t * dest,             // [r0]
++@   unsigned int dst_stride,    // [r1]
++@   const uint8_t * src,        // [r2]
++@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++@   unsigned int src_stride2,   // [sp, #0]  -> r3
++@   unsigned int _x,            // [sp, #4]  Ignored - 0
++@   unsigned int y,             // [sp, #8]  (r7 in prefix)
++@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++@   unsigned int h);            // [sp, #16] -> r7
++@ Unpacks each 32-bit source word into three 10-bit samples, one per 16-bit output value.
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y16, export=1
++                push            {r4-r8, lr}     @ +24, so stack args now start at [sp, #24]
++                ldr             r3,  [sp, #24]  @ src_stride2
++                ldr             r6,  [sp, #36]  @ _w
++                ldr             r7,  [sp, #32]  @ y
++                mov             r12, #48        @ post-index step for the paired vst3 stores
++                sub             r3,  #1
++                lsl             r3,  #7         @ r3 = (src_stride2 - 1) * 128
++                sub             r1,  r1,  r6,  lsl #1   @ r1 = dst_stride - 2 * _w
++                add             r8,  r2,  r7,  lsl #7   @ r8 = src + y * 128
++                ldr             r7,  [sp, #40]  @ h, counted down once per row
++
++10:
++                mov             r2,  r8         @ read pointer for this row
++                add             r4,  r0,  #24   @ second store pointer, 24 bytes past r0
++                mov             r5,  r6         @ samples still to write in this row
++                mov             lr,  #0         @ byte offset within the 128-byte stripe line
++1:
++                vldm            r2!, {q10-q13}  @ 64 bytes = 16 words = 48 samples
++                add             lr,  #64
++
++                vshrn.u32       d4 , q10, #14    @ Cannot vshrn.u32 #20! Finished by vshr.u16 #6 below
++                ands            lr,  #127        @ Z set on every second pass (128 bytes consumed)
++                vshrn.u32       d2,  q10, #10    @ bits 10-25, masked to 10 bits below
++                vmovn.u32       d0,  q10         @ bits 0-15, masked to 10 bits below
++
++                vshrn.u32       d5,  q11, #14
++                it              eq
++                addeq           r2,  r3          @ skip on by (src_stride2 - 1) * 128 bytes
++                vshrn.u32       d3,  q11, #10
++                vmovn.u32       d1,  q11
++
++                subs            r5,  #48         @ sets the flags tested by blt 2f and bne 1b
++                vshr.u16        q2,  #6          @ q2 = bits 20-29 of each word
++                vbic.u16        q0,  #0xfc00     @ q0 = bits 0-9 of each word
++                vbic.u16        q1,  #0xfc00     @ q1 = bits 10-19 of each word
++
++                vshrn.u32       d20, q12, #14    @ same unpack for words 8-15 into q8, q9, q10
++                vshrn.u32       d18, q12, #10
++                vmovn.u32       d16, q12
++
++                vshrn.u32       d21, q13, #14
++                vshrn.u32       d19, q13, #10
++                vmovn.u32       d17, q13
++
++                vshr.u16        q10, #6
++                vbic.u16        q8,  #0xfc00
++                vbic.u16        q9 , #0xfc00
++                blt             2f               @ fewer than 48 samples were left, do a partial write
++
++                vst3.16         {d0,  d2,  d4},  [r0], r12   @ re-interleave 12 samples at r0, r0 += 48
++                vst3.16         {d1,  d3,  d5},  [r4], r12   @ next 12 samples at r4, r4 += 48
++                vst3.16         {d16, d18, d20}, [r0], r12
++                vst3.16         {d17, d19, d21}, [r4], r12
++
++                bne             1b               @ more samples in this row
++
++11:
++                subs            r7,  #1          @ one row done
++                add             r0,  r1          @ r0 += dst_stride - 2 * _w
++                add             r8,  #128        @ next source line
++                bne             10b
++
++                pop             {r4-r8, pc}
++
++@ Partial final write
++2:
++                cmp             r5,  #24-48      @ r5 = remaining - 48 here. At least 24 left?
++                blt             1f
++                vst3.16         {d0,  d2,  d4},  [r0], r12
++                vst3.16         {d1,  d3,  d5},  [r4]
++                beq             11b              @ exactly 24, row complete
++                vmov            q0,  q8          @ move the unwritten samples down into q0-q2
++                sub             r5,  #24
++                vmov            q1,  q9
++                vmov            q2,  q10
++1:
++                cmp             r5,  #12-48      @ at least 12 left?
++                blt             1f
++                vst3.16         {d0,  d2,  d4},  [r0]!
++                beq             11b
++                vmov            d0, d1
++                sub             r5, #12
++                vmov            d2, d3
++                vmov            d4, d5
++1:
++                cmp             r5,  #6-48       @ at least 6 left?
++                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
++                blt             1f
++                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
++                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
++                add             r0,  #12         @ 6 samples * 2 bytes (stores above had no writeback)
++                beq             11b
++                vmov            s0,  s1          @ lanes 2-3 of d0, d2, d4 down to lanes 0-1
++                sub             r5,  #6
++                vmov            s4,  s5
++                vmov            s8,  s9
++1:
++                cmp             r5, #3-48        @ at least 3 left?
++                blt             1f
++                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
++                beq             11b
++                sub             r5, #3
++                vshr.u32        d0, #16          @ lane 1 down to lane 0
++                vshr.u32        d2, #16
++1:
++                cmp             r5, #2-48        @ 2 left, otherwise 1
++                blt             1f
++                vst2.16         {d0[0], d2[0]}, [r0]!
++                b               11b
++1:
++                vst1.16         {d0[0]}, [r0]!
++                b               11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_c16(
++@   uint8_t * dst_u,            // [r0]
++@   unsigned int dst_stride_u,  // [r1]
++@   uint8_t * dst_v,            // [r2]
++@   unsigned int dst_stride_v,  // [r3]
++@   const uint8_t * src,        // [sp, #0]  -> r4, r5
++@   unsigned int stride1,       // [sp, #4]  128
++@   unsigned int stride2,       // [sp, #8]  -> r8
++@   unsigned int _x,            // [sp, #12] 0
++@   unsigned int y,             // [sp, #16] (r7 in prefix)
++@   unsigned int _w,            // [sp, #20] -> r6, r9
++@   unsigned int h);            // [sp, #24] -> r7
++@ Splits packed 10-bit U,V samples (three per 32-bit word) into separate 16-bit U and V rows.
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_c16, export=1
++                push            {r4-r10, lr}    @ +32, so stack args now start at [sp, #32]
++                ldr             r5,  [sp, #32]  @ src
++                ldr             r8,  [sp, #40]  @ stride2
++                ldr             r7,  [sp, #48]  @ y
++                ldr             r9,  [sp, #52]  @ _w
++                mov             r12, #48        @ post-index step for the full-width vst3 stores
++                sub             r8,  #1
++                lsl             r8,  #7         @ r8 = (stride2 - 1) * 128
++                add             r5,  r5,  r7,  lsl #7   @ r5 = src + y * 128
++                sub             r1,  r1,  r9,  lsl #1   @ r1 = dst_stride_u - 2 * _w
++                sub             r3,  r3,  r9,  lsl #1   @ r3 = dst_stride_v - 2 * _w
++                ldr             r7,  [sp, #56]  @ h, counted down once per row
++10:
++                mov             lr,  #0         @ byte offset within the 128-byte stripe line
++                mov             r4,  r5         @ read pointer for this row
++                mov             r6,  r9         @ samples per plane still to write in this row
++1:
++                vldm            r4!, {q0-q3}    @ 64 bytes = 16 words = 24 U + 24 V samples
++                add             lr,  #64
++
++                @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
++                vshrn.u32       d20, q0,  #14   @ bits 14-29, reduced to bits 20-29 by vshr.u16 #6 below
++                vmovn.u32       d18, q0         @ bits 0-15, masked to bits 0-9 below
++                vshrn.u32       d0,  q0,  #10   @ bits 10-25, done last as it overwrites half of q0
++                ands            lr,  #127       @ Z set on every second pass (128 bytes consumed)
++
++                vshrn.u32       d21, q1,  #14
++                vmovn.u32       d19, q1
++                vshrn.u32       d1,  q1,  #10
++
++                vshrn.u32       d22, q2,  #10   @ words 8-15 go to q11 (>>10), q1 (low), q2 (>>14)
++                vmovn.u32       d2,  q2
++                vshrn.u32       d4,  q2,  #14
++
++                add             r10, r0,  #24   @ second U store pointer, 24 bytes past r0
++                vshrn.u32       d23, q3,  #10
++                vmovn.u32       d3,  q3
++                vshrn.u32       d5,  q3,  #14
++
++                it              eq
++                addeq           r4,  r8         @ skip on by (stride2 - 1) * 128 bytes
++                vuzp.16         q0,  q11        @ split values from even and odd words, giving the layout below
++                vuzp.16         q9,  q1
++                vuzp.16         q10, q2
++
++                @ q0   V0, V3,..
++                @ q9   U0, U3...
++                @ q10  U1, U4...
++                @ q11  U2, U5,..
++                @ q1   V1, V4,
++                @ q2   V2, V5,..
++
++                subs            r6,  #24        @ sets the flags tested by blt 2f and bne 1b
++                vbic.u16        q11, #0xfc00    @ trim every register to a 10-bit value
++                vbic.u16        q9,  #0xfc00
++                vshr.u16        q10, #6
++                vshr.u16        q2,  #6
++                vbic.u16        q0,  #0xfc00
++                vbic.u16        q1,  #0xfc00
++
++                blt             2f              @ fewer than 24 were left, do a partial write
++
++                vst3.16         {d18, d20, d22}, [r0],  r12   @ 12 U samples at r0, r0 += 48
++                vst3.16         {d19, d21, d23}, [r10]        @ next 12 U samples
++                add             r10, r2,  #24                 @ second V store pointer
++                vst3.16         {d0,  d2,  d4},  [r2],  r12   @ 12 V samples at r2, r2 += 48
++                vst3.16         {d1,  d3,  d5},  [r10]
++
++                bne             1b
++
++11:
++                subs            r7,  #1         @ one row done
++                add             r5,  #128       @ next source line
++                add             r0,  r1         @ r0 += dst_stride_u - 2 * _w
++                add             r2,  r3         @ r2 += dst_stride_v - 2 * _w
++                bne             10b
++
++                pop             {r4-r10, pc}
++
++@ Partial final write
++2:
++                cmp             r6,  #-12       @ r6 = remaining - 24 here. At least 12 left?
++                blt             1f
++                vst3.16         {d18, d20, d22}, [r0]!
++                vst3.16         {d0,  d2,  d4},  [r2]!
++                beq             11b             @ exactly 12, row complete
++                vmov            d18, d19        @ move the unwritten upper halves down
++                vmov            d20, d21
++                vmov            d22, d23
++                sub             r6,  #12
++                vmov            d0,  d1
++                vmov            d2,  d3
++                vmov            d4,  d5
++1:
++                cmp             r6,  #-18       @ at least 6 left? Flags are tested by blt after the vzips
++                @ Rezip here as it makes the remaining tail handling easier
++                vzip.16         d0,  d18        @ afterwards even lanes hold V and odd lanes hold U
++                vzip.16         d2,  d20
++                vzip.16         d4,  d22
++                blt             1f
++                vst3.16         {d0[1],  d2[1],  d4[1]},  [r0]!
++                vst3.16         {d0[0],  d2[0],  d4[0]},  [r2]!
++                vst3.16         {d0[3],  d2[3],  d4[3]},  [r0]!
++                vst3.16         {d0[2],  d2[2],  d4[2]},  [r2]!
++                beq             11b
++                vmov            d0,  d18        @ bring the remaining zipped values down
++                vmov            d2,  d20
++                sub             r6,  #6
++                vmov            d4,  d22
++1:
++                cmp             r6,  #-21       @ at least 3 left?
++                blt             1f
++                vst3.16         {d0[1], d2[1], d4[1]}, [r0]!
++                vst3.16         {d0[0], d2[0], d4[0]}, [r2]!
++                beq             11b
++                vmov            s4,  s5         @ lanes 2-3 of d2 down to lanes 0-1
++                sub             r6,  #3
++                vmov            s0,  s1         @ lanes 2-3 of d0 down to lanes 0-1
++1:
++                cmp             r6,  #-22       @ 2 left, otherwise 1
++                blt             1f
++                vst2.16         {d0[1], d2[1]}, [r0]!
++                vst2.16         {d0[0], d2[0]}, [r2]!
++                b               11b
++1:
++                vst1.16         {d0[1]}, [r0]!
++                vst1.16         {d0[0]}, [r2]!
++                b               11b
++
++endfunc
++
++@ void ff_rpi_sand30_lines_to_planar_p010(
++@   uint8_t * dest,             // [r0]
++@   unsigned int dst_stride,    // [r1]
++@   const uint8_t * src,        // [r2]
++@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++@   unsigned int src_stride2,   // [sp, #0]  -> r3
++@   unsigned int _x,            // [sp, #4]  Ignored - 0
++@   unsigned int y,             // [sp, #8]  (r7 in prefix)
++@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++@   unsigned int h);            // [sp, #16] -> r7
++@ Unpacks three 10-bit samples per 32-bit word, each left-justified (value << 6) in 16 bits.
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_p010, export=1
++                push            {r4-r8, lr}     @ +24, so stack args now start at [sp, #24]
++                ldr             r3,  [sp, #24]  @ src_stride2
++                ldr             r6,  [sp, #36]  @ _w
++                ldr             r7,  [sp, #32]  @ y
++                mov             r12, #48        @ post-index step for the paired vst3 stores
++                vmov.u16        q15, #0xffc0    @ mask keeping the top 10 bits of each 16-bit lane
++                sub             r3,  #1
++                lsl             r3,  #7         @ r3 = (src_stride2 - 1) * 128
++                sub             r1,  r1,  r6,  lsl #1   @ r1 = dst_stride - 2 * _w
++                add             r8,  r2,  r7,  lsl #7   @ r8 = src + y * 128
++                ldr             r7,  [sp, #40]  @ h, counted down once per row
++
++10:
++                mov             r2,  r8         @ read pointer for this row
++                add             r4,  r0,  #24   @ second store pointer, 24 bytes past r0
++                mov             r5,  r6         @ samples still to write in this row
++                mov             lr,  #0         @ byte offset within the 128-byte stripe line
++1:
++                vldm            r2!, {q10-q13}  @ 64 bytes = 16 words = 48 samples
++                add             lr,  #64
++
++                vshl.u32        q14, q10, #6    @ bits 0-9 moved up to bits 6-15
++                ands            lr,  #127       @ Z set on every second pass (128 bytes consumed)
++                vshrn.u32       d4,  q10, #14   @ bits 14-29, so bits 20-29 land in the top 10 bits
++                vshrn.u32       d2,  q10, #4    @ bits 4-19, so bits 10-19 land in the top 10 bits
++                vmovn.u32       d0,  q14        @ low 16 bits of the shifted word
++
++                vshl.u32        q14, q11, #6
++                it              eq
++                addeq           r2,  r3         @ skip on by (src_stride2 - 1) * 128 bytes
++                vshrn.u32       d5,  q11, #14
++                vshrn.u32       d3,  q11, #4
++                vmovn.u32       d1,  q14
++
++                subs            r5,  #48        @ sets the flags tested by blt 2f and bne 1b
++                vand            q2,  q15        @ clear the low 6 bits of every value
++                vand            q1,  q15
++                vand            q0,  q15
++
++                vshl.u32        q14, q12, #6    @ same unpack for words 8-15 into q8, q9, q10
++                vshrn.u32       d20, q12, #14
++                vshrn.u32       d18, q12, #4
++                vmovn.u32       d16, q14
++
++                vshl.u32        q14, q13, #6
++                vshrn.u32       d21, q13, #14
++                vshrn.u32       d19, q13, #4
++                vmovn.u32       d17, q14
++
++                vand            q10, q15
++                vand            q9,  q15
++                vand            q8,  q15
++                blt             2f              @ fewer than 48 samples were left, do a partial write
++
++                vst3.16         {d0,  d2,  d4},  [r0], r12   @ re-interleave 12 samples at r0, r0 += 48
++                vst3.16         {d1,  d3,  d5},  [r4], r12   @ next 12 samples at r4, r4 += 48
++                vst3.16         {d16, d18, d20}, [r0], r12
++                vst3.16         {d17, d19, d21}, [r4], r12
++
++                bne             1b              @ more samples in this row
++
++11:
++                subs            r7,  #1         @ one row done
++                add             r0,  r1         @ r0 += dst_stride - 2 * _w
++                add             r8,  #128       @ next source line
++                bne             10b
++
++                pop             {r4-r8, pc}
++
++@ Partial final write
++2:
++                cmp             r5,  #24-48     @ r5 = remaining - 48 here. At least 24 left?
++                blt             1f
++                vst3.16         {d0,  d2,  d4},  [r0], r12
++                vst3.16         {d1,  d3,  d5},  [r4]
++                beq             11b             @ exactly 24, row complete
++                vmov            q0,  q8         @ move the unwritten samples down into q0-q2
++                sub             r5,  #24
++                vmov            q1,  q9
++                vmov            q2,  q10
++1:
++                cmp             r5,  #12-48     @ at least 12 left?
++                blt             1f
++                vst3.16         {d0,  d2,  d4},  [r0]!
++                beq             11b
++                vmov            d0, d1
++                sub             r5, #12
++                vmov            d2, d3
++                vmov            d4, d5
++1:
++                cmp             r5,  #6-48      @ at least 6 left?
++                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
++                blt             1f
++                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
++                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
++                add             r0,  #12        @ 6 samples * 2 bytes (stores above had no writeback)
++                beq             11b
++                vmov            s0,  s1         @ lanes 2-3 of d0, d2, d4 down to lanes 0-1
++                sub             r5,  #6
++                vmov            s4,  s5
++                vmov            s8,  s9
++1:
++                cmp             r5, #3-48       @ at least 3 left?
++                blt             1f
++                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
++                beq             11b
++                sub             r5, #3
++                vshr.u32        d0, #16         @ lane 1 down to lane 0
++                vshr.u32        d2, #16
++1:
++                cmp             r5, #2-48       @ 2 left, otherwise 1
++                blt             1f
++                vst2.16         {d0[0], d2[0]}, [r0]!
++                b               11b
++1:
++                vst1.16         {d0[0]}, [r0]!
++                b               11b
++
++endfunc
++
++
++@ void ff_rpi_sand30_lines_to_planar_y8(
++@   uint8_t * dest,             // [r0]
++@   unsigned int dst_stride,    // [r1]
++@   const uint8_t * src,        // [r2]
++@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++@   unsigned int src_stride2,   // [sp, #0]  -> r3
++@   unsigned int _x,            // [sp, #4]  Ignored - 0
++@   unsigned int y,             // [sp, #8]  (r7 in prefix)
++@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++@   unsigned int h);            // [sp, #16] -> r7
++@ Unpacks three 10-bit samples per 32-bit word and stores the top 8 bits of each as one byte.
++@ Assumes that we are starting on a stripe boundary and that overreading
++@ within the stripe is OK. However it does respect the dest size for writing
++
++function ff_rpi_sand30_lines_to_planar_y8, export=1
++                push            {r4-r8, lr}     @ +24, so stack args now start at [sp, #24]
++                ldr             r3,  [sp, #24]  @ src_stride2
++                ldr             r6,  [sp, #36]  @ _w
++                ldr             r7,  [sp, #32]  @ y
++                mov             r12, #48        @ post-index step for the paired vst3 stores
++                lsl             r3,  #7         @ r3 = src_stride2 * 128 (vldm below has no writeback)
++                sub             r1,  r1,  r6    @ r1 = dst_stride - _w
++                add             r8,  r2,  r7,  lsl #7   @ r8 = src + y * 128
++                ldr             r7,  [sp, #40]  @ h, counted down once per row
++
++10:
++                mov             r2,  r8         @ read pointer for this row
++                add             r4,  r0,  #24   @ second store pointer, 24 bytes past r0
++                mov             r5,  r6         @ samples still to write in this row
++1:
++                vldm            r2,  {q8-q15}   @ 128 bytes = 32 words = 96 samples
++
++                subs            r5,  #96        @ sets the flags tested by blt 2f and bne 1b
++
++                vmovn.u32       d0,  q8         @ bits 0-15
++                vshrn.u32       d2,  q8,  #12   @ bits 12-27
++                vshrn.u32       d4,  q8,  #16    @ Cannot vshrn.u32 #20! Bits 16-31, finished by the #6 narrow below
++
++                add             r2,  r3         @ r2 += src_stride2 * 128
++
++                vmovn.u32       d1,  q9
++                vshrn.u32       d3,  q9,  #12
++                vshrn.u32       d5,  q9,  #16
++
++                pld             [r2, #0]        @ preload the data for the next pass
++
++                vshrn.u16       d0,  q0,  #2    @ d0 = bits 2-9 of words 0-7
++                vmovn.u16       d1,  q1         @ d1 = bits 12-19
++                vshrn.u16       d2,  q2,  #6    @ d2 = bits 22-29
++
++                vmovn.u32       d16, q10        @ words 8-15, using q8-q10 as scratch
++                vshrn.u32       d18, q10, #12
++                vshrn.u32       d20, q10, #16
++
++                vmovn.u32       d17, q11
++                vshrn.u32       d19, q11, #12
++                vshrn.u32       d21, q11, #16
++
++                pld             [r2, #64]
++
++                vshrn.u16       d4,  q8,  #2    @ results for words 8-15 in d4, d5, d6
++                vmovn.u16       d5,  q9
++                vshrn.u16       d6,  q10, #6
++
++                vmovn.u32       d16, q12        @ words 16-23
++                vshrn.u32       d18, q12, #12
++                vshrn.u32       d20, q12, #16
++
++                vmovn.u32       d17, q13
++                vshrn.u32       d19, q13, #12
++                vshrn.u32       d21, q13, #16
++
++                vshrn.u16       d16, q8,  #2    @ results for words 16-23 in d16, d17, d18
++                vmovn.u16       d17, q9
++                vshrn.u16       d18, q10, #6
++
++                vmovn.u32       d20, q14        @ words 24-31
++                vshrn.u32       d22, q14, #12
++                vshrn.u32       d24, q14, #16
++
++                vmovn.u32       d21, q15
++                vshrn.u32       d23, q15, #12
++                vshrn.u32       d25, q15, #16
++
++                vshrn.u16       d20, q10, #2    @ results for words 24-31 in d20, d21, d22
++                vmovn.u16       d21, q11
++                vshrn.u16       d22, q12, #6
++
++                blt             2f              @ fewer than 96 samples were left, do a partial write
++
++                vst3.8          {d0,  d1,  d2},  [r0], r12   @ re-interleave 24 bytes at r0, r0 += 48
++                vst3.8          {d4,  d5,  d6},  [r4], r12   @ next 24 bytes at r4, r4 += 48
++                vst3.8          {d16, d17, d18}, [r0], r12
++                vst3.8          {d20, d21, d22}, [r4], r12
++
++                bne             1b              @ more samples in this row
++
++11:
++                subs            r7,  #1         @ one row done
++                add             r0,  r1         @ r0 += dst_stride - _w
++                add             r8,  #128       @ next source line
++                bne             10b
++
++                pop             {r4-r8, pc}
++
++@ Partial final write
++2:
++                cmp             r5,  #48-96     @ r5 = remaining - 96 here. At least 48 left?
++                blt             1f
++                vst3.8          {d0,  d1,  d2},  [r0], r12
++                vst3.8          {d4,  d5,  d6},  [r4], r12
++                beq             11b             @ exactly 48, row complete
++                vmov            q0,  q8         @ move the unwritten bytes down into d0-d2 and d4-d6
++                vmov            q2,  q10
++                sub             r5,  #48
++                vmov            d2,  d18
++                vmov            d6,  d22
++1:
++                cmp             r5,  #24-96     @ at least 24 left?
++                blt             1f
++                vst3.8          {d0,  d1,  d2},  [r0]!
++                beq             11b
++                vmov            q0,  q2
++                sub             r5,  #24
++                vmov            d2,  d6
++1:
++                cmp             r5,  #12-96     @ at least 12 left?
++                blt             1f
++                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
++                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
++                vst3.8          {d0[2], d1[2], d2[2]}, [r0]!
++                vst3.8          {d0[3], d1[3], d2[3]}, [r0]!
++                beq             11b
++                vmov            s0,  s1         @ lanes 4-7 of d0, d1, d2 down to lanes 0-3
++                sub             r5,  #12
++                vmov            s2,  s3
++                vmov            s4,  s5
++1:
++                cmp             r5,  #6-96      @ at least 6 left?
++                blt             1f
++                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
++                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
++                @ No further add to r0 here, the two writebacks above already advanced it by 6 bytes
++                beq             11b
++                vshr.u32        d0,  #16        @ lanes 2-3 down to lanes 0-1
++                sub             r5,  #6
++                vshr.u32        d1,  #16
++                vshr.u32        d2,  #16
++1:
++                cmp             r5, #3-96       @ at least 3 left?
++                blt             1f
++                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
++                beq             11b
++                sub             r5, #3
++                vshr.u32        d0, #8          @ lane 1 down to lane 0
++                vshr.u32        d1, #8
++1:
++                cmp             r5, #2-96       @ 2 left, otherwise 1
++                blt             1f
++                vst2.8          {d0[0], d1[0]}, [r0]!
++                b               11b
++1:
++                vst1.8          {d0[0]}, [r0]!
++                b               11b
++
++endfunc
++
++
+diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h
+new file mode 100644
+index 0000000000..d457c10870
+--- /dev/null
++++ b/libavutil/arm/rpi_sand_neon.h
+@@ -0,0 +1,110 @@
++/*
++Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_ARM_SAND_NEON_H
++#define AVUTIL_ARM_SAND_NEON_H
++
++void ff_rpi_sand128b_stripe_to_8_10(
++  uint8_t * dest,             // [r0]
++  const uint8_t * src1,       // [r1]
++  const uint8_t * src2,       // [r2]
++  unsigned int lines);        // [r3]
++
++void ff_rpi_sand8_lines_to_planar_y8(
++  uint8_t * dest,             // [r0]
++  unsigned int dst_stride,    // [r1]
++  const uint8_t * src,        // [r2]
++  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++  unsigned int src_stride2,   // [sp, #0]  -> r3
++  unsigned int _x,            // [sp, #4]  Ignored - 0
++  unsigned int y,             // [sp, #8]  (r7 in prefix)
++  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++  unsigned int h);            // [sp, #16] -> r7
++
++void ff_rpi_sand8_lines_to_planar_c8(
++  uint8_t * dst_u,            // [r0]
++  unsigned int dst_stride_u,  // [r1]
++  uint8_t * dst_v,            // [r2]
++  unsigned int dst_stride_v,  // [r3]
++  const uint8_t * src,        // [sp, #0]  -> r4, r5
++  unsigned int stride1,       // [sp, #4]  128
++  unsigned int stride2,       // [sp, #8]  -> r8
++  unsigned int _x,            // [sp, #12] 0
++  unsigned int y,             // [sp, #16] (r7 in prefix)
++  unsigned int _w,            // [sp, #20] -> r12, r6
++  unsigned int h);            // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y16(
++  uint8_t * dest,             // [r0]
++  unsigned int dst_stride,    // [r1]
++  const uint8_t * src,        // [r2]
++  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++  unsigned int src_stride2,   // [sp, #0]  -> r3
++  unsigned int _x,            // [sp, #4]  Ignored - 0
++  unsigned int y,             // [sp, #8]  (r7 in prefix)
++  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++  unsigned int h);            // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_c16(
++  uint8_t * dst_u,            // [r0]
++  unsigned int dst_stride_u,  // [r1]
++  uint8_t * dst_v,            // [r2]
++  unsigned int dst_stride_v,  // [r3]
++  const uint8_t * src,        // [sp, #0]  -> r4, r5
++  unsigned int stride1,       // [sp, #4]  128
++  unsigned int stride2,       // [sp, #8]  -> r8
++  unsigned int _x,            // [sp, #12] 0
++  unsigned int y,             // [sp, #16] (r7 in prefix)
++  unsigned int _w,            // [sp, #20] -> r6, r9
++  unsigned int h);            // [sp, #24] -> r7
++
++void ff_rpi_sand30_lines_to_planar_p010(
++  uint8_t * dest,             // [r0]
++  unsigned int dst_stride,    // [r1]
++  const uint8_t * src,        // [r2]
++  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++  unsigned int src_stride2,   // [sp, #0]  -> r3
++  unsigned int _x,            // [sp, #4]  Ignored - 0
++  unsigned int y,             // [sp, #8]  (r7 in prefix)
++  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++  unsigned int h);            // [sp, #16] -> r7
++
++void ff_rpi_sand30_lines_to_planar_y8(
++  uint8_t * dest,             // [r0]
++  unsigned int dst_stride,    // [r1]
++  const uint8_t * src,        // [r2]
++  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
++  unsigned int src_stride2,   // [sp, #0]  -> r3
++  unsigned int _x,            // [sp, #4]  Ignored - 0
++  unsigned int y,             // [sp, #8]  (r7 in prefix)
++  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
++  unsigned int h);            // [sp, #16] -> r7
++
++#endif // AVUTIL_ARM_SAND_NEON_H
++
+diff --git a/libavutil/frame.c b/libavutil/frame.c
+index 75e347bf2f..daa6477485 100644
+--- a/libavutil/frame.c
++++ b/libavutil/frame.c
+@@ -16,6 +16,8 @@
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+ 
++#include "config.h"
++
+ #include "channel_layout.h"
+ #include "avassert.h"
+ #include "buffer.h"
+@@ -26,6 +28,9 @@
+ #include "mem.h"
+ #include "samplefmt.h"
+ #include "hwcontext.h"
++#if CONFIG_SAND
++#include "rpi_sand_fns.h"
++#endif
+ 
+ #if FF_API_FRAME_GET_SET
+ MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
+@@ -903,6 +908,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
+         (frame->crop_top + frame->crop_bottom) >= frame->height)
+         return AVERROR(ERANGE);
+ 
++#if CONFIG_SAND
++    // Sand cannot be cropped - do not try
++    if (av_rpi_is_sand_format(frame->format))
++        return 0;
++#endif
++
+     desc = av_pix_fmt_desc_get(frame->format);
+     if (!desc)
+         return AVERROR_BUG;
+diff --git a/libavutil/frame.h b/libavutil/frame.h
+index 7d1f8e2935..a4e7dc915d 100644
+--- a/libavutil/frame.h
++++ b/libavutil/frame.h
+@@ -990,6 +990,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags);
+  */
+ const char *av_frame_side_data_name(enum AVFrameSideDataType type);
+ 
++/* Frame dimensions with the crop_* margins subtracted; the crop fields are used as-is, unchecked. */
++static inline int av_frame_cropped_width(const AVFrame * const frame)
++{
++    return frame->width - (frame->crop_left + frame->crop_right);
++}
++static inline int av_frame_cropped_height(const AVFrame * const frame)
++{
++    return frame->height - (frame->crop_top + frame->crop_bottom);
++}
++
+ /**
+  * @}
+  */
+diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
+index 7a9fdbd263..2f825b7e16 100644
+--- a/libavutil/hwcontext_drm.c
++++ b/libavutil/hwcontext_drm.c
+@@ -21,6 +21,7 @@
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <unistd.h>
++#include <sys/ioctl.h>
+ 
+ /* This was introduced in version 4.6. And may not exist all without an
+  * optional package. So to prevent a hard dependency on needing the Linux
+@@ -31,6 +32,7 @@
+ #endif
+ 
+ #include <drm.h>
++#include <libdrm/drm_fourcc.h>
+ #include <xf86drm.h>
+ 
+ #include "avassert.h"
+@@ -38,7 +40,9 @@
+ #include "hwcontext_drm.h"
+ #include "hwcontext_internal.h"
+ #include "imgutils.h"
+-
++#if CONFIG_SAND
++#include "libavutil/rpi_sand_fns.h"
++#endif
+ 
+ static void drm_device_free(AVHWDeviceContext *hwdev)
+ {
+@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device,
+     AVDRMDeviceContext *hwctx = hwdev->hwctx;
+     drmVersionPtr version;
+ 
++    if (device == NULL) {
++        hwctx->fd = -1;
++        return 0;
++    }
++
+     hwctx->fd = open(device, O_RDWR);
+     if (hwctx->fd < 0)
+         return AVERROR(errno);
+@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
+     if (flags & AV_HWFRAME_MAP_WRITE)
+         mmap_prot |= PROT_WRITE;
+ 
++    if (dst->format == AV_PIX_FMT_NONE)
++        dst->format = hwfc->sw_format;
+ #if HAVE_LINUX_DMA_BUF_H
+     if (flags & AV_HWFRAME_MAP_READ)
+         map->sync_flags |= DMA_BUF_SYNC_READ;
+@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
+ 
+     dst->width  = src->width;
+     dst->height = src->height;
++    dst->crop_top    = src->crop_top;
++    dst->crop_bottom = src->crop_bottom;
++    dst->crop_left   = src->crop_left;
++    dst->crop_right  = src->crop_right;
++
++#if CONFIG_SAND
++    // Rework for sand frames
++    if (av_rpi_is_sand_frame(dst)) {
++        // As it stands the sand formats hold stride2 in linesize[3]
++        // linesize[0] & [1] contain stride1 which is always 128 for everything we do
++        // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
++        dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
++        dst->linesize[0] = 128;
++        dst->linesize[1] = 128;
++        // *** Are we sure src->height is actually what we want ???
++    }
++#endif
+ 
+     err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
+                                 &drm_unmap_frame, map);
+@@ -206,16 +234,29 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
+                                     enum AVHWFrameTransferDirection dir,
+                                     enum AVPixelFormat **formats)
+ {
+-    enum AVPixelFormat *pix_fmts;
++    enum AVPixelFormat *p;
+ 
+-    pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
+-    if (!pix_fmts)
++    p = *formats = av_malloc_array(3, sizeof(*p));
++    if (!p)
+         return AVERROR(ENOMEM);
+ 
+-    pix_fmts[0] = ctx->sw_format;
+-    pix_fmts[1] = AV_PIX_FMT_NONE;
++    // **** Offer native sand too ????
++    *p++ =
++#if CONFIG_SAND
++        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
++            AV_PIX_FMT_YUV420P :
++        ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
++            AV_PIX_FMT_YUV420P10LE :
++#endif
++            ctx->sw_format;
++
++#if CONFIG_SAND
++    if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
++        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
++        *p++ = AV_PIX_FMT_NV12;
++#endif
+ 
+-    *formats = pix_fmts;
++    *p = AV_PIX_FMT_NONE;
+     return 0;
+ }
+ 
+@@ -231,18 +272,63 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc,
+     map = av_frame_alloc();
+     if (!map)
+         return AVERROR(ENOMEM);
+-    map->format = dst->format;
+ 
++    // Map to default
++    map->format = AV_PIX_FMT_NONE;
+     err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
+     if (err)
+         goto fail;
+ 
+-    map->width  = dst->width;
+-    map->height = dst->height;
++#if 0
++    av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
++           hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
++           map->width, map->height,
++           map->linesize[0],
++           map->linesize[1],
++           map->linesize[2],
++           map->linesize[3],
++           dst->width, dst->height,
++           dst->linesize[0],
++           dst->linesize[1],
++           dst->linesize[2]);
++#endif
++#if CONFIG_SAND
++    if (av_rpi_is_sand_frame(map)) {
++        // Preserve crop - later ffmpeg code assumes that we have in that it
++        // overwrites any crop that we create with the old values
++        unsigned int stride2 = map->linesize[3];
++        const unsigned int w = FFMIN(dst->width, map->width);
++        const unsigned int h = FFMIN(dst->height, map->height);
++
++        map->crop_top = 0;
++        map->crop_bottom = 0;
++        map->crop_left = 0;
++        map->crop_right = 0;
++
++        if (av_rpi_sand_to_planar_frame(dst, map) != 0)
++        {
++            av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
++            err = AVERROR(EINVAL);
++            goto fail;
++        }
++
++        dst->width = w;
++        dst->height = h;
++    }
++    else
++#endif
++    {
++        // Kludge mapped h/w s.t. frame_copy works
++        map->width  = dst->width;
++        map->height = dst->height;
++        err = av_frame_copy(dst, map);
++    }
+ 
+-    err = av_frame_copy(dst, map);
+     if (err)
++    {
++        av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
+         goto fail;
++    }
+ 
+     err = 0;
+ fail:
+@@ -257,7 +343,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc,
+     int err;
+ 
+     if (src->width > hwfc->width || src->height > hwfc->height)
++    {   // Report the src dimensions that were actually tested above
++        av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, src->width, hwfc->width, src->height, hwfc->height);
+         return AVERROR(EINVAL);
++    }
+ 
+     map = av_frame_alloc();
+     if (!map)
+diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
+index 18c7a0efc8..bab13a4d50 100644
+--- a/libavutil/pixdesc.c
++++ b/libavutil/pixdesc.c
+@@ -2395,6 +2395,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
+         .name = "vulkan",
+         .flags = AV_PIX_FMT_FLAG_HWACCEL,
+     },
++    [AV_PIX_FMT_SAND128] = {
++        .name = "sand128",
++        .nb_components = 3,
++        .log2_chroma_w = 1,
++        .log2_chroma_h = 1,
++        .comp = {
++            { 0, 1, 0, 0, 8, 0, 7, 1 },        /* Y */
++            { 1, 2, 0, 0, 8, 1, 7, 1 },        /* U */
++            { 1, 2, 1, 0, 8, 1, 7, 2 },        /* V */
++        },
++        .flags = 0,
++    },
++    [AV_PIX_FMT_SAND64_10] = {
++        .name = "sand64_10",
++        .nb_components = 3,
++        .log2_chroma_w = 1,
++        .log2_chroma_h = 1,
++        .comp = {
++            { 0, 2, 0, 0, 10, 0, 9, 1 },        /* Y */
++            { 1, 4, 0, 0, 10, 3, 9, 1 },        /* U */
++            { 1, 4, 2, 0, 10, 3, 9, 3 },        /* V */
++        },
++        .flags = 0,
++    },
++    [AV_PIX_FMT_SAND64_16] = {
++        .name = "sand64_16",
++        .nb_components = 3,
++        .log2_chroma_w = 1,
++        .log2_chroma_h = 1,
++        .comp = {
++            { 0, 2, 0, 0, 16, 0, 15, 1 },        /* Y */
++            { 1, 4, 0, 0, 16, 3, 15, 1 },        /* U */
++            { 1, 4, 2, 0, 16, 3, 15, 3 },        /* V */
++        },
++        .flags = 0,
++    },
++    [AV_PIX_FMT_RPI4_8] = {
++        .name = "rpi4_8",
++        .flags = AV_PIX_FMT_FLAG_HWACCEL,
++    },
++    [AV_PIX_FMT_RPI4_10] = {
++        .name = "rpi4_10",
++        .flags = AV_PIX_FMT_FLAG_HWACCEL,
++    },
+ };
+ #if FF_API_PLUS1_MINUS1
+ FF_ENABLE_DEPRECATION_WARNINGS
+diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
+index 46ef211add..9195ead15f 100644
+--- a/libavutil/pixfmt.h
++++ b/libavutil/pixfmt.h
+@@ -357,6 +357,14 @@ enum AVPixelFormat {
+ 
+     AV_PIX_FMT_Y210BE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
+     AV_PIX_FMT_Y210LE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
++// RPI - not on ifdef so can be got at by calling progs
++// #define so code that uses this can know it is there
++#define AVUTIL_HAVE_PIX_FMT_SAND 1
++    AV_PIX_FMT_SAND128,    ///< 4:2:0  8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
++    AV_PIX_FMT_SAND64_10,  ///< 4:2:0 10-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++    AV_PIX_FMT_SAND64_16,  ///< 4:2:0 16-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
++    AV_PIX_FMT_RPI4_8,     ///< flagged HWACCEL in pixdesc; av_rpi_sand_to_planar_frame converts it like SAND128
++    AV_PIX_FMT_RPI4_10,    ///< flagged HWACCEL in pixdesc; av_rpi_sand_to_planar_frame unpacks it as 3 10-bit samples per 32-bit word
+ 
+     AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined
+     AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined
+diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
+new file mode 100644
+index 0000000000..0d5d203dc3
+--- /dev/null
++++ b/libavutil/rpi_sand_fn_pw.h
+@@ -0,0 +1,227 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++// * Included twice from rpi_sand_fn with different PW
++
++#define STRCAT(x,y) x##y
++
++#if PW == 1
++#define pixel uint8_t
++#define FUNC(f) STRCAT(f, 8)
++#elif PW == 2
++#define pixel uint16_t
++#define FUNC(f) STRCAT(f, 16)
++#else
++#error Unexpected PW
++#endif
++
++// Copies a _w x h byte patch at (_x, y) from a sand plane to a linear buffer.
++// Offscreen fixup not done here, unclipped. stride1 must be a power of 2;
++// consecutive stripes are stride1 * stride2 bytes apart.
++void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    const unsigned int x = _x;
++    const unsigned int w = _w;
++    const unsigned int mask = stride1 - 1;
++
++#if PW == 1 && HAVE_SAND_ASM
++    if (_x == 0 && have_neon(av_get_cpu_flags())) {  // runtime check, as the sand30 fns do
++        ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
++                                     src, stride1, stride2, _x, y, _w, h);
++        return;
++    }
++#endif
++
++    if ((x & ~mask) == ((x + w) & ~mask)) {
++        // All in one sand stripe
++        const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
++            memcpy(dst, p, w);
++        }
++    }
++    else
++    {
++        // Two+ stripe
++        const unsigned int sstride = stride1 * stride2;
++        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++        const uint8_t * p2 = p1 + sstride - (x & mask);  // start of the next stripe, same row
++        const unsigned int w1 = stride1 - (x & mask);    // bytes left in the first stripe
++        const unsigned int w3 = (x + w) & mask;          // bytes used in the last stripe
++        const unsigned int w2 = w - (w1 + w3);           // bytes in whole middle stripes
++
++        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
++            unsigned int j;
++            const uint8_t * p = p2;
++            uint8_t * d = dst;
++            memcpy(d, p1, w1);
++            d += w1;
++            for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
++                memcpy(d, p, stride1);
++            }
++            memcpy(d, p, w3);
++        }
++    }
++}
++
++// Splits interleaved UV read from a sand plane into separate U and V planes.
++// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
++void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
++                             uint8_t * dst_v, const unsigned int dst_stride_v,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    const unsigned int x = _x * 2;
++    const unsigned int w = _w * 2;
++    const unsigned int mask = stride1 - 1;
++
++#if PW == 1 && HAVE_SAND_ASM
++    if (_x == 0 && have_neon(av_get_cpu_flags())) {  // runtime check, as the sand30 fns do
++        ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
++                                     src, stride1, stride2, _x, y, _w, h);
++        return;
++    }
++#endif
++
++    if ((x & ~mask) == ((x + w) & ~mask)) {
++        // All in one sand stripe
++        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
++            pixel * du = (pixel *)dst_u;
++            pixel * dv = (pixel *)dst_v;
++            const pixel * p = (const pixel *)p1;
++            for (unsigned int k = 0; k < w; k += 2 * PW) {
++                *du++ = *p++;
++                *dv++ = *p++;
++            }
++        }
++    }
++    else
++    {
++        // Two+ stripe
++        const unsigned int sstride = stride1 * stride2;
++        const unsigned int sstride_p = (sstride - stride1) / PW;  // end of one stripe row to start of the next stripe, in pixels
++
++        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++        const uint8_t * p2 = p1 + sstride - (x & mask);
++        const unsigned int w1 = stride1 - (x & mask);
++        const unsigned int w3 = (x + w) & mask;
++        const unsigned int w2 = w - (w1 + w3);
++
++        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
++            unsigned int j;
++            const pixel * p = (const pixel *)p1;
++            pixel * du = (pixel *)dst_u;
++            pixel * dv = (pixel *)dst_v;
++            for (unsigned int k = 0; k < w1; k += 2 * PW) {
++                *du++ = *p++;
++                *dv++ = *p++;
++            }
++            for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++                    *du++ = *p++;
++                    *dv++ = *p++;
++                }
++            }
++            for (unsigned int k = 0; k < w3; k += 2 * PW) {
++                *du++ = *p++;
++                *dv++ = *p++;
++            }
++        }
++    }
++}
++
++void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,              // interleaved UV sand plane (written)
++                             unsigned int stride1, unsigned int stride2,
++                             const uint8_t * src_u, const unsigned int src_stride_u,
++                             const uint8_t * src_v, const unsigned int src_stride_v,
++                             unsigned int _x, unsigned int y,     // _x & _w are doubled below to give
++                             unsigned int _w, unsigned int h)     // byte offsets into the interleave
++{
++    const unsigned int x = _x * 2;
++    const unsigned int w = _w * 2;
++    const unsigned int mask = stride1 - 1;
++    if ((x & ~mask) == ((x + w) & ~mask)) {
++        // All in one sand stripe
++        uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
++        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
++            const pixel * su = (const pixel *)src_u;
++            const pixel * sv = (const pixel *)src_v;
++            pixel * p = (pixel *)p1;
++            for (unsigned int k = 0; k < w; k += 2 * PW) {
++                *p++ = *su++;
++                *p++ = *sv++;
++            }
++        }
++    }
++    else
++    {
++        // Two+ stripe
++        const unsigned int sstride = stride1 * stride2;
++        const unsigned int sstride_p = (sstride - stride1) / PW;
++
++        uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;  // written through: must not be const
++        uint8_t * p2 = p1 + sstride - (x & mask);                                 // ditto
++        const unsigned int w1 = stride1 - (x & mask);
++        const unsigned int w3 = (x + w) & mask;
++        const unsigned int w2 = w - (w1 + w3);
++
++        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
++            unsigned int j;
++            const pixel * su = (const pixel *)src_u;
++            const pixel * sv = (const pixel *)src_v;
++            pixel * p = (pixel *)p1;
++            for (unsigned int k = 0; k < w1; k += 2 * PW) {
++                *p++ = *su++;
++                *p++ = *sv++;
++            }
++            for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
++                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
++                    *p++ = *su++;
++                    *p++ = *sv++;
++                }
++            }
++            for (unsigned int k = 0; k < w3; k += 2 * PW) {
++                *p++ = *su++;
++                *p++ = *sv++;
++            }
++        }
++    }
++}
++
++
++#undef pixel
++#undef STRCAT
++#undef FUNC
++
+diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
+new file mode 100644
+index 0000000000..0626bb06cb
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.c
+@@ -0,0 +1,447 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#include "config.h"
++#include <stdint.h>
++#include <string.h>
++#include "rpi_sand_fns.h"
++#include "avassert.h"
++#include "frame.h"
++
++#if ARCH_ARM && HAVE_NEON
++#include "libavutil/arm/cpu.h"
++#include "libavutil/arm/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#elif ARCH_AARCH64 && HAVE_NEON
++#include "libavutil/aarch64/cpu.h"
++#include "libavutil/aarch64/rpi_sand_neon.h"
++#define HAVE_SAND_ASM 1
++#else
++#define HAVE_SAND_ASM 0
++#endif
++
++#define PW 1
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#define PW 2
++#include "rpi_sand_fn_pw.h"
++#undef PW
++
++#if 1
++// Simple round: n 16-bit samples -> 8-bit, (v + half) >> shr, saturated at 255
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++    const unsigned int rnd = (1 << shr) >> 1;
++    const uint16_t * src = (const uint16_t *)_src;
++
++    for (; n != 0; --n) {
++        *dst++ = av_clip_uint8((*src++ + rnd) >> shr);  // rounding can reach 256: clip, don't wrap to 0
++    }
++}
++#else
++// Dithered variation: the bits dropped from one sample are carried into the next
++static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
++{
++    unsigned int rnd = (1 << shr) >> 1;
++    const unsigned int mask = ((1 << shr) - 1);
++    const uint16_t * src = (const uint16_t *)_src;
++
++    for (; n != 0; --n) {
++        rnd = *src++ + (rnd & mask);
++        *dst++ = av_clip_uint8(rnd >> shr);  // carried remainder can reach 256: clip, don't wrap to 0
++    }
++}
++#endif
++
++// Unpacks a patch of 10-bit samples (3 per 32-bit word) into 16-bit planar.
++// Offscreen fixup not done here, unclipped.
++// NOTE(review): this said "w <= stride1" but the loop below steps across stripes - confirm
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++    const unsigned int xskip0 = _x - (x0 >> 2) * 3;  // samples to skip in the first word (0..2)
++    const unsigned int x1 = ((_x + _w) / 3) * 4;     // byte offset of the word holding the end
++    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;  // samples wanted from the last word (0..2)
++    const unsigned int mask = stride1 - 1;
++    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++    if (_x == 0 && have_neon(av_get_cpu_flags())) {
++        ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++        return;
++    }
++#endif
++
++    if (x0 == x1) {
++        // TODO: partial single word xfer (start and end in the same word) is
++        // not implemented - returns here with nothing written to dst
++        return;
++    }
++
++    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++    {
++        unsigned int x = x0;
++        const uint32_t * p = (const uint32_t *)p0;
++        uint16_t * d = (uint16_t *)dst;
++
++        if (xskip0 != 0) {
++            const uint32_t p3 = *p++;
++
++            if (xskip0 == 1)
++                *d++ = (p3 >> 10) & 0x3ff;
++            *d++ = (p3 >> 20) & 0x3ff;
++
++            if (((x += 4) & mask) == 0)  // reached the end of this stripe's row
++                p += slice_inc;
++        }
++
++        while (x != x1) {
++            const uint32_t p3 = *p++;
++            *d++ = p3 & 0x3ff;
++            *d++ = (p3 >> 10) & 0x3ff;
++            *d++ = (p3 >> 20) & 0x3ff;
++
++            if (((x += 4) & mask) == 0)
++                p += slice_inc;
++        }
++
++        if (xrem1 != 0) {
++            const uint32_t p3 = *p;
++
++            *d++ = p3 & 0x3ff;
++            if (xrem1 == 2)
++                *d++ = (p3 >> 10) & 0x3ff;
++        }
++    }
++}
++
++// Unpacks interleaved 10-bit U/V samples (3 per 32-bit word) into separate 16-bit U & V planes
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++                             uint8_t * dst_v, const unsigned int dst_stride_v,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word pair (2 words hold 3 U/V pairs)
++    const unsigned int xskip0 = _x - (x0 >> 3) * 3;  // U/V pairs to skip in the first word pair (0..2)
++    const unsigned int x1 = ((_x + _w) / 3) * 8;
++    const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3;  // U/V pairs wanted from the last word pair (0..2)
++    const unsigned int mask = stride1 - 1;
++    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++    if (_x == 0 && have_neon(av_get_cpu_flags())) {
++        ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
++                                       src, stride1, stride2, _x, y, _w, h);
++        return;
++    }
++#endif
++
++    if (x0 == x1) {
++        // TODO: partial single word-pair xfer (start and end in the same pair)
++        // is not implemented - returns here with nothing written to dst_u/dst_v
++        return;
++    }
++
++    for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
++    {
++        unsigned int x = x0;
++        const uint32_t * p = (const uint32_t *)p0;
++        uint16_t * du = (uint16_t *)dst_u;
++        uint16_t * dv = (uint16_t *)dst_v;
++
++        if (xskip0 != 0) {
++            const uint32_t p3a = *p++;
++            const uint32_t p3b = *p++;
++
++            if (xskip0 == 1)
++            {
++                *du++ = (p3a >> 20) & 0x3ff;
++                *dv++ = (p3b >>  0) & 0x3ff;
++            }
++            *du++ = (p3b >> 10) & 0x3ff;
++            *dv++ = (p3b >> 20) & 0x3ff;
++
++            if (((x += 8) & mask) == 0)  // reached the end of this stripe's row
++                p += slice_inc;
++        }
++
++        while (x != x1) {
++            const uint32_t p3a = *p++;
++            const uint32_t p3b = *p++;
++
++            *du++ = p3a & 0x3ff;
++            *dv++ = (p3a >> 10) & 0x3ff;
++            *du++ = (p3a >> 20) & 0x3ff;
++            *dv++ = p3b & 0x3ff;
++            *du++ = (p3b >> 10) & 0x3ff;
++            *dv++ = (p3b >> 20) & 0x3ff;
++
++            if (((x += 8) & mask) == 0)
++                p += slice_inc;
++        }
++
++        if (xrem1 != 0) {
++            const uint32_t p3a = *p++;
++            const uint32_t p3b = *p++;  // NOTE(review): read even when xrem1 == 1, where it is unused
++
++            *du++ = p3a & 0x3ff;
++            *dv++ = (p3a >> 10) & 0x3ff;
++            if (xrem1 == 2)
++            {
++                *du++ = (p3a >> 20) & 0x3ff;
++                *dv++ = p3b & 0x3ff;
++            }
++        }
++    }
++}
++
++// Unpacks a patch of 10-bit samples (3 per 32-bit word) into 8-bit planar.
++// Offscreen fixup not done here.
++// 10 -> 8 bits is done by dropping the bottom 2 bits (truncation, no rounding)
++// _x & _w in pixels, strides in bytes
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h)
++{
++    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
++    const unsigned int xskip0 = _x - (x0 >> 2) * 3;
++    const unsigned int x1 = ((_x + _w) / 3) * 4;
++    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
++    const unsigned int mask = stride1 - 1;
++    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
++    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
++
++#if HAVE_SAND_ASM
++    if (_x == 0 && have_neon(av_get_cpu_flags())) {  // runtime check, as the y16/c16 variants do
++        ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
++        return;
++    }
++#endif
++
++    if (x0 == x1) {
++        // TODO: partial single word xfer (start and end in the same word) is
++        // not implemented - returns here with nothing written to dst
++        return;
++    }
++
++    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
++    {
++        unsigned int x = x0;
++        const uint32_t * p = (const uint32_t *)p0;
++        uint8_t * d = dst;
++
++        if (xskip0 != 0) {
++            const uint32_t p3 = *p++;
++
++            if (xskip0 == 1)
++                *d++ = (p3 >> 12) & 0xff;
++            *d++ = (p3 >> 22) & 0xff;
++
++            if (((x += 4) & mask) == 0)
++                p += slice_inc;
++        }
++
++        while (x != x1) {
++            const uint32_t p3 = *p++;
++            *d++ = (p3 >> 2) & 0xff;
++            *d++ = (p3 >> 12) & 0xff;
++            *d++ = (p3 >> 22) & 0xff;
++
++            if (((x += 4) & mask) == 0)
++                p += slice_inc;
++        }
++
++        if (xrem1 != 0) {
++            const uint32_t p3 = *p;
++
++            *d++ = (p3 >> 2) & 0xff;
++            if (xrem1 == 2)
++                *d++ = (p3 >> 12) & 0xff;
++        }
++    }
++}
++
++
++
++// Converts a 16-bit-per-sample sand plane to an 8-bit one (sample >> shr, rounded). w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++                         unsigned int w, unsigned int h, const unsigned int shr)
++{
++    const unsigned int n = dst_stride1 / 2;
++    unsigned int j;
++
++    // This is true for our current layouts
++    av_assert0(dst_stride1 == src_stride1);
++
++    // As we have the same stride1 for src & dest and src is wider than dest
++    // then if we loop on src we can always write contiguously to dest
++    // We make no effort to copy an exact width - round up to nearest src stripe
++    // as we will always have storage in dest for that
++
++#if ARCH_ARM && HAVE_NEON
++    if (shr == 3 && src_stride1 == 128 && have_neon(av_get_cpu_flags())) {  // asm only if the CPU has NEON
++        for (j = 0; j + n < w; j += dst_stride1) {
++            uint8_t * d = dst + j * dst_stride2;
++            const uint8_t * s1 = src + j * 2 * src_stride2;
++            const uint8_t * s2 = s1 + src_stride1 * src_stride2;
++
++            ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
++        }
++    }
++    else
++#endif
++    {
++        for (j = 0; j + n < w; j += dst_stride1) {
++            uint8_t * d = dst + j * dst_stride2;
++            const uint8_t * s1 = src + j * 2 * src_stride2;
++            const uint8_t * s2 = s1 + src_stride1 * src_stride2;
++
++            for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
++                cpy16_to_8(d, s1, n, shr);
++                cpy16_to_8(d + n, s2, n, shr);
++            }
++        }
++    }
++
++    // Fix up a trailing dest half stripe
++    if (j < w) {
++        uint8_t * d = dst + j * dst_stride2;
++        const uint8_t * s1 = src + j * 2 * src_stride2;
++
++        for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
++            cpy16_to_8(d, s1, n, shr);
++        }
++    }
++}
++
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
++{
++    const int w = av_frame_cropped_width(src);
++    const int h = av_frame_cropped_height(src);
++    const int x = src->crop_left;
++    const int y = src->crop_top;
++    int rv;
++
++    // We crop as part of the conversion so dst carries no crop of its own
++    dst->crop_top = dst->crop_bottom = dst->crop_left = dst->crop_right = 0;
++
++    switch (src->format){
++        case AV_PIX_FMT_SAND128:
++        case AV_PIX_FMT_RPI4_8:
++            switch (dst->format){
++                case AV_PIX_FMT_YUV420P:
++                    av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++                                             src->data[0],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x, y, w, h);
++                    av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
++                                             dst->data[2], dst->linesize[2],
++                                             src->data[1],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x/2, y/2,  w/2, h/2);
++                    break;
++                case AV_PIX_FMT_NV12:
++                    av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
++                                             src->data[0],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x, y, w, h);
++                    av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
++                                             src->data[1],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x & ~1, y/2, w, h/2);  // UV stays interleaved: byte offset is (x/2)*2
++                    break;
++                default:
++                    return -1;
++            }
++            break;
++        case AV_PIX_FMT_SAND64_10:
++            switch (dst->format){
++                case AV_PIX_FMT_YUV420P10:
++                    av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
++                                             src->data[0],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x*2, y, w*2, h);
++                    av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
++                                             dst->data[2], dst->linesize[2],
++                                             src->data[1],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x, y/2,  w, h/2);
++                    break;
++                default:
++                    return -1;
++            }
++            break;
++        case AV_PIX_FMT_RPI4_10:
++            switch (dst->format){
++                case AV_PIX_FMT_YUV420P10:
++                    av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
++                                             src->data[0],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x, y, w, h);
++                    av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
++                                             dst->data[2], dst->linesize[2],
++                                             src->data[1],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x/2, y/2, w/2, h/2);
++                    break;
++                case AV_PIX_FMT_NV12:
++                    av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
++                                             src->data[0],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x, y, w, h);
++                    av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
++                                             src->data[1],
++                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
++                                             x & ~1, y/2, w, h/2);  // UV stays interleaved: sample offset is (x/2)*2
++                    break;
++                default:
++                    return -1;
++            }
++            break;
++        default:
++            return -1;
++    }
++
++    rv = av_frame_copy_props(dst, src);  // this also copies src's crop_* over dst's
++    dst->crop_top = dst->crop_bottom = dst->crop_left = dst->crop_right = 0;  // so clear it again: data is already cropped
++    return rv;
++}
+diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
+new file mode 100644
+index 0000000000..462ccb8abd
+--- /dev/null
++++ b/libavutil/rpi_sand_fns.h
+@@ -0,0 +1,188 @@
++/*
++Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
++All rights reserved.
++
++Redistribution and use in source and binary forms, with or without
++modification, are permitted provided that the following conditions are met:
++    * Redistributions of source code must retain the above copyright
++      notice, this list of conditions and the following disclaimer.
++    * Redistributions in binary form must reproduce the above copyright
++      notice, this list of conditions and the following disclaimer in the
++      documentation and/or other materials provided with the distribution.
++    * Neither the name of the copyright holder nor the
++      names of its contributors may be used to endorse or promote products
++      derived from this software without specific prior written permission.
++
++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
++DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++
++Authors: John Cox
++*/
++
++#ifndef AVUTIL_RPI_SAND_FNS
++#define AVUTIL_RPI_SAND_FNS
++
++#include "libavutil/frame.h"
++
++// For all these fns _x & _w are measured as coord * PW
++// For the C fns coords are in chroma pels (so luma / 2)
++// Strides are in bytes
++
++void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++
++void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
++                             uint8_t * dst_v, const unsigned int dst_stride_v,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++                             uint8_t * dst_v, const unsigned int dst_stride_v,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++
++void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
++                             unsigned int stride1, unsigned int stride2,
++                             const uint8_t * src_u, const unsigned int src_stride_u,
++                             const uint8_t * src_v, const unsigned int src_stride_v,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
++                             unsigned int stride1, unsigned int stride2,
++                             const uint8_t * src_u, const unsigned int src_stride_u,
++                             const uint8_t * src_v, const unsigned int src_stride_v,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
++                             uint8_t * dst_v, const unsigned int dst_stride_v,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++
++void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
++                             const uint8_t * src,
++                             unsigned int stride1, unsigned int stride2,
++                             unsigned int _x, unsigned int y,
++                             unsigned int _w, unsigned int h);
++
++// w/h in pixels
++void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
++                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
++                         unsigned int w, unsigned int h, const unsigned int shr);
++
++
++// dst must contain required pixel format & allocated data buffers
++// Cropping on the src buffer will be honoured and dst crop will be set to zero
++int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
++
++
++static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
++{
++#ifdef RPI_ZC_SAND128_ONLY
++    // If we are sure we only support 128 byte sand formats replace the
++    // var with a constant which should allow for better optimisation
++    return 128;
++#else
++    return frame->linesize[0];
++#endif
++}
++
++static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
++{
++    return frame->linesize[3];
++}
++
++
++static inline int av_rpi_is_sand_format(const int format)
++{
++    return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10); // range test: assumes the sand formats are contiguous in the enum - TODO confirm
++}
++
++static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
++{
++    return av_rpi_is_sand_format(frame->format);
++}
++
++static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
++{
++    return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
++}
++
++static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
++{
++    return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
++}
++
++static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
++{
++    return (frame->format == AV_PIX_FMT_RPI4_10);
++}
++
++static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
++{
++    return av_rpi_is_sand8_frame(frame) ? 0 : 1;
++}
++
++// If x is measured in bytes (not pixels) then this works for sand64_16 as
++// well as sand128 - but in the general case we work that out
++
++static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
++{
++    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++    const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame); // x_y * 1 for 8-bit sand frames, x_y * 2 otherwise
++    const unsigned int x1 = x & (stride1 - 1); // low bits of x; the offset within one column if stride1 is a power of 2
++    const unsigned int x2 = x ^ x1;            // x with those low bits cleared
++
++    return x1 + stride1 * y + stride2 * x2;
++}
++
++static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
++{
++    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
++    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
++    const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1); // x_c * 2 for 8-bit sand, x_c * 4 otherwise (presumably U+V interleaved - confirm)
++    const unsigned int x1 = x & (stride1 - 1); // low bits of x; the offset within one column if stride1 is a power of 2
++    const unsigned int x2 = x ^ x1;            // x with those low bits cleared
++
++    return x1 + stride1 * y_c + stride2 * x2;
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
++}
++
++static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
++{
++    return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
++}
++
++#endif
++
+diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
+index a9bf6ff9e0..6a0e2dcc09 100644
+--- a/libswscale/aarch64/rgb2rgb.c
++++ b/libswscale/aarch64/rgb2rgb.c
+@@ -30,6 +30,12 @@
+ void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
+                               uint8_t *dest, int width, int height,
+                               int src1Stride, int src2Stride, int dstStride);
++void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv);
+ 
+ av_cold void rgb2rgb_init_aarch64(void)
+ {
+@@ -37,5 +43,7 @@ av_cold void rgb2rgb_init_aarch64(void)
+ 
+     if (have_neon(cpu_flags)) {
+         interleaveBytes = ff_interleave_bytes_neon;
++        ff_rgb24toyv12 = ff_rgb24toyv12_aarch64;
++        ff_bgr24toyv12 = ff_bgr24toyv12_aarch64;
+     }
+ }
+diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
+index d81110ec57..476ca723a0 100644
+--- a/libswscale/aarch64/rgb2rgb_neon.S
++++ b/libswscale/aarch64/rgb2rgb_neon.S
+@@ -77,3 +77,448 @@ function ff_interleave_bytes_neon, export=1
+ 0:
+         ret
+ endfunc
++
++// void ff_rgb24toyv12_aarch64(
++//              const uint8_t *src,             // x0
++//              uint8_t *ydst,                  // x1
++//              uint8_t *udst,                  // x2
++//              uint8_t *vdst,                  // x3
++//              int width,                      // w4
++//              int height,                     // w5
++//              int lumStride,                  // w6
++//              int chromStride,                // w7
++//              int srcStr,                     // [sp, #0]
++//              int32_t *rgb2yuv);              // [sp, #8]
++
++function ff_rgb24toyv12_aarch64, export=1
++        ldr             x15, [sp, #8]             // x15 = rgb2yuv (3x3 table, one row per output)
++        ld1             {v5.s}[0], [x15], #4      // Y row: table[0] -> coeff for src byte 2
++        ld1             {v4.s}[0], [x15], #4      //        table[1] -> coeff for src byte 1
++        ld1             {v3.s}[0], [x15], #4      //        table[2] -> coeff for src byte 0
++        ld1             {v5.s}[1], [x15], #4      // U row: same reversed column order
++        ld1             {v4.s}[1], [x15], #4
++        ld1             {v3.s}[1], [x15], #4
++        ld1             {v5.s}[2], [x15], #4      // V row
++        ld1             {v4.s}[2], [x15], #4
++        ld1             {v3.s}[2], [x15]
++        b               99f                       // shared body expects v3/v4/v5 = per-src-byte columns, lanes s[0..2] = Y/U/V
++endfunc
++
++// void ff_bgr24toyv12_aarch64(
++//              const uint8_t *src,             // x0
++//              uint8_t *ydst,                  // x1
++//              uint8_t *udst,                  // x2
++//              uint8_t *vdst,                  // x3
++//              int width,                      // w4
++//              int height,                     // w5
++//              int lumStride,                  // w6
++//              int chromStride,                // w7
++//              int srcStr,                     // [sp, #0]
++//              int32_t *rgb2yuv);              // [sp, #8]
++
++// regs
++// v0-2         Src bytes - reused as chroma src
++// v3-5         Coeffs (packed very inefficiently - could be squashed)
++// v6           128h
++// v7           128b
++// v8-15        Reserved
++// v16-18       Lo Src expanded as H
++// v19          -
++// v20-22       Hi Src expanded as H
++// v23          -
++// v24          U out
++// v25          U tmp
++// v26          Y out
++// v27-29       Y tmp
++// v30          V out
++// v31          V tmp
++
++// Assumes Little Endian in tail stores & conversion matrix
++
++function ff_bgr24toyv12_aarch64, export=1
++        ldr             x15, [sp, #8]
++        ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
++        ld3             {v3.s, v4.s, v5.s}[1], [x15], #12
++        ld3             {v3.s, v4.s, v5.s}[2], [x15]
++99:
++        ldr             w14, [sp, #0]             // w14 = srcStride (32-bit int, may be negative)
++        movi            v7.8b, #128
++        uxtl            v6.8h, v7.8b
++        // Ensure if nothing to do then we do nothing
++        cmp             w4, #0
++        b.le            90f
++        cmp             w5, #0
++        b.le            90f
++        // If w % 16 != 0 then -16 so we do main loop 1 fewer times with
++        // the remainder done in the tail
++        tst             w4, #15
++        b.eq            1f
++        sub             w4, w4, #16
++1:
++
++// -------------------- Even line body - YUV
++11:
++        subs            w9,  w4, #0
++        mov             x10, x0
++        mov             x11, x1
++        mov             x12, x2
++        mov             x13, x3
++        b.lt            12f
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++        subs            w9, w9, #16
++        b.le            13f
++
++10:
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        bic             v0.8h, #0xff, LSL #8
++        bic             v1.8h, #0xff, LSL #8
++        bic             v2.8h, #0xff, LSL #8
++
++        // Testing shows it is faster to stack the smull/smlal ops together
++        // rather than interleave them between channels and indeed even the
++        // shift/add sections seem happier not interleaved
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        sqrshrun        v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        sqrshrun2       v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // U
++        // Vector subscript *2 as we loaded into S but are only using H
++        smull           v24.4s, v0.4h, v3.h[2]
++        smlal           v24.4s, v1.4h, v4.h[2]
++        smlal           v24.4s, v2.4h, v5.h[2]
++        smull2          v25.4s, v0.8h, v3.h[2]
++        smlal2          v25.4s, v1.8h, v4.h[2]
++        smlal2          v25.4s, v2.8h, v5.h[2]
++
++        // V
++        smull           v30.4s, v0.4h, v3.h[4]
++        smlal           v30.4s, v1.4h, v4.h[4]
++        smlal           v30.4s, v2.4h, v5.h[4]
++        smull2          v31.4s, v0.8h, v3.h[4]
++        smlal2          v31.4s, v1.8h, v4.h[4]
++        smlal2          v31.4s, v2.8h, v5.h[4]
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        shrn            v24.4h, v24.4s, #14
++        shrn2           v24.8h, v25.4s, #14
++        sqrshrn         v24.8b, v24.8h, #1
++        add             v24.8b, v24.8b, v7.8b     // +128
++        shrn            v30.4h, v30.4s, #14
++        shrn2           v30.8h, v31.4s, #14
++        sqrshrn         v30.8b, v30.8h, #1
++        add             v30.8b, v30.8b, v7.8b     // +128
++
++        subs            w9, w9, #16
++
++        st1             {v26.16b}, [x11], #16
++        st1             {v24.8b}, [x12], #8
++        st1             {v30.8b}, [x13], #8
++
++        b.gt            10b
++
++// -------------------- Even line tail - YUV
++// If width % 16 == 0 then simply runs once with preloaded RGB
++// If other then deals with preload & then does remaining tail
++
++13:
++        // Body is simple copy of main loop body minus preload
++
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        bic             v0.8h, #0xff, LSL #8
++        bic             v1.8h, #0xff, LSL #8
++        bic             v2.8h, #0xff, LSL #8
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        sqrshrun        v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        sqrshrun2       v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // U
++        // Vector subscript *2 as we loaded into S but are only using H
++        smull           v24.4s, v0.4h, v3.h[2]
++        smlal           v24.4s, v1.4h, v4.h[2]
++        smlal           v24.4s, v2.4h, v5.h[2]
++        smull2          v25.4s, v0.8h, v3.h[2]
++        smlal2          v25.4s, v1.8h, v4.h[2]
++        smlal2          v25.4s, v2.8h, v5.h[2]
++
++        // V
++        smull           v30.4s, v0.4h, v3.h[4]
++        smlal           v30.4s, v1.4h, v4.h[4]
++        smlal           v30.4s, v2.4h, v5.h[4]
++        smull2          v31.4s, v0.8h, v3.h[4]
++        smlal2          v31.4s, v1.8h, v4.h[4]
++        smlal2          v31.4s, v2.8h, v5.h[4]
++
++        cmp             w9, #-16
++
++        shrn            v24.4h, v24.4s, #14
++        shrn2           v24.8h, v25.4s, #14
++        sqrshrn         v24.8b, v24.8h, #1
++        add             v24.8b, v24.8b, v7.8b     // +128
++        shrn            v30.4h, v30.4s, #14
++        shrn2           v30.8h, v31.4s, #14
++        sqrshrn         v30.8b, v30.8h, #1
++        add             v30.8b, v30.8b, v7.8b     // +128
++
++        // Here:
++        // w9 == 0      width % 16 == 0, tail done
++        // w9 > -16     1st tail done (16 pels), remainder still to go
++        // w9 == -16    shouldn't happen
++        // w9 > -32     2nd tail done
++        // w9 <= -32    shouldn't happen
++
++        b.lt            2f
++        st1             {v26.16b}, [x11], #16
++        st1             {v24.8b}, [x12], #8
++        st1             {v30.8b}, [x13], #8
++        cbz             w9, 3f
++
++12:
++        sub             w9, w9, #16
++
++        tbz             w9, #3, 1f
++        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
++1:      tbz             w9, #2, 1f
++        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
++1:      tbz             w9, #1, 1f
++        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
++1:      tbz             w9, #0, 13b
++        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
++        b               13b
++
++2:
++        tbz             w9, #3, 1f
++        st1             {v26.8b},    [x11], #8
++        st1             {v24.s}[0],  [x12], #4
++        st1             {v30.s}[0],  [x13], #4
++1:      tbz             w9, #2, 1f
++        st1             {v26.s}[2],  [x11], #4
++        st1             {v24.h}[2],  [x12], #2
++        st1             {v30.h}[2],  [x13], #2
++1:      tbz             w9, #1, 1f
++        st1             {v26.h}[6],  [x11], #2
++        st1             {v24.b}[6],  [x12], #1
++        st1             {v30.b}[6],  [x13], #1
++1:      tbz             w9, #0, 1f
++        st1             {v26.b}[14], [x11]
++        st1             {v24.b}[7],  [x12]
++        st1             {v30.b}[7],  [x13]
++1:
++3:
++
++// -------------------- Odd line body - Y only
++
++        subs            w5, w5, #1
++        b.eq            90f
++
++        subs            w9,  w4, #0
++        add             x0, x0, w14, SXTW         // strides are 32-bit ints: sign-extend from W, not SXTX
++        add             x1, x1, w6, SXTW
++        mov             x10, x0
++        mov             x11, x1
++        b.lt            12f
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++        subs            w9, w9, #16
++        b.le            13f
++
++10:
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        // Testing shows it is faster to stack the smull/smlal ops together
++        // rather than interleave them between channels and indeed even the
++        // shift/add sections seem happier not interleaved
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++
++        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
++
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        sqrshrun        v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        sqrshrun2       v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        subs            w9, w9, #16
++
++        st1             {v26.16b}, [x11], #16
++
++        b.gt            10b
++
++// -------------------- Odd line tail - Y
++// If width % 16 == 0 then simply runs once with preloaded RGB
++// If other then deals with preload & then does remaining tail
++
++13:
++        // Body is simple copy of main loop body minus preload
++
++        uxtl            v16.8h, v0.8b
++        uxtl            v17.8h, v1.8b
++        uxtl            v18.8h, v2.8b
++
++        uxtl2           v20.8h, v0.16b
++        uxtl2           v21.8h, v1.16b
++        uxtl2           v22.8h, v2.16b
++
++        // Y0
++        smull           v26.4s, v16.4h, v3.h[0]
++        smlal           v26.4s, v17.4h, v4.h[0]
++        smlal           v26.4s, v18.4h, v5.h[0]
++        smull2          v27.4s, v16.8h, v3.h[0]
++        smlal2          v27.4s, v17.8h, v4.h[0]
++        smlal2          v27.4s, v18.8h, v5.h[0]
++        // Y1
++        smull           v28.4s, v20.4h, v3.h[0]
++        smlal           v28.4s, v21.4h, v4.h[0]
++        smlal           v28.4s, v22.4h, v5.h[0]
++        smull2          v29.4s, v20.8h, v3.h[0]
++        smlal2          v29.4s, v21.8h, v4.h[0]
++        smlal2          v29.4s, v22.8h, v5.h[0]
++
++        cmp             w9, #-16
++
++        shrn            v26.4h, v26.4s, #12
++        shrn2           v26.8h, v27.4s, #12
++        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
++        sqrshrun        v26.8b, v26.8h, #3
++        shrn            v28.4h, v28.4s, #12
++        shrn2           v28.8h, v29.4s, #12
++        add             v28.8h, v28.8h, v6.8h
++        sqrshrun2       v26.16b, v28.8h, #3
++        // Y0/Y1
++
++        // Here:
++        // w9 == 0      width % 16 == 0, tail done
++        // w9 > -16     1st tail done (16 pels), remainder still to go
++        // w9 == -16    shouldn't happen
++        // w9 > -32     2nd tail done
++        // w9 <= -32    shouldn't happen
++
++        b.lt            2f
++        st1             {v26.16b}, [x11], #16
++        cbz             w9, 3f
++
++12:
++        sub             w9, w9, #16
++
++        tbz             w9, #3, 1f
++        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
++1:      tbz             w9, #2, 1f
++        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
++1:      tbz             w9, #1, 1f
++        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
++        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
++1:      tbz             w9, #0, 13b
++        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
++        b               13b
++
++2:
++        tbz             w9, #3, 1f
++        st1             {v26.8b},    [x11], #8
++1:      tbz             w9, #2, 1f
++        st1             {v26.s}[2],  [x11], #4
++1:      tbz             w9, #1, 1f
++        st1             {v26.h}[6],  [x11], #2
++1:      tbz             w9, #0, 1f
++        st1             {v26.b}[14], [x11]
++1:
++3:
++
++// ------------------- Loop to start
++
++        add             x0, x0, w14, SXTW         // sign-extend the 32-bit strides (upper halves of x6/x7/x14 are not sign bits)
++        add             x1, x1, w6, SXTW
++        add             x2, x2, w7, SXTW
++        add             x3, x3, w7, SXTW
++        subs            w5, w5, #1
++        b.gt            11b
++90:
++        ret
++endfunc
+diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
+index a7300f3ba4..ba1db155b0 100644
+--- a/libswscale/rgb2rgb.c
++++ b/libswscale/rgb2rgb.c
+@@ -83,6 +83,31 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
+                        int width, int height,
+                        int lumStride, int chromStride, int srcStride,
+                        int32_t *rgb2yuv);
++void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
++                       uint8_t *udst, uint8_t *vdst,
++                       int width, int height,
++                       int lumStride, int chromStride, int srcStride,
++                       int32_t *rgb2yuv);
++void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst,
++                      uint8_t *udst, uint8_t *vdst,
++                      int width, int height,
++                      int lumStride, int chromStride, int srcStride,
++                      int32_t *rgb2yuv);
++void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst,
++                      uint8_t *udst, uint8_t *vdst,
++                      int width, int height,
++                      int lumStride, int chromStride, int srcStride,
++                      int32_t *rgb2yuv);
++void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst,
++                      uint8_t *udst, uint8_t *vdst,
++                      int width, int height,
++                      int lumStride, int chromStride, int srcStride,
++                      int32_t *rgb2yuv);
++void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst,
++                      uint8_t *udst, uint8_t *vdst,
++                      int width, int height,
++                      int lumStride, int chromStride, int srcStride,
++                      int32_t *rgb2yuv);
+ void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                  int srcStride, int dstStride);
+ void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
+diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
+index 48bba1586a..6329533f18 100644
+--- a/libswscale/rgb2rgb.h
++++ b/libswscale/rgb2rgb.h
+@@ -82,6 +82,9 @@ void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
+ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                       uint8_t *vdst, int width, int height, int lumStride,
+                       int chromStride, int srcStride, int32_t *rgb2yuv);
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                      uint8_t *vdst, int width, int height, int lumStride,
++                      int chromStride, int srcStride, int32_t *rgb2yuv);
+ 
+ /**
+  * Height should be a multiple of 2 and width should be a multiple of 16.
+@@ -131,6 +134,26 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                               int width, int height,
+                               int lumStride, int chromStride, int srcStride,
+                               int32_t *rgb2yuv);
++extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                              int width, int height,
++                              int lumStride, int chromStride, int srcStride,
++                              int32_t *rgb2yuv);
++extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
++extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
++                             int width, int height,
++                             int lumStride, int chromStride, int srcStride,
++                             int32_t *rgb2yuv);
+ extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
+                         int srcStride, int dstStride);
+ 
+diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
+index 42c69801ba..e711589e1e 100644
+--- a/libswscale/rgb2rgb_template.c
++++ b/libswscale/rgb2rgb_template.c
+@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
+  * others are ignored in the C version.
+  * FIXME: Write HQ version.
+  */
+-void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+                    uint8_t *vdst, int width, int height, int lumStride,
+-                   int chromStride, int srcStride, int32_t *rgb2yuv)
++                   int chromStride, int srcStride, int32_t *rgb2yuv,
++                   const uint8_t x[9])
+ {
+-    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
+-    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
+-    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
++    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
+     int y;
+     const int chromWidth = width >> 1;
+ 
+@@ -678,6 +679,19 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[6 * i + 0];
++            unsigned int g = src[6 * i + 1];
++            unsigned int r = src[6 * i + 2];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++        }
+         ydst += lumStride;
+         src  += srcStride;
+ 
+@@ -700,6 +714,125 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
+             ydst[2 * i + 1] = Y;
+         }
++        if ((width & 1) != 0) {
++            unsigned int b = src[6 * i + 0];
++            unsigned int g = src[6 * i + 1];
++            unsigned int r = src[6 * i + 2];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++        }
++        udst += chromStride;
++        vdst += chromStride;
++        ydst += lumStride;
++        src  += srcStride;
++    }
++}
++
++static const uint8_t x_rgb[9] = {
++    RY_IDX, GY_IDX, BY_IDX,
++    RU_IDX, GU_IDX, BU_IDX,
++    RV_IDX, GV_IDX, BV_IDX,
++};
++
++static const uint8_t x_bgr[9] = {
++     BY_IDX, GY_IDX, RY_IDX,
++     BU_IDX, GU_IDX, RU_IDX,
++     BV_IDX, GV_IDX, RV_IDX,
++};
++
++void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv,
++                   const uint8_t x[9])
++{
++    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
++    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
++    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
++    int y;
++    const int chromWidth = width >> 1;
++
++    for (y = 0; y < height; y += 2) {
++        int i;
++        for (i = 0; i < chromWidth; i++) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++
++            b = src[8 * i + 6];
++            g = src[8 * i + 5];
++            r = src[8 * i + 4];
++
++            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++            ydst[2 * i + 1] = Y;
++        }
++        if ((width & 1) != 0) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
++            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
++            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
++
++            udst[i]     = U;
++            vdst[i]     = V;
++            ydst[2 * i] = Y;
++        }
++        ydst += lumStride;
++        src  += srcStride;
++
++        if (y+1 == height)
++            break;
++
++        for (i = 0; i < chromWidth; i++) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++
++            b = src[8 * i + 6];
++            g = src[8 * i + 5];
++            r = src[8 * i + 4];
++
++            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++            ydst[2 * i + 1] = Y;
++        }
++        if ((width & 1) != 0) {
++            unsigned int b = src[8 * i + 2];
++            unsigned int g = src[8 * i + 1];
++            unsigned int r = src[8 * i + 0];
++
++            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
++
++            ydst[2 * i] = Y;
++        }
+         udst += chromStride;
+         vdst += chromStride;
+         ydst += lumStride;
+@@ -707,6 +840,37 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
+     }
+ }
+ 
++static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++// The generic C code does no SIMD-like ops, so simply adding 1 to the src
++// address skips the leading (ignored) alpha/pad byte of each xRGB/xBGR pixel
++static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
++}
++
++static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
++                   uint8_t *vdst, int width, int height, int lumStride,
++                   int chromStride, int srcStride, int32_t *rgb2yuv)
++{
++    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
++}
++
++
+ static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
+                               uint8_t *dest, int width, int height,
+                               int src1Stride, int src2Stride, int dstStride)
+@@ -980,6 +1144,11 @@ static av_cold void rgb2rgb_init_c(void)
+     yuy2toyv12         = yuy2toyv12_c;
+     planar2x           = planar2x_c;
+     ff_rgb24toyv12     = ff_rgb24toyv12_c;
++    ff_bgr24toyv12     = ff_bgr24toyv12_c;
++    ff_rgbxtoyv12      = ff_rgbxtoyv12_c;
++    ff_bgrxtoyv12      = ff_bgrxtoyv12_c;
++    ff_xrgbtoyv12      = ff_xrgbtoyv12_c;
++    ff_xbgrtoyv12      = ff_xbgrtoyv12_c;
+     interleaveBytes    = interleaveBytes_c;
+     deinterleaveBytes  = deinterleaveBytes_c;
+     vu9_to_vu12        = vu9_to_vu12_c;
+diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
+index c4dd8a4d83..da38d7f8ac 100644
+--- a/libswscale/swscale_unscaled.c
++++ b/libswscale/swscale_unscaled.c
+@@ -1655,6 +1655,91 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+     return srcSliceH;
+ }
+ 
++static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                              int srcStride[], int srcSliceY, int srcSliceH,
++                              uint8_t *dst[], int dstStride[])
++{
++    ff_bgr24toyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_bgrxtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_rgbxtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_xbgrtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
++static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[],
++                             int srcStride[], int srcSliceY, int srcSliceH,
++                             uint8_t *dst[], int dstStride[])
++{
++    ff_xrgbtoyv12(
++        src[0],
++        dst[0] +  srcSliceY       * dstStride[0],
++        dst[1] + (srcSliceY >> 1) * dstStride[1],
++        dst[2] + (srcSliceY >> 1) * dstStride[2],
++        c->srcW, srcSliceH,
++        dstStride[0], dstStride[1], srcStride[0],
++        c->input_rgb2yuv_table);
++    if (dst[3])
++        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
++    return srcSliceH;
++}
++
+ static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
+                              int srcStride[], int srcSliceY, int srcSliceH,
+                              uint8_t *dst[], int dstStride[])
+@@ -2035,6 +2120,32 @@ void ff_get_unscaled_swscale(SwsContext *c)
+         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
+         !(flags & SWS_ACCURATE_RND))
+         c->swscale = bgr24ToYv12Wrapper;
++    /* rgb24toYV12 */
++    if (srcFormat == AV_PIX_FMT_RGB24 &&
++        (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = rgb24ToYv12Wrapper;
++
++    /* bgrxtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = bgrxToYv12Wrapper;
++    /* rgbxtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = rgbxToYv12Wrapper;
++    /* xbgrtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = xbgrToYv12Wrapper;
++    /* xrgbtoYV12 */
++    if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
++         (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
++        !(flags & SWS_ACCURATE_RND))
++        c->swscale = xrgbToYv12Wrapper;
+ 
+     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
+     if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
+diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
+index 6c38041ddb..12776ffec7 100644
+--- a/libswscale/tests/swscale.c
++++ b/libswscale/tests/swscale.c
+@@ -23,6 +23,7 @@
+ #include <string.h>
+ #include <inttypes.h>
+ #include <stdarg.h>
++#include <time.h>
+ 
+ #undef HAVE_AV_CONFIG_H
+ #include "libavutil/cpu.h"
+@@ -78,6 +79,15 @@ struct Results {
+     uint32_t crc;
+ };
+ 
++static int time_rep = 0;
++
++static uint64_t utime(void)
++{
++    struct timespec ts;
++    clock_gettime(CLOCK_MONOTONIC, &ts);
++    return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000;
++}
++
+ // test by ref -> src -> dst -> out & compare out against ref
+ // ref & out are YV12
+ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+         goto end;
+     }
+ 
+-    printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
++    printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d",
+            desc_src->name, srcW, srcH,
+            desc_dst->name, dstW, dstH,
+            flags);
+@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
+ 
+     sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
+ 
++    if (time_rep != 0) /* optional timing run, enabled by "-t <reps>" */
++    {
++        const uint64_t now = utime();
++        uint64_t done;
++        for (i = 1; i < time_rep; ++i) { /* '<' not '!=': a negative -t must not spin until i overflows */
++            sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
++        }
++        done = utime();
++        printf(" T=%7"PRIu64"us ", done-now); /* elapsed time is unsigned 64-bit */
++    }
++
+     for (i = 0; i < 4 && dstStride[i]; i++)
+         crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
+                      dstStride[i] * dstH);
+@@ -355,56 +376,78 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4],
+     return 0;
+ }
+ 
+-#define W 96
+-#define H 96
+-
+ int main(int argc, char **argv)
+ {
++    unsigned int W = 96;
++    unsigned int H = 96;
++    unsigned int W2;
++    unsigned int H2;
++    unsigned int S;
+     enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
+     enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
+-    uint8_t *rgb_data   = av_malloc(W * H * 4);
+-    const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
+-    int rgb_stride[4]   = { 4 * W, 0, 0, 0 };
+-    uint8_t *data       = av_malloc(4 * W * H);
+-    const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
+-    int stride[4]       = { W, W, W, W };
+     int x, y;
+     struct SwsContext *sws;
+     AVLFG rand;
+     int res = -1;
+     int i;
+     FILE *fp = NULL;
+-
+-    if (!rgb_data || !data)
+-        return -1;
++    uint8_t *rgb_data;
++    uint8_t * rgb_src[4] = { NULL };
++    int rgb_stride[4]   = { 0 };
++    uint8_t *data;
++    uint8_t * src[4] = { NULL };
++    int stride[4]       = { 0 };
+ 
+     for (i = 1; i < argc; i += 2) {
++        const char * const arg2 = argv[i+1];
++
+         if (argv[i][0] != '-' || i + 1 == argc)
+             goto bad_option;
+         if (!strcmp(argv[i], "-ref")) {
+-            fp = fopen(argv[i + 1], "r");
++            fp = fopen(arg2, "r");
+             if (!fp) {
+-                fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
++                fprintf(stderr, "could not open '%s'\n", arg2);
+                 goto error;
+             }
+         } else if (!strcmp(argv[i], "-cpuflags")) {
+             unsigned flags = av_get_cpu_flags();
+-            int ret = av_parse_cpu_caps(&flags, argv[i + 1]);
++            int ret = av_parse_cpu_caps(&flags, arg2);
+             if (ret < 0) {
+-                fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid cpu flags %s\n", arg2);
+                 return ret;
+             }
+             av_force_cpu_flags(flags);
+         } else if (!strcmp(argv[i], "-src")) {
+-            srcFormat = av_get_pix_fmt(argv[i + 1]);
++            srcFormat = av_get_pix_fmt(arg2);
+             if (srcFormat == AV_PIX_FMT_NONE) {
+-                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid pixel format %s\n", arg2);
+                 return -1;
+             }
+         } else if (!strcmp(argv[i], "-dst")) {
+-            dstFormat = av_get_pix_fmt(argv[i + 1]);
++            dstFormat = av_get_pix_fmt(arg2);
+             if (dstFormat == AV_PIX_FMT_NONE) {
+-                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
++                fprintf(stderr, "invalid pixel format %s\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-w")) {
++            char * p = NULL;
++            W = strtoul(arg2, &p, 0);
++            if (!W || *p) {
++                fprintf(stderr, "bad width %s\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-h")) {
++            char * p = NULL;
++            H = strtoul(arg2, &p, 0);
++            if (!H || *p) {
++                fprintf(stderr, "bad height '%s'\n", arg2);
++                return -1;
++            }
++        } else if (!strcmp(argv[i], "-t")) {
++            char * p = NULL;
++            time_rep = (int)strtol(arg2, &p, 0);
++            if (*p) {
++                fprintf(stderr, "bad time repetitions '%s'\n", arg2);
+                 return -1;
+             }
+         } else {
+@@ -414,15 +457,34 @@ bad_option:
+         }
+     }
+ 
+-    sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
++    S = (W + 15) & ~15;
++    rgb_data   = av_mallocz(S * H * 4);
++    rgb_src[0] = rgb_data;
++    rgb_stride[0]   = 4 * S;
++    data       = av_mallocz(4 * S * H);
++    src[0] = data;
++    src[1] = data + S * H;
++    src[2] = data + S * H * 2;
++    src[3] = data + S * H * 3;
++    stride[0] = S;
++    stride[1] = S;
++    stride[2] = S;
++    stride[3] = S;
++    H2 = H < 96 ? 8 : H / 12;
++    W2 = W < 96 ? 8 : W / 12;
++
++    if (!rgb_data || !data)
++        return -1;
++
++    sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H,
+                          AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
+ 
+     av_lfg_init(&rand, 1);
+ 
+     for (y = 0; y < H; y++)
+         for (x = 0; x < W * 4; x++)
+-            rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
+-    res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride);
++            rgb_data[ x + y * 4 * S] = av_lfg_get(&rand);
++    res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride);
+     if (res < 0 || res != H) {
+         res = -1;
+         goto error;
+@@ -431,10 +493,10 @@ bad_option:
+     av_free(rgb_data);
+ 
+     if(fp) {
+-        res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
++        res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat);
+         fclose(fp);
+     } else {
+-        selfTest(src, stride, W, H, srcFormat, dstFormat);
++        selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat);
+         res = 0;
+     }
+ error:
 diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
 new file mode 100644
-index 0000000000..b050971f63
+index 0000000000..2b62d660c0
 --- /dev/null
 +++ b/pi-util/BUILD.txt
-@@ -0,0 +1,59 @@
+@@ -0,0 +1,67 @@
 +Building Pi FFmpeg
 +==================
 +
@@ -74,6 +70731,8 @@ index 0000000000..b050971f63
 +         paths being confused and therefore running the wrong code,  Shared
 +         is what is needed, in most cases, when building for use by other
 +         programs.
++ --usr   Set install dir to /usr (i.e. system default) rather than in
++         <builddir>/install
 +
 +So for a static build
 +---------------------
@@ -87,25 +70746,31 @@ index 0000000000..b050971f63
 +For a shared build
 +------------------
 +
++There are two choices here
++
 +$ pi-util/conf_native.sh
-+
-+You will normally want an install target if shared. Note that the script has
-+set this up to be generated in out/<builddir>/install, you don't have to worry
-+about overwriting your system libs.
-+
 +$ make -j8 -C out/<builddir> install
 +
++This sets the install prefix to <builddir>/install and is probably what you
++want if you don't want to overwrite the system files.
++
 +You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
-+built or install the image on the system - you have to be careful to get rid
-+of all other ffmpeg libs or confusion may result.  There is a little script
-+that wipes all other versions - obviously use with care!
++built. You can copy the contents of <builddir>/install to /usr and that mostly
++works. The only downside is that paths in pkgconfig end up being set to the
++install directory in your build directory which may be less than ideal when
++building other packages.
 +
++The alternative if you just want to replace the system libs is:
++
++$ pi-util/conf_native.sh --usr
++$ make -j8 -C out/<builddir>
 +$ sudo pi-util/clean_usr_libs.sh
++$ sudo make -j8 -C out/<builddir> install
 +
-+Then simply copying from the install to /usr works
-+
-+$ sudo cp -r out/<builddir>/install/* /usr
-+
++The clean_usr_libs.sh step wipes any existing libs & includes (for all
++architectures) from the system which helps avoid confusion when running other
++progs as you can be sure you're not running old code which is unfortunately
++easy to do otherwise.
 +
 diff --git a/pi-util/NOTES.txt b/pi-util/NOTES.txt
 new file mode 100644
@@ -188,95 +70853,111 @@ index 0000000000..92bc13a3df
 --- /dev/null
 +++ b/pi-util/TESTMESA.txt
 @@ -0,0 +1,82 @@
-+# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
-+
-+# These assume that the drm_mmal test for Sand8 has been built on this Pi
-+# as build relies on many of the same files
-+
-+# 1st get everything required to build ffmpeg
-+# If sources aren't already enabled on your Pi then enable them
-+sudo su
-+sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
-+sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
-+mv /tmp/sources.list /etc/apt/
-+mv /tmp/raspi.list /etc/apt/sources.list.d/
-+apt update
-+
-+# Get dependancies
-+sudo apt build-dep ffmpeg
-+
-+sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
-+
-+# Enable H265 V4L2 request decoder
-+sudo su
-+echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
-+# You may also want to add more CMA if you are going to try 4k videos
-+# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
-+# dtoverlay=vc4-fkms-v3d,cma-512
-+reboot
-+# Check it has turned up
-+ls -la /dev/video*
-+# This should include video19
-+# crw-rw----+ 1 root video 81, 7 Aug  4 17:25 /dev/video19
-+
-+# Currently on the Pi the linux headers from the debian distro don't match
-+# the kernel that we ship and we need to update them - hopefully this step
-+# will be unneeded in the future
-+sudo apt install git bc bison flex libssl-dev make
-+git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
-+cd linux
-+KERNEL=kernel7l
-+make bcm2711_defconfig
-+make headers_install
-+sudo cp -r usr/include/linux /usr/include
-+cd ..
-+
-+# Config - this builds a staticly linked ffmpeg which is easier for testing
-+pi-util/conf_native.sh --noshared
-+
-+# Build (this is a bit dull)
-+# If you want to poke the source the libavdevice/egl_vout.c contains the
-+# output code -
-+cd out/armv7-static-rel
-+
-+# Check that you have actually configured V4L2 request
-+grep HEVC_V4L2REQUEST config.h
-+# You are hoping for
-+# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
-+# if you get 0 then the config has failed
-+
-+make -j6
-+
-+# Grab test streams
-+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
-+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
-+wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
-+
-+# Test i420 output (works currently)
-+./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
-+
-+# Test Sand8 output - doesn't currently work but should once you have
-+# Sand8 working in drm_mmal. I can't guarantee that this will work as
-+# I can't test this path with a known working format, but the debug looks
-+# good.  If this doesn't work & drm_mmal does with sand8 then come back to me
-+# The "show_all 1" forces vout to display every frame otherwise it drops any
-+# frame that would cause it to block
-+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
-+
-+# Test Sand30 - doesn't currently work
-+# (Beware that when FFmpeg errors out it often leaves your teminal window
-+# in a state where you need to reset it)
-+./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
-+
-+
-+
++# Setup & Build instructions for testing Argon30 mesa support (on Pi4)
++
++# These assume that the drm_mmal test for Sand8 has been built on this Pi
++# as build relies on many of the same files
++
++# 1st get everything required to build ffmpeg
++# If sources aren't already enabled on your Pi then enable them
++sudo su
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list > /tmp/sources.list
++sed "s/#deb-src/deb-src/" /etc/apt/sources.list.d/raspi.list > /tmp/raspi.list
++mv /tmp/sources.list /etc/apt/
++mv /tmp/raspi.list /etc/apt/sources.list.d/
++apt update
++
++# Get dependencies
++sudo apt build-dep ffmpeg
++
++sudo apt install meson libepoxy-dev libxcb-dri3-dev libxcb1-dev libx11-dev libx11-xcb-dev libdrm-dev
++
++# Enable H265 V4L2 request decoder
++sudo su
++echo dtoverlay=rpivid-v4l2 >> /boot/config.txt
++# You may also want to add more CMA if you are going to try 4k videos
++# Change the dtoverlay=vc4-fkms-v3d line in config.txt to read
++# dtoverlay=vc4-fkms-v3d,cma-512
++reboot
++# Check it has turned up
++ls -la /dev/video*
++# This should include video19
++# crw-rw----+ 1 root video 81, 7 Aug  4 17:25 /dev/video19
++
++# Currently on the Pi the linux headers from the debian distro don't match
++# the kernel that we ship and we need to update them - hopefully this step
++# will be unneeded in the future
++sudo apt install git bc bison flex libssl-dev make
++git clone --depth=1 https://github.com/raspberrypi/linux --branch rpi-5.10.y
++cd linux
++KERNEL=kernel7l
++make bcm2711_defconfig
++make headers_install
++sudo cp -r usr/include/linux /usr/include
++cd ..
++
++# Config - this builds a statically linked ffmpeg which is easier for testing
++pi-util/conf_native.sh --noshared
++
++# Build (this is a bit dull)
++# If you want to poke the source the libavdevice/egl_vout.c contains the
++# output code -
++cd out/armv7-static-rel
++
++# Check that you have actually configured V4L2 request
++grep HEVC_V4L2REQUEST config.h
++# You are hoping for
++# #define CONFIG_HEVC_V4L2REQUEST_HWACCEL 1
++# if you get 0 then the config has failed
++
++make -j6
++
++# Grab test streams
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-h264.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc.mkv
++wget http://www.jell.yfish.us/media/jellyfish-3-mbps-hd-hevc-10bit.mkv
++
++# Test i420 output (works currently)
++./ffmpeg -no_cvt_hw -vcodec h264_v4l2m2m -i jellyfish-3-mbps-hd-h264.mkv -f vout_egl -
++
++# Test Sand8 output - doesn't currently work but should once you have
++# Sand8 working in drm_mmal. I can't guarantee that this will work as
++# I can't test this path with a known working format, but the debug looks
++# good.  If this doesn't work & drm_mmal does with sand8 then come back to me
++# The "show_all 1" forces vout to display every frame otherwise it drops any
++# frame that would cause it to block
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc.mkv -show_all 1 -f vout_egl -
++
++# Test Sand30 - doesn't currently work
++# (Beware that when FFmpeg errors out it often leaves your terminal window
++# in a state where you need to reset it)
++./ffmpeg -no_cvt_hw -hwaccel drm -vcodec hevc -i jellyfish-3-mbps-hd-hevc-10bit.mkv -f vout_egl -
++
++
++
 diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh
 new file mode 100755
-index 0000000000..b3b2d5509d
+index 0000000000..01bd6a6a22
 --- /dev/null
 +++ b/pi-util/clean_usr_libs.sh
-@@ -0,0 +1,26 @@
+@@ -0,0 +1,42 @@
 +set -e
++U=/usr/include/arm-linux-gnueabihf
++rm -rf $U/libavcodec
++rm -rf $U/libavdevice
++rm -rf $U/libavfilter
++rm -rf $U/libavformat
++rm -rf $U/libavutil
++rm -rf $U/libswresample
++rm -rf $U/libswscale
++U=/usr/include/aarch64-linux-gnu
++rm -rf $U/libavcodec
++rm -rf $U/libavdevice
++rm -rf $U/libavfilter
++rm -rf $U/libavformat
++rm -rf $U/libavutil
++rm -rf $U/libswresample
++rm -rf $U/libswscale
 +U=/usr/lib/arm-linux-gnueabihf
 +rm -f $U/libavcodec.*
 +rm -f $U/libavdevice.*
@@ -359,510 +71040,510 @@ index 0000000000..4efd5d1c67
 --- /dev/null
 +++ b/pi-util/conf_h265.2016.csv
 @@ -0,0 +1,195 @@
-+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
-+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
-+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
-+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
-+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
-+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
-+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
-+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
-+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
-+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
-+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
-+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
-+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
-+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
-+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
-+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
-+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
-+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
-+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
-+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
-+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
-+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
-+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
-+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
-+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
-+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
-+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
-+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
-+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
-+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
-+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
-+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
-+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
-+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
-+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
-+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
-+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
-+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
-+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
-+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
-+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
-+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
-+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
-+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
-+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
-+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
-+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
-+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
-+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
-+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
-+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
-+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
-+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
-+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
-+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
-+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
-+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
-+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
-+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
-+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
-+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
-+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
-+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
-+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
-+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
-+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
-+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
-+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
-+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
-+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
-+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
-+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
-+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
-+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
-+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
-+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
-+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
-+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
-+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
-+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
-+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
-+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
-+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
-+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
-+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
-+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
-+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
-+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
-+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
-+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
-+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
-+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
-+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
-+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
-+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
-+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
-+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
-+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
-+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
-+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
-+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
-+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
-+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
-+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
-+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
-+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
-+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
-+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
-+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
-+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
-+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
-+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
-+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
-+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
-+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
-+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
-+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
-+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
-+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
-+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
-+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
-+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
-+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
-+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
-+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
-+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
-+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
-+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
-+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
-+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
-+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
-+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
-+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
-+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
-+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
-+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
-+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
-+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
-+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
-+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
-+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
-+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
-+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
-+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
-+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
-+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
-+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
-+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
-+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
-+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
-+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
-+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
-+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
-+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
-+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
-+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
-+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
-+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
-+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
-+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
-+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
-+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
-+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
++1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5,8
++1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5,8
++1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5,8
++1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5,8
++1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5,8
++1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5,8
++1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5,8
++1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5,8
++1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5,8
++1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5,8
++1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5,8
++1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5,8
++1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5,8
++1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5,8
++1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5,10
++1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5,8
++1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5,8
++1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5,8
++1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5,8
++1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5,8
++1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5,8
++1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5,8
++1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5,8
++1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5,8
++1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5,8
++1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5,8
++1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5,8
++1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5,8
++1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5,10
++1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5,8
++1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5,8
++1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5,8
++1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5,8
++1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5,8
++1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5,8
++1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5,8
++1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5,8
++1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5,8
++1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5,8
++1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5,8
++1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5,8
++1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5,8
++1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5,8
++1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5,8
++1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5,8
++1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5,8
++1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5,8
++1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5,8
++1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5,8
++1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5,8
++1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5,8
++1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5,8
++1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5,8
++1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5,8
++1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5,8
++1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5,8
++1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5,8
++1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5,8
++1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5,8
++1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5,8
++1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5,8
++1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5,8
++1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5,8
++1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5,8
++1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5,8
++1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5,8
++1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5,8
++1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5,8
++1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5,8
++1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5,8
++1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5,8
++1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5,8
++1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5,8
++1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5,8
++1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5,8
++1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5,8
++1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5,8
++1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5,8
++1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5,8
++1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5,8
++1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5,8
++1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5,8
++1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5,8
++1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5,8
++1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5,8
++1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5,8
++1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5,8
++1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5,8
++1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5,8
++1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt,8
++1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt,8
++1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5,8
++1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5,8
++1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5,8
++1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5,8
++1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5,8
++1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5,8
++1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5,8
++1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5,8
++1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5,8
++1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5,8
++1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5,8
++1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5,8
++1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5,8
++1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5,8
++1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5,8
++3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth,10
++1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5,8
++1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5,8
++3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???,8
++1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5,8
++1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5,8
++1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5,10
++1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5,8
++1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5,0
++0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt,8
++0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt,10
++0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt,8
++0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt,8
++1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt,0
++0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5,10
++0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5,8
++0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5,8
++1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5,10
++1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5,0
++1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5,0
++1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5,0
++0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5,0
++0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5,8
++0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5,8
++1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5,0
++1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5,8
++1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5,0
++1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5,0
++1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5,0
++1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt,0
++1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5,0
++1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5,0
++0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed,8
++0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5,10
++0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5,8
++0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5,8
++0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5,8
++1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5,8
++1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5,8
++1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5,8
++1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5,8
++1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5,8
 diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
 new file mode 100644
 index 0000000000..6082641271
 --- /dev/null
 +++ b/pi-util/conf_h265.2016_HEVC_v1.csv
 @@ -0,0 +1,147 @@
-+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
-+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
-+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
-+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
-+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
-+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
-+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
-+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
-+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
-+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
-+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
-+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
-+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
-+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
-+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
-+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
-+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
-+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
-+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
-+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
-+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
-+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
-+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
-+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
-+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
-+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
-+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
-+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
-+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
-+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
-+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
-+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
-+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
-+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
-+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
-+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
-+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
-+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
-+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
-+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
-+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
-+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
-+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
-+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
-+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
-+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
-+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
-+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
-+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
-+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
-+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
-+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
-+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
-+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
-+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
-+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
-+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
-+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
-+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
-+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
-+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
-+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
-+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
-+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
-+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
-+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
-+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
-+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
-+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
-+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
-+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
-+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
-+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
-+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
-+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
-+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
-+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
-+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
-+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
-+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
-+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
-+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
-+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
-+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
-+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
-+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
-+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
-+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
-+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
-+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
-+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
-+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
-+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
-+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
-+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
-+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
-+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
-+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
-+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
-+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
-+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
-+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
-+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
-+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
-+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
-+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
-+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
-+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
-+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
-+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
-+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
-+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
-+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
-+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
-+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
-+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
-+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
-+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
-+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
-+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
-+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
-+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
-+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
-+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
-+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
-+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
-+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
-+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
-+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
-+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
-+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
-+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
-+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
-+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
++1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
++1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
++1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
++1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
++2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
++2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
++1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
++1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
++1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
 diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
 new file mode 100644
 index 0000000000..fc14f2a3c2
 --- /dev/null
 +++ b/pi-util/conf_h265.csv
 @@ -0,0 +1,144 @@
-+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
-+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
-+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
-+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
-+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
-+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
-+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
-+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
-+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
-+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
-+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
-+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
-+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
-+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
-+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
-+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
-+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
-+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
-+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
-+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
-+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
-+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
-+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
-+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
-+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
-+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
-+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
-+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
-+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
-+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
-+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
-+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
-+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
-+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
-+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
-+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
-+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
-+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
-+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
-+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
-+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
-+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
-+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
-+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
-+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
-+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
-+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
-+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
-+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
-+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
-+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
-+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
-+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
-+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
-+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
-+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
-+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
-+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
-+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
-+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
-+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
-+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
-+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
-+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
-+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
-+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
-+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
-+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
-+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
-+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
-+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
-+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
-+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
-+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
-+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
-+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
-+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
-+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
-+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
-+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
-+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
-+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
-+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
-+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
-+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
-+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
-+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
-+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
-+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
-+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
-+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
-+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
-+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
-+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
-+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
-+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
-+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
-+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
-+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
-+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
-+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
-+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
-+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
-+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
-+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
-+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
-+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
-+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
-+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
-+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
-+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
-+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
-+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
-+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
-+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
-+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
-+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
-+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
-+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
-+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
-+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
-+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
-+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
-+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
-+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
-+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
-+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
-+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
-+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
-+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
-+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
++1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
++1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
++1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
++1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
++1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
++1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
++1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
++1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
++1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
++1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
++1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
++1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
++1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
++1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
++1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
++1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
++1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
++1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
++1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
++1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
++1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
++1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
++1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
++1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
++1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
++1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
++1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
++1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
++1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
++1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
++1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
++1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
++1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
++1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
++1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
++1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
++1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
++1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
++1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
++1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
++1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
++1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
++1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
++1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
++1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
++1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
++1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
++1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
++1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
++1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
++1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
++1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
++1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
++1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
++1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
++1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
++1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
++1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
++1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
++1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
++1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
++1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
++1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
++1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
++1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
++1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
++1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
++1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
++1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
++1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
++1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
++1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
++1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
++1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
++1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
++1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
++1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
++1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
++1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
++1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
++1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
++1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
++1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
++1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
++1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
++1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
++1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
++1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
++1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
++1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
++1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
++1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
++1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
++1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
++1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
++1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
++1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
++1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
++1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
++1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
++1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
++1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
++1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
++1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
++1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
++1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
++1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
++1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
++1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
++1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
++1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
++1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
++1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
++1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
++1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
++1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
++1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
++1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
++1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
++1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
++1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
++1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
++1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
++1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
++1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
++0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
++1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
++1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
++1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
++1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
++1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
++1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
++1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
++1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
++1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
++1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
++1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
++1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
++1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
++1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
++1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
++1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
++1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
++1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
 diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
 new file mode 100755
-index 0000000000..65576846e8
+index 0000000000..5fb69ccee2
 --- /dev/null
 +++ b/pi-util/conf_native.sh
-@@ -0,0 +1,108 @@
+@@ -0,0 +1,127 @@
 +echo "Configure for native build"
 +
 +FFSRC=`pwd`
@@ -874,6 +71555,7 @@ index 0000000000..65576846e8
 +
 +NOSHARED=
 +MMAL=
++USR_PREFIX=
 +
 +while [ "$1" != "" ] ; do
 +    case $1 in
@@ -883,8 +71565,14 @@ index 0000000000..65576846e8
 +	--mmal)
 +	    MMAL=1
 +	    ;;
++	--usr)
++	    USR_PREFIX=/usr
++	    ;;
 +	*)
-+	    echo "Usage $0: [--noshared] [--mmal]"
++	    echo "Usage $0: [--noshared] [--mmal] [--usr]"
++	    echo "  noshared  Build static libs and executable - good for testing"
++	    echo "  mmal      Build mmal decoders"
++	    echo "  usr       Set install prefix to /usr [default=<build-dir>/install]"
 +	    exit 1
 +	    ;;
 +    esac
@@ -898,18 +71586,28 @@ index 0000000000..65576846e8
 +RPI_DEFINES=
 +RPI_EXTRALIBS=
 +
-+if [ "$MC" == "arm64" ]; then
-+  echo "M/C aarch64"
-+  A=aarch64-linux-gnu
-+  B=arm64
-+elif [ "$MC" == "armhf" ]; then
-+  echo "M/C armv7"
-+  A=arm-linux-gnueabihf
-+  B=armv7
-+  MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
-+  RPI_DEFINES=-mfpu=neon-vfpv4
++# uname -m gives kernel type which may not have the same
++# 32/64bitness as userspace :-( getconf shoudl provide the answer
++# but use uname to check we are on the right processor
++MC=`uname -m`
++LB=`getconf LONG_BIT`
++if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then
++  if [ "$LB" == "32" ]; then
++    echo "M/C armv7"
++    A=arm-linux-gnueabihf
++    B=armv7
++    MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
++    RPI_DEFINES=-mfpu=neon-vfpv4
++  elif [ "$LB" == "64" ]; then
++    echo "M/C aarch64"
++    A=aarch64-linux-gnu
++    B=arm64
++  else
++    echo "Unknown LONG_BIT name: $LB"
++    exit 1
++  fi
 +else
-+  echo Unexpected architecture $MC
++  echo "Unknown machine name: $MC"
 +  exit 1
 +fi
 +
@@ -937,7 +71635,9 @@ index 0000000000..65576846e8
 +  OUT=$BUILDBASE/$B-$C-$V-shared-rel
 +fi
 +
-+USR_PREFIX=$OUT/install
++if [ -z "$USR_PREFIX" ]; then  # --usr not given: install under the build dir
++  USR_PREFIX=$OUT/install
++fi
 +LIB_PREFIX=$USR_PREFIX/lib/$A
 +INC_PREFIX=$USR_PREFIX/include/$A
 +
@@ -956,10 +71656,9 @@ index 0000000000..65576846e8
 + --disable-thumb\
 + --enable-v4l2-request\
 + --enable-libdrm\
-+ --enable-epoxy\
-+ --enable-libudev\
 + --enable-vout-egl\
 + --enable-vout-drm\
++ --enable-gpl\
 + $SHARED_LIBS\
 + $RPIOPTS\
 + --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
@@ -968,6 +71667,7 @@ index 0000000000..65576846e8
 + --extra-libs="$RPI_EXTRALIBS"\
 + --extra-version="rpi"
 +
++echo "Configured into $OUT"
 +
 +# gcc option for getting asm listing
 +# -Wa,-ahls
@@ -1544,6 +72244,95 @@ index 0000000000..a4dbb6eacd
 +$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
 +$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
 +
+diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py
+new file mode 100755
+index 0000000000..b322dac0c2
+--- /dev/null
++++ b/pi-util/testfilt.py
+@@ -0,0 +1,83 @@
++#!/usr/bin/env python3
++
++import string
++import os
++import subprocess
++import re
++import argparse
++import sys
++import csv
++from stat import *
++
++class validator:  # base class: holds an "ok" flag that starts False and can only be set
++    def __init__(self):
++        self.ok = False
++
++    def isok(self):  # True once setok() has been called
++        return self.ok
++
++    def setok(self):
++        self.ok = True
++
++class valid_regex(validator):  # validator that becomes ok when a scanned line matches regex
++    def __init__(self, regex):
++        super().__init__()
++        self.regex = re.compile(regex)
++
++    def scanline(self, line):  # uses search(), so the pattern may match anywhere in line
++        if self.isok() or self.regex.search(line):
++            self.setok()
++
++
++def validate(validators, flog):  # returns True only if every validator is ok after scanning all of flog
++    for line in flog:
++        for v in validators:
++            v.scanline(line)
++
++    ok = True
++    for v in validators:
++        if not v.isok():
++            ok = False
++            # complain
++            print("Test failed")  # emitted once for each validator that is not ok
++
++    if ok:
++        print("OK")
++    return ok
++
++def runtest(name, ffmpeg, args, suffix, validators):
++    log_root = os.path.join("/tmp", "testfilt", name)
++    ofilename = os.path.join(log_root, name + suffix)
++
++    if not os.path.exists(log_root):
++        os.makedirs(log_root)
++
++    try:
++        os.remove(ofilename)
++    except:
++        pass
++
++    flog = open(os.path.join(log_root, name + ".log"), "wb")
++    ffargs = [ffmpeg] + args + [ofilename]
++
++    subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False)
++    flog.close
++
++    flog = open(os.path.join(log_root, name + ".log"), "rt")
++    return validate(validators, flog)
++
++def sayok(log_root, flog):  # NOTE(review): never called in this script; both arguments are ignored
++    print("Woohoo")
++    return True
++
++if __name__ == '__main__':  # NOTE(review): runtest()'s result is discarded, so the exit status never reflects a failure
++
++    argp = argparse.ArgumentParser(description="FFmpeg filter tester")
++    argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
++    args = argp.parse_args()  # the only option is --ffmpeg; the input file below is hard-coded
++
++    runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i",
++                                   "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv",
++#                                    "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv",
++                                   "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv",
++            [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')])
 diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
 new file mode 100755
 index 0000000000..5935a11ca5
@@ -1678,33280 +72467,1084 @@ index 0000000000..5935a11ca5
 +
 +    do_logparse(args.logfile)
 +
-
-From f3eaadb27a5bc6db07d33ce0814d796e8cee623e Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 11:27:39 +0100
-Subject: [PATCH 002/136] Add sand pix fmts & conversion fns
-
----
- configure                     |   3 +
- libavutil/Makefile            |   3 +
- libavutil/arm/Makefile        |   1 +
- libavutil/arm/rpi_sand_neon.S | 768 ++++++++++++++++++++++++++++++++++
- libavutil/arm/rpi_sand_neon.h |  99 +++++
- libavutil/pixdesc.c           |  44 ++
- libavutil/pixfmt.h            |   6 +
- libavutil/rpi_sand_fn_pw.h    | 227 ++++++++++
- libavutil/rpi_sand_fns.c      | 353 ++++++++++++++++
- libavutil/rpi_sand_fns.h      | 183 ++++++++
- 10 files changed, 1687 insertions(+)
- create mode 100644 libavutil/arm/rpi_sand_neon.S
- create mode 100644 libavutil/arm/rpi_sand_neon.h
- create mode 100644 libavutil/rpi_sand_fn_pw.h
- create mode 100644 libavutil/rpi_sand_fns.c
- create mode 100644 libavutil/rpi_sand_fns.h
-
-diff --git a/configure b/configure
-index b6616f00b6..27112ced58 100755
---- a/configure
-+++ b/configure
-@@ -344,6 +344,7 @@ External library support:
-   --enable-libvpl          enable Intel oneVPL code via libvpl if libmfx is not used [no]
-   --enable-libnpp          enable Nvidia Performance Primitives-based code [no]
-   --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
-+  --enable-sand            enable sand video formats [rpi]
-   --disable-nvdec          disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
-   --disable-nvenc          disable Nvidia video encoding code [autodetect]
-   --enable-omx             enable OpenMAX IL code [no]
-@@ -1930,6 +1931,7 @@ FEATURE_LIST="
-     omx_rpi
-     runtime_cpudetect
-     safe_bitstream_reader
-+    sand
-     shared
-     small
-     static
-@@ -2495,6 +2497,7 @@ CONFIG_EXTRA="
-     rtpdec
-     rtpenc_chain
-     rv34dsp
-+    sand
-     scene_sad
-     sinewin
-     snappy
-diff --git a/libavutil/Makefile b/libavutil/Makefile
-index dc9012f9a8..e33f5db099 100644
---- a/libavutil/Makefile
-+++ b/libavutil/Makefile
-@@ -73,6 +73,7 @@ HEADERS = adler32.h                                                     \
-           rational.h                                                    \
-           replaygain.h                                                  \
-           ripemd.h                                                      \
-+	  rpi_sand_fns.h                                                \
-           samplefmt.h                                                   \
-           sha.h                                                         \
-           sha512.h                                                      \
-@@ -192,6 +193,7 @@ OBJS-$(CONFIG_MACOS_KPERF)              += macos_kperf.o
- OBJS-$(CONFIG_MEDIACODEC)               += hwcontext_mediacodec.o
- OBJS-$(CONFIG_OPENCL)                   += hwcontext_opencl.o
- OBJS-$(CONFIG_QSV)                      += hwcontext_qsv.o
-+OBJS-$(CONFIG_SAND)                     += rpi_sand_fns.o
- OBJS-$(CONFIG_VAAPI)                    += hwcontext_vaapi.o
- OBJS-$(CONFIG_VIDEOTOOLBOX)             += hwcontext_videotoolbox.o
- OBJS-$(CONFIG_VDPAU)                    += hwcontext_vdpau.o
-@@ -212,6 +214,7 @@ SKIPHEADERS-$(CONFIG_D3D11VA)          += hwcontext_d3d11va.h
- SKIPHEADERS-$(CONFIG_DXVA2)            += hwcontext_dxva2.h
- SKIPHEADERS-$(CONFIG_QSV)              += hwcontext_qsv.h
- SKIPHEADERS-$(CONFIG_OPENCL)           += hwcontext_opencl.h
-+SKIPHEADERS-$(CONFIG-RPI)              += rpi_sand_fn_pw.h
- SKIPHEADERS-$(CONFIG_VAAPI)            += hwcontext_vaapi.h
- SKIPHEADERS-$(CONFIG_VIDEOTOOLBOX)     += hwcontext_videotoolbox.h
- SKIPHEADERS-$(CONFIG_VDPAU)            += hwcontext_vdpau.h
-diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
-index 5da44b0542..b74b7c4e2f 100644
---- a/libavutil/arm/Makefile
-+++ b/libavutil/arm/Makefile
-@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o                                    \
+diff --git a/tests/checkasm/Makefile b/tests/checkasm/Makefile
+index 1827a4e134..3c765a5eb1 100644
+--- a/tests/checkasm/Makefile
++++ b/tests/checkasm/Makefile
+@@ -9,8 +9,10 @@ AVCODECOBJS-$(CONFIG_G722DSP)           += g722dsp.o
+ AVCODECOBJS-$(CONFIG_H264DSP)           += h264dsp.o
+ AVCODECOBJS-$(CONFIG_H264PRED)          += h264pred.o
+ AVCODECOBJS-$(CONFIG_H264QPEL)          += h264qpel.o
++AVCODECOBJS-$(CONFIG_IDCTDSP)           += idctdsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDDSP)          += llviddsp.o
+ AVCODECOBJS-$(CONFIG_LLVIDENCDSP)       += llviddspenc.o
++AVCODECOBJS-$(CONFIG_VC1DSP)            += vc1dsp.o
+ AVCODECOBJS-$(CONFIG_VP8DSP)            += vp8dsp.o
+ AVCODECOBJS-$(CONFIG_VIDEODSP)          += videodsp.o
  
- NEON-OBJS += arm/float_dsp_init_neon.o                                  \
-              arm/float_dsp_neon.o                                       \
-+             arm/rpi_sand_neon.o                                        \
-diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
-new file mode 100644
-index 0000000000..80890fe985
---- /dev/null
-+++ b/libavutil/arm/rpi_sand_neon.S
-@@ -0,0 +1,768 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#include "libavutil/arm/asm.S"
-+
-+
-+@ General notes:
-+@ Having done some timing on this in sand8->y8 (Pi4)
-+@  vst1 (680fps) is a bit faster than vstm (660fps)
-+@  vldm (680fps) is noticably faster than vld1 (480fps)
-+@  (or it might be that a mix is what is required)
-+@
-+@ At least on a Pi4 it is no more expensive to have a single auto-inc register
-+@ for dest address than it is to have 2 used alternately (On Pi3 Ben asserted
-+@ the latter was better)
-+@
-+@ vstm will bus error on unaligned access (so will vldm), vst1 is safe unless
-+@ the memory is uncached.
-+@ As these are Sand -> planar we can assume that src is going to be aligned but
-+@ it is possible that dest isn't (converting to .yuv or other packed format).
-+@ Luckily vst1 is faster than vstm :-) so all is well
-+@ vst1 has alignment requirements of el size so maybe splitting vst1.32 into 4
-+@ .8 stores would let us do non-word aligned stores into uncached but it
-+@ probably isn't worth it.
-+
-+
-+
-+
-+@ void ff_rpi_sand128b_stripe_to_8_10(
-+@   uint8_t * dest,             // [r0]
-+@   const uint8_t * src1,       // [r1]
-+@   const uint8_t * src2,       // [r2]
-+@   unsigned int lines);        // [r3]
-+
-+.macro  stripe2_to_8, bit_depth
-+        vpush    {q4-q7}
-+1:
-+        vldm     r1!, {q0-q7}
-+        subs     r3, #1
-+        vldm     r2!, {q8-q15}
-+        vqrshrn.u16 d0,  q0,  #\bit_depth - 8
-+        vqrshrn.u16 d1,  q1,  #\bit_depth - 8
-+        vqrshrn.u16 d2,  q2,  #\bit_depth - 8
-+        vqrshrn.u16 d3,  q3,  #\bit_depth - 8
-+        vqrshrn.u16 d4,  q4,  #\bit_depth - 8
-+        vqrshrn.u16 d5,  q5,  #\bit_depth - 8
-+        vqrshrn.u16 d6,  q6,  #\bit_depth - 8
-+        vqrshrn.u16 d7,  q7,  #\bit_depth - 8
-+        vqrshrn.u16 d8,  q8,  #\bit_depth - 8
-+        vqrshrn.u16 d9,  q9,  #\bit_depth - 8
-+        vqrshrn.u16 d10, q10, #\bit_depth - 8
-+        vqrshrn.u16 d11, q11, #\bit_depth - 8
-+        vqrshrn.u16 d12, q12, #\bit_depth - 8
-+        vqrshrn.u16 d13, q13, #\bit_depth - 8
-+        vqrshrn.u16 d14, q14, #\bit_depth - 8
-+        vqrshrn.u16 d15, q15, #\bit_depth - 8
-+        vstm     r0!, {q0-q7}
-+        bne      1b
-+        vpop     {q4-q7}
-+        bx       lr
-+.endm
-+
-+function ff_rpi_sand128b_stripe_to_8_10, export=1
-+        stripe2_to_8     10
-+endfunc
-+
-+@ void ff_rpi_sand8_lines_to_planar_y8(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand8_lines_to_planar_y8, export=1
-+                push            {r4-r8, lr}     @ +24            L
-+                ldr             r3,  [sp, #24]
-+                ldr             r6,  [sp, #36]
-+                ldr             r7,  [sp, #32]  @ y
-+                lsl             r3,  #7
-+                sub             r1,  r6
-+                add             r8,  r2,  r7,  lsl #7
-+                ldr             r7,  [sp, #40]
-+
-+10:
-+                mov             r2,  r8
-+                add             r4,  r0,  #24
-+                mov             r5,  r6
-+                mov             lr,  #0
-+1:
-+                vldm            r2,  {q8-q15}
-+                add             r2,  r3
-+                subs            r5,  #128
-+                blt             2f
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                vst1.8          {d20, d21, d22, d23}, [r0]!
-+                vst1.8          {d24, d25, d26, d27}, [r0]!
-+                vst1.8          {d28, d29, d30, d31}, [r0]!
-+                bne             1b
-+11:
-+                subs            r7,  #1
-+                add             r0,  r1
-+                add             r8,  #128
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write
-+2:
-+                cmp             r5,  #64-128
-+                blt             1f
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                vst1.8          {d20, d21, d22, d23}, [r0]!
-+                beq             11b
-+                vmov            q8,  q12
-+                vmov            q9,  q13
-+                sub             r5,  #64
-+                vmov            q10, q14
-+                vmov            q11, q15
-+1:
-+                cmp             r5,  #32-128
-+                blt             1f
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                beq             11b
-+                vmov            q8,  q10
-+                sub             r5,  #32
-+                vmov            q9,  q11
-+1:
-+                cmp             r5,  #16-128
-+                blt             1f
-+                vst1.8          {d16, d17}, [r0]!
-+                beq             11b
-+                sub             r5,  #16
-+                vmov            q8,  q9
-+1:
-+                cmp             r5,  #8-128
-+                blt             1f
-+                vst1.8          {d16}, [r0]!
-+                beq             11b
-+                sub             r5,  #8
-+                vmov            d16, d17
-+1:
-+                cmp             r5,  #4-128
-+                blt             1f
-+                vst1.32         {d16[0]}, [r0]!
-+                beq             11b
-+                sub             r5,  #4
-+                vshr.u64        d16, #32
-+1:
-+                cmp             r5,  #2-128
-+                blt             1f
-+                vst1.16         {d16[0]}, [r0]!
-+                beq             11b
-+                vst1.8          {d16[2]}, [r0]!
-+                b               11b
-+1:
-+                vst1.8          {d16[0]}, [r0]!
-+                b               11b
-+endfunc
-+
-+@ void ff_rpi_sand8_lines_to_planar_c8(
-+@   uint8_t * dst_u,            // [r0]
-+@   unsigned int dst_stride_u,  // [r1]
-+@   uint8_t * dst_v,            // [r2]
-+@   unsigned int dst_stride_v,  // [r3]
-+@   const uint8_t * src,        // [sp, #0]  -> r4, r5
-+@   unsigned int stride1,       // [sp, #4]  128
-+@   unsigned int stride2,       // [sp, #8]  -> r8
-+@   unsigned int _x,            // [sp, #12] 0
-+@   unsigned int y,             // [sp, #16] (r7 in prefix)
-+@   unsigned int _w,            // [sp, #20] -> r12, r6
-+@   unsigned int h);            // [sp, #24] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand8_lines_to_planar_c8, export=1
-+                push            {r4-r8, lr}     @ +24
-+
-+                ldr             r5,  [sp, #24]
-+                ldr             r8,  [sp, #32]
-+                ldr             r7,  [sp, #40]
-+                ldr             r6,  [sp, #44]
-+                lsl             r8,  #7
-+                add             r5,  r5,  r7,  lsl #7
-+                sub             r1,  r1,  r6
-+                sub             r3,  r3,  r6
-+                ldr             r7,  [sp, #48]
-+                vpush           {q4-q7}
-+
-+10:
-+                mov             r4,  r5
-+                mov             r12, r6
-+1:
-+                subs            r12, #64
-+                vldm            r4,  {q0-q7}
-+                add             r4,  r8
-+                it              gt
-+                vldmgt          r4,  {q8-q15}
-+                add             r4,  r8
-+
-+                vuzp.8          q0,  q1
-+                vuzp.8          q2,  q3
-+                vuzp.8          q4,  q5
-+                vuzp.8          q6,  q7
-+
-+                vuzp.8          q8,  q9
-+                vuzp.8          q10, q11
-+                vuzp.8          q12, q13
-+                vuzp.8          q14, q15
-+                subs            r12, #64
-+
-+                @ Rearrange regs so we can use vst1 with 4 regs
-+                vswp            q1,  q2
-+                vswp            q5,  q6
-+                vswp            q9,  q10
-+                vswp            q13, q14
-+                blt             2f
-+
-+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
-+                vst1.8          {d8,  d9,  d10, d11}, [r0]!
-+                vst1.8          {d16, d17, d18, d19}, [r0]!
-+                vst1.8          {d24, d25, d26, d27}, [r0]!
-+
-+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
-+                vst1.8          {d12, d13, d14, d15}, [r2]!
-+                vst1.8          {d20, d21, d22, d23}, [r2]!
-+                vst1.8          {d28, d29, d30, d31}, [r2]!
-+                bne             1b
-+11:
-+                subs            r7,  #1
-+                add             r5,  #128
-+                add             r0,  r1
-+                add             r2,  r3
-+                bne             10b
-+                vpop            {q4-q7}
-+                pop             {r4-r8,pc}
-+
-+2:
-+                cmp             r12, #64-128
-+                blt             1f
-+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
-+                vst1.8          {d8,  d9,  d10, d11}, [r0]!
-+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
-+                vst1.8          {d12, d13, d14, d15}, [r2]!
-+                beq             11b
-+                sub             r12, #64
-+                vmov            q0,  q8
-+                vmov            q1,  q9
-+                vmov            q2,  q10
-+                vmov            q3,  q11
-+                vmov            q4,  q12
-+                vmov            q5,  q13
-+                vmov            q6,  q14
-+                vmov            q7,  q15
-+1:
-+                cmp             r12, #32-128
-+                blt             1f
-+                vst1.8          {d0,  d1,  d2,  d3 }, [r0]!
-+                vst1.8          {d4,  d5,  d6,  d7 }, [r2]!
-+                beq             11b
-+                sub             r12, #32
-+                vmov            q0,  q4
-+                vmov            q1,  q5
-+                vmov            q2,  q6
-+                vmov            q3,  q7
-+1:
-+                cmp             r12, #16-128
-+                blt             1f
-+                vst1.8          {d0,  d1 }, [r0]!
-+                vst1.8          {d4,  d5 }, [r2]!
-+                beq             11b
-+                sub             r12, #16
-+                vmov            q0,  q1
-+                vmov            q2,  q3
-+1:
-+                cmp             r12, #8-128
-+                blt             1f
-+                vst1.8          {d0}, [r0]!
-+                vst1.8          {d4}, [r2]!
-+                beq             11b
-+                sub             r12, #8
-+                vmov            d0,  d1
-+                vmov            d4,  d5
-+1:
-+                cmp             r12, #4-128
-+                blt             1f
-+                vst1.32         {d0[0]}, [r0]!
-+                vst1.32         {d4[0]}, [r2]!
-+                beq             11b
-+                sub             r12, #4
-+                vmov            s0,  s1
-+                vmov            s8,  s9
-+1:
-+                cmp             r12, #2-128
-+                blt             1f
-+                vst1.16         {d0[0]}, [r0]!
-+                vst1.16         {d4[0]}, [r2]!
-+                beq             11b
-+                vst1.8          {d0[2]}, [r0]!
-+                vst1.8          {d4[2]}, [r2]!
-+                b               11b
-+1:
-+                vst1.8          {d0[0]}, [r0]!
-+                vst1.8          {d4[0]}, [r2]!
-+                b               11b
-+endfunc
-+
-+
-+
-+@ void ff_rpi_sand30_lines_to_planar_y16(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand30_lines_to_planar_y16, export=1
-+                push            {r4-r8, lr}     @ +24
-+                ldr             r3,  [sp, #24]
-+                ldr             r6,  [sp, #36]
-+                ldr             r7,  [sp, #32]  @ y
-+                mov             r12, #48
-+                vmov.u16        q15, #0x3ff
-+                sub             r3,  #1
-+                lsl             r3,  #7
-+                sub             r1,  r1,  r6,  lsl #1
-+                add             r8,  r2,  r7,  lsl #7
-+                ldr             r7,  [sp, #40]
-+
-+10:
-+                mov             r2,  r8
-+                add             r4,  r0,  #24
-+                mov             r5,  r6
-+                mov             lr,  #0
-+1:
-+                vldm            r2!, {q10-q13}
-+                add             lr,  #64
-+
-+                vshr.u32        q14, q10, #20    @ Cannot vshrn.u32 #20!
-+                ands            lr,  #127
-+                vshrn.u32       d2,  q10, #10
-+                vmovn.u32       d0,  q10
-+                vmovn.u32       d4,  q14
-+
-+                vshr.u32        q14, q11, #20
-+                it              eq
-+                addeq           r2,  r3
-+                vshrn.u32       d3,  q11, #10
-+                vmovn.u32       d1,  q11
-+                vmovn.u32       d5,  q14
-+
-+                subs            r5,  #48
-+                vand            q0,  q15
-+                vand            q1,  q15
-+                vand            q2,  q15
-+
-+                vshr.u32        q14, q12, #20
-+                vshrn.u32       d18, q12, #10
-+                vmovn.u32       d16, q12
-+                vmovn.u32       d20, q14
-+
-+                vshr.u32        q14, q13, #20
-+                vshrn.u32       d19, q13, #10
-+                vmovn.u32       d17, q13
-+                vmovn.u32       d21, q14
-+
-+                vand            q8,  q15
-+                vand            q9,  q15
-+                vand            q10, q15
-+                blt             2f
-+
-+                vst3.16         {d0,  d2,  d4},  [r0], r12
-+                vst3.16         {d1,  d3,  d5},  [r4], r12
-+                vst3.16         {d16, d18, d20}, [r0], r12
-+                vst3.16         {d17, d19, d21}, [r4], r12
-+
-+                bne             1b
-+
-+11:
-+                subs            r7,  #1
-+                add             r0,  r1
-+                add             r8,  #128
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write
-+2:
-+                cmp             r5,  #24-48
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0], r12
-+                vst3.16         {d1,  d3,  d5},  [r4]
-+                beq             11b
-+                vmov            q0,  q8
-+                sub             r5,  #24
-+                vmov            q1,  q9
-+                vmov            q2,  q10
-+1:
-+                cmp             r5,  #12-48
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0]!
-+                beq             11b
-+                vmov            d0, d1
-+                sub             r5, #12
-+                vmov            d2, d3
-+                vmov            d4, d5
-+1:
-+                cmp             r5,  #6-48
-+                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
-+                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
-+                add             r0,  #12
-+                beq             11b
-+                vmov            s0,  s1
-+                sub             r5,  #6
-+                vmov            s4,  s5
-+                vmov            s8,  s9
-+1:
-+                cmp             r5, #3-48
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
-+                beq             11b
-+                sub             r5, #3
-+                vshr.u32        d0, #16
-+                vshr.u32        d2, #16
-+1:
-+                cmp             r5, #2-48
-+                blt             1f
-+                vst2.16         {d0[0], d2[0]}, [r0]!
-+                b               11b
-+1:
-+                vst1.16         {d0[0]}, [r0]!
-+                b               11b
-+
-+endfunc
-+
-+
-+@ void ff_rpi_sand30_lines_to_planar_c16(
-+@   uint8_t * dst_u,            // [r0]
-+@   unsigned int dst_stride_u,  // [r1]
-+@   uint8_t * dst_v,            // [r2]
-+@   unsigned int dst_stride_v,  // [r3]
-+@   const uint8_t * src,        // [sp, #0]  -> r4, r5
-+@   unsigned int stride1,       // [sp, #4]  128
-+@   unsigned int stride2,       // [sp, #8]  -> r8
-+@   unsigned int _x,            // [sp, #12] 0
-+@   unsigned int y,             // [sp, #16] (r7 in prefix)
-+@   unsigned int _w,            // [sp, #20] -> r6, r9
-+@   unsigned int h);            // [sp, #24] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand30_lines_to_planar_c16, export=1
-+                push            {r4-r10, lr}    @ +32
-+                ldr             r5,  [sp, #32]
-+                ldr             r8,  [sp, #40]
-+                ldr             r7,  [sp, #48]
-+                ldr             r9,  [sp, #52]
-+                mov             r12, #48
-+                vmov.u16        q15, #0x3ff
-+                sub             r8,  #1
-+                lsl             r8,  #7
-+                add             r5,  r5,  r7,  lsl #7
-+                sub             r1,  r1,  r9,  lsl #1
-+                sub             r3,  r3,  r9,  lsl #1
-+                ldr             r7,  [sp, #56]
-+10:
-+                mov             lr,  #0
-+                mov             r4,  r5
-+                mov             r6,  r9
-+1:
-+                vldm            r4!, {q0-q3}
-+                add             lr,  #64
-+
-+                @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
-+                vshr.u32        q14, q0,  #20
-+                vshrn.u32       d16, q0,  #10
-+                vmovn.u32       d18, q0
-+                ands            lr,  #127
-+                vmovn.u32       d20, q14
-+
-+                vshr.u32        q14, q1,  #20
-+                vshrn.u32       d17, q1,  #10
-+                vmovn.u32       d19, q1
-+                vmovn.u32       d21, q14
-+
-+                vshr.u32        q14, q2,  #20
-+                vshrn.u32       d22, q2,  #10
-+                vmovn.u32       d24, q2
-+                vmovn.u32       d26, q14
-+
-+                vshr.u32        q14, q3,  #20
-+                vshrn.u32       d23, q3,  #10
-+                vmovn.u32       d25, q3
-+                add             r10, r0,  #24
-+                vmovn.u32       d27, q14
-+
-+                it              eq
-+                addeq           r4,  r8
-+                vuzp.16         q8,  q11
-+                vuzp.16         q9,  q12
-+                vuzp.16         q10, q13
-+
-+                @ q8   V0, V3,.. -> q0
-+                @ q9   U0, U3...
-+                @ q10  U1, U4...
-+                @ q11  U2, U5,..
-+                @ q12  V1, V4,.. -> q1
-+                @ q13  V2, V5,.. -> q2
-+
-+                subs            r6,  #24
-+                vand            q11, q15
-+                vand            q9,  q15
-+                vand            q10, q15
-+                vand            q0,  q8,  q15
-+                vand            q1,  q12, q15
-+                vand            q2,  q13, q15
-+
-+                blt             2f
-+
-+                vst3.16         {d18, d20, d22}, [r0],  r12
-+                vst3.16         {d19, d21, d23}, [r10]
-+                add             r10, r2,  #24
-+                vst3.16         {d0,  d2,  d4},  [r2],  r12
-+                vst3.16         {d1,  d3,  d5},  [r10]
-+
-+                bne             1b
-+
-+11:
-+                subs            r7,  #1
-+                add             r5,  #128
-+                add             r0,  r1
-+                add             r2,  r3
-+                bne             10b
-+
-+                pop             {r4-r10, pc}
-+
-+@ Partial final write
-+2:
-+                cmp             r6,  #-12
-+                blt             1f
-+                vst3.16         {d18, d20, d22}, [r0]!
-+                vst3.16         {d0,  d2,  d4},  [r2]!
-+                beq             11b
-+                vmov            d18, d19
-+                vmov            d20, d21
-+                vmov            d22, d23
-+                sub             r6,  #12
-+                vmov            d0,  d1
-+                vmov            d2,  d3
-+                vmov            d4,  d5
-+1:
-+                cmp             r6,  #-18
-+                @ Rezip here as it makes the remaining tail handling easier
-+                vzip.16         d0,  d18
-+                vzip.16         d2,  d20
-+                vzip.16         d4,  d22
-+                blt             1f
-+                vst3.16         {d0[1],  d2[1],  d4[1]},  [r0]!
-+                vst3.16         {d0[0],  d2[0],  d4[0]},  [r2]!
-+                vst3.16         {d0[3],  d2[3],  d4[3]},  [r0]!
-+                vst3.16         {d0[2],  d2[2],  d4[2]},  [r2]!
-+                beq             11b
-+                vmov            d0,  d18
-+                vmov            d2,  d20
-+                sub             r6,  #6
-+                vmov            d4,  d22
-+1:
-+                cmp             r6,  #-21
-+                blt             1f
-+                vst3.16         {d0[1], d2[1], d4[1]}, [r0]!
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r2]!
-+                beq             11b
-+                vmov            s4,  s5
-+                sub             r6,  #3
-+                vmov            s0,  s1
-+1:
-+                cmp             r6,  #-22
-+                blt             1f
-+                vst2.16         {d0[1], d2[1]}, [r0]!
-+                vst2.16         {d0[0], d2[0]}, [r2]!
-+                b               11b
-+1:
-+                vst1.16         {d0[1]}, [r0]!
-+                vst1.16         {d0[0]}, [r2]!
-+                b               11b
-+
-+endfunc
-+
-+@ void ff_rpi_sand30_lines_to_planar_p010(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writing
-+
-+function ff_rpi_sand30_lines_to_planar_p010, export=1
-+                push            {r4-r8, lr}     @ +24
-+                ldr             r3,  [sp, #24]
-+                ldr             r6,  [sp, #36]
-+                ldr             r7,  [sp, #32]  @ y
-+                mov             r12, #48
-+                vmov.u16        q15, #0xffc0
-+                sub             r3,  #1
-+                lsl             r3,  #7
-+                sub             r1,  r1,  r6,  lsl #1
-+                add             r8,  r2,  r7,  lsl #7
-+                ldr             r7,  [sp, #40]
-+
-+10:
-+                mov             r2,  r8
-+                add             r4,  r0,  #24
-+                mov             r5,  r6
-+                mov             lr,  #0
-+1:
-+                vldm            r2!, {q10-q13}
-+                add             lr,  #64
-+
-+                vshl.u32        q14, q10, #6
-+                ands            lr,  #127
-+                vshrn.u32       d4,  q10, #14
-+                vshrn.u32       d2,  q10, #4
-+                vmovn.u32       d0,  q14
-+
-+                vshl.u32        q14, q11, #6
-+                it              eq
-+                addeq           r2,  r3
-+                vshrn.u32       d5,  q11, #14
-+                vshrn.u32       d3,  q11, #4
-+                vmovn.u32       d1,  q14
-+
-+                subs            r5,  #48
-+                vand            q2,  q15
-+                vand            q1,  q15
-+                vand            q0,  q15
-+
-+                vshl.u32        q14, q12, #6
-+                vshrn.u32       d20, q12, #14
-+                vshrn.u32       d18, q12, #4
-+                vmovn.u32       d16, q14
-+
-+                vshl.u32        q14, q13, #6
-+                vshrn.u32       d21, q13, #14
-+                vshrn.u32       d19, q13, #4
-+                vmovn.u32       d17, q14
-+
-+                vand            q10, q15
-+                vand            q9,  q15
-+                vand            q8,  q15
-+                blt             2f
-+
-+                vst3.16         {d0,  d2,  d4},  [r0], r12
-+                vst3.16         {d1,  d3,  d5},  [r4], r12
-+                vst3.16         {d16, d18, d20}, [r0], r12
-+                vst3.16         {d17, d19, d21}, [r4], r12
-+
-+                bne             1b
-+
-+11:
-+                subs            r7,  #1
-+                add             r0,  r1
-+                add             r8,  #128
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write
-+2:
-+                cmp             r5,  #24-48
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0], r12
-+                vst3.16         {d1,  d3,  d5},  [r4]
-+                beq             11b
-+                vmov            q0,  q8
-+                sub             r5,  #24
-+                vmov            q1,  q9
-+                vmov            q2,  q10
-+1:
-+                cmp             r5,  #12-48
-+                blt             1f
-+                vst3.16         {d0,  d2,  d4},  [r0]!
-+                beq             11b
-+                vmov            d0, d1
-+                sub             r5, #12
-+                vmov            d2, d3
-+                vmov            d4, d5
-+1:
-+                cmp             r5,  #6-48
-+                add             r4,  r0,  #6    @ avoid [r0]! on sequential instructions
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]
-+                vst3.16         {d0[1], d2[1], d4[1]}, [r4]
-+                add             r0,  #12
-+                beq             11b
-+                vmov            s0,  s1
-+                sub             r5,  #6
-+                vmov            s4,  s5
-+                vmov            s8,  s9
-+1:
-+                cmp             r5, #3-48
-+                blt             1f
-+                vst3.16         {d0[0], d2[0], d4[0]}, [r0]!
-+                beq             11b
-+                sub             r5, #3
-+                vshr.u32        d0, #16
-+                vshr.u32        d2, #16
-+1:
-+                cmp             r5, #2-48
-+                blt             1f
-+                vst2.16         {d0[0], d2[0]}, [r0]!
-+                b               11b
-+1:
-+                vst1.16         {d0[0]}, [r0]!
-+                b               11b
-+
-+endfunc
-+
-+
-+
-diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h
-new file mode 100644
-index 0000000000..447f367bea
---- /dev/null
-+++ b/libavutil/arm/rpi_sand_neon.h
-@@ -0,0 +1,99 @@
-+/*
-+Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#ifndef AVUTIL_ARM_SAND_NEON_H
-+#define AVUTIL_ARM_SAND_NEON_H
-+
-+void ff_rpi_sand128b_stripe_to_8_10(
-+  uint8_t * dest,             // [r0]
-+  const uint8_t * src1,       // [r1]
-+  const uint8_t * src2,       // [r2]
-+  unsigned int lines);        // [r3]
-+
-+void ff_rpi_sand8_lines_to_planar_y8(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
-+void ff_rpi_sand8_lines_to_planar_c8(
-+  uint8_t * dst_u,            // [r0]
-+  unsigned int dst_stride_u,  // [r1]
-+  uint8_t * dst_v,            // [r2]
-+  unsigned int dst_stride_v,  // [r3]
-+  const uint8_t * src,        // [sp, #0]  -> r4, r5
-+  unsigned int stride1,       // [sp, #4]  128
-+  unsigned int stride2,       // [sp, #8]  -> r8
-+  unsigned int _x,            // [sp, #12] 0
-+  unsigned int y,             // [sp, #16] (r7 in prefix)
-+  unsigned int _w,            // [sp, #20] -> r12, r6
-+  unsigned int h);            // [sp, #24] -> r7
-+
-+void ff_rpi_sand30_lines_to_planar_y16(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
-+void ff_rpi_sand30_lines_to_planar_c16(
-+  uint8_t * dst_u,            // [r0]
-+  unsigned int dst_stride_u,  // [r1]
-+  uint8_t * dst_v,            // [r2]
-+  unsigned int dst_stride_v,  // [r3]
-+  const uint8_t * src,        // [sp, #0]  -> r4, r5
-+  unsigned int stride1,       // [sp, #4]  128
-+  unsigned int stride2,       // [sp, #8]  -> r8
-+  unsigned int _x,            // [sp, #12] 0
-+  unsigned int y,             // [sp, #16] (r7 in prefix)
-+  unsigned int _w,            // [sp, #20] -> r6, r9
-+  unsigned int h);            // [sp, #24] -> r7
-+
-+void ff_rpi_sand30_lines_to_planar_p010(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
-+#endif // AVUTIL_ARM_SAND_NEON_H
-+
-diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
-index 62a2ae08d9..cb73521ea7 100644
---- a/libavutil/pixdesc.c
-+++ b/libavutil/pixdesc.c
-@@ -2717,6 +2717,50 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
-         .flags = AV_PIX_FMT_FLAG_RGB | AV_PIX_FMT_FLAG_FLOAT |
-                  AV_PIX_FMT_FLAG_ALPHA,
-     },
-+    [AV_PIX_FMT_SAND128] = {
-+        .name = "sand128",
-+        .nb_components = 3,
-+        .log2_chroma_w = 1,
-+        .log2_chroma_h = 1,
-+        .comp = {
-+            { 0, 1, 0, 0, 8 },        /* Y */
-+            { 1, 2, 0, 0, 8 },        /* U */
-+            { 1, 2, 1, 0, 8 },        /* V */
-+        },
-+        .flags = 0,
-+    },
-+    [AV_PIX_FMT_SAND64_10] = {
-+        .name = "sand64_10",
-+        .nb_components = 3,
-+        .log2_chroma_w = 1,
-+        .log2_chroma_h = 1,
-+        .comp = {
-+            { 0, 2, 0, 0, 10 },        /* Y */
-+            { 1, 4, 0, 0, 10 },        /* U */
-+            { 1, 4, 2, 0, 10 },        /* V */
-+        },
-+        .flags = 0,
-+    },
-+    [AV_PIX_FMT_SAND64_16] = {
-+        .name = "sand64_16",
-+        .nb_components = 3,
-+        .log2_chroma_w = 1,
-+        .log2_chroma_h = 1,
-+        .comp = {
-+            { 0, 2, 0, 0, 16 },        /* Y */
-+            { 1, 4, 0, 0, 16 },        /* U */
-+            { 1, 4, 2, 0, 16 },        /* V */
-+        },
-+        .flags = 0,
-+    },
-+    [AV_PIX_FMT_RPI4_8] = {
-+        .name = "rpi4_8",
-+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
-+    },
-+    [AV_PIX_FMT_RPI4_10] = {
-+        .name = "rpi4_10",
-+        .flags = AV_PIX_FMT_FLAG_HWACCEL,
-+    },
+@@ -35,6 +37,7 @@ CHECKASMOBJS-$(CONFIG_AVCODEC)          += $(AVCODECOBJS-yes)
+ # libavfilter tests
+ AVFILTEROBJS-$(CONFIG_AFIR_FILTER) += af_afir.o
+ AVFILTEROBJS-$(CONFIG_BLEND_FILTER) += vf_blend.o
++AVFILTEROBJS-$(CONFIG_BWDIF_FILTER)      += vf_bwdif.o
+ AVFILTEROBJS-$(CONFIG_COLORSPACE_FILTER) += vf_colorspace.o
+ AVFILTEROBJS-$(CONFIG_EQ_FILTER)         += vf_eq.o
+ AVFILTEROBJS-$(CONFIG_GBLUR_FILTER)      += vf_gblur.o
+@@ -52,8 +55,9 @@ CHECKASMOBJS-$(CONFIG_SWSCALE)  += $(SWSCALEOBJS)
+ # libavutil tests
+ AVUTILOBJS                              += fixed_dsp.o
+ AVUTILOBJS                              += float_dsp.o
++AVUTILOBJS-$(CONFIG_SAND)               += rpi_sand.o
+ 
+-CHECKASMOBJS-$(CONFIG_AVUTIL)  += $(AVUTILOBJS)
++CHECKASMOBJS-$(CONFIG_AVUTIL)  += $(AVUTILOBJS) $(AVUTILOBJS-yes)
+ 
+ CHECKASMOBJS-$(ARCH_AARCH64)            += aarch64/checkasm.o
+ CHECKASMOBJS-$(HAVE_ARMV5TE_EXTERNAL)   += arm/checkasm.o
+diff --git a/tests/checkasm/checkasm.c b/tests/checkasm/checkasm.c
+index 8338e8ff58..c1ee09c72e 100644
+--- a/tests/checkasm/checkasm.c
++++ b/tests/checkasm/checkasm.c
+@@ -131,6 +131,9 @@ static const struct {
+     #if CONFIG_HUFFYUV_DECODER
+         { "huffyuvdsp", checkasm_check_huffyuvdsp },
+     #endif
++    #if CONFIG_IDCTDSP
++        { "idctdsp", checkasm_check_idctdsp },
++    #endif
+     #if CONFIG_JPEG2000_DECODER
+         { "jpeg2000dsp", checkasm_check_jpeg2000dsp },
+     #endif
+@@ -155,6 +158,9 @@ static const struct {
+     #if CONFIG_V210_ENCODER
+         { "v210enc", checkasm_check_v210enc },
+     #endif
++    #if CONFIG_VC1DSP
++        { "vc1dsp", checkasm_check_vc1dsp },
++    #endif
+     #if CONFIG_VP8DSP
+         { "vp8dsp", checkasm_check_vp8dsp },
+     #endif
+@@ -172,6 +178,9 @@ static const struct {
+     #if CONFIG_BLEND_FILTER
+         { "vf_blend", checkasm_check_blend },
+     #endif
++    #if CONFIG_BWDIF_FILTER
++        { "vf_bwdif", checkasm_check_vf_bwdif },
++    #endif
+     #if CONFIG_COLORSPACE_FILTER
+         { "vf_colorspace", checkasm_check_colorspace },
+     #endif
+@@ -198,6 +207,9 @@ static const struct {
+ #if CONFIG_AVUTIL
+         { "fixed_dsp", checkasm_check_fixed_dsp },
+         { "float_dsp", checkasm_check_float_dsp },
++    #if CONFIG_SAND
++        { "rpi_sand",  checkasm_check_rpi_sand },
++    #endif
+ #endif
+     { NULL }
  };
- 
- static const char * const color_range_names[] = {
-diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
-index 37c2c79e01..22f70007c3 100644
---- a/libavutil/pixfmt.h
-+++ b/libavutil/pixfmt.h
-@@ -377,6 +377,12 @@ enum AVPixelFormat {
- 
-     AV_PIX_FMT_Y210BE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
-     AV_PIX_FMT_Y210LE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
-+// RPI - not on ifdef so can be got at by calling progs
-+    AV_PIX_FMT_SAND128,    ///< 4:2:0  8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
-+    AV_PIX_FMT_SAND64_10,  ///< 4:2:0 10-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-+    AV_PIX_FMT_SAND64_16,  ///< 4:2:0 16-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-+    AV_PIX_FMT_RPI4_8,
-+    AV_PIX_FMT_RPI4_10,
- 
-     AV_PIX_FMT_X2RGB10LE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), little-endian, X=unused/undefined
-     AV_PIX_FMT_X2RGB10BE, ///< packed RGB 10:10:10, 30bpp, (msb)2X 10R 10G 10B(lsb), big-endian, X=unused/undefined
-diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
+diff --git a/tests/checkasm/checkasm.h b/tests/checkasm/checkasm.h
+index ef6645e3a2..02d3642836 100644
+--- a/tests/checkasm/checkasm.h
++++ b/tests/checkasm/checkasm.h
+@@ -70,12 +70,14 @@ void checkasm_check_hevc_epel_bi(void);
+ void checkasm_check_hevc_epel_bi_w(void);
+ void checkasm_check_hevc_sao(void);
+ void checkasm_check_huffyuvdsp(void);
++void checkasm_check_idctdsp(void);
+ void checkasm_check_jpeg2000dsp(void);
+ void checkasm_check_llviddsp(void);
+ void checkasm_check_llviddspenc(void);
+ void checkasm_check_nlmeans(void);
+ void checkasm_check_opusdsp(void);
+ void checkasm_check_pixblockdsp(void);
++void checkasm_check_rpi_sand(void);
+ void checkasm_check_sbrdsp(void);
+ void checkasm_check_synth_filter(void);
+ void checkasm_check_sw_rgb(void);
+@@ -83,6 +85,8 @@ void checkasm_check_sw_scale(void);
+ void checkasm_check_utvideodsp(void);
+ void checkasm_check_v210dec(void);
+ void checkasm_check_v210enc(void);
++void checkasm_check_vc1dsp(void);
++void checkasm_check_vf_bwdif(void);
+ void checkasm_check_vf_eq(void);
+ void checkasm_check_vf_gblur(void);
+ void checkasm_check_vf_hflip(void);
+diff --git a/tests/checkasm/idctdsp.c b/tests/checkasm/idctdsp.c
 new file mode 100644
-index 0000000000..0324f6826d
+index 0000000000..02724536a7
 --- /dev/null
-+++ b/libavutil/rpi_sand_fn_pw.h
-@@ -0,0 +1,227 @@
++++ b/tests/checkasm/idctdsp.c
+@@ -0,0 +1,98 @@
 +/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+// * Included twice from rpi_sand_fn with different PW
-+
-+#define STRCAT(x,y) x##y
-+
-+#if PW == 1
-+#define pixel uint8_t
-+#define FUNC(f) STRCAT(f, 8)
-+#elif PW == 2
-+#define pixel uint16_t
-+#define FUNC(f) STRCAT(f, 16)
-+#else
-+#error Unexpected PW
-+#endif
-+
-+// Fetches a single patch - offscreen fixup not done here
-+// w <= stride1
-+// unclipped
-+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x = _x;
-+    const unsigned int w = _w;
-+    const unsigned int mask = stride1 - 1;
-+
-+#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
-+    if (_x == 0) {
-+        ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
-+                                     src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if ((x & ~mask) == ((x + w) & ~mask)) {
-+        // All in one sand stripe
-+        const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
-+            memcpy(dst, p, w);
-+        }
-+    }
-+    else
-+    {
-+        // Two+ stripe
-+        const unsigned int sstride = stride1 * stride2;
-+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        const uint8_t * p2 = p1 + sstride - (x & mask);
-+        const unsigned int w1 = stride1 - (x & mask);
-+        const unsigned int w3 = (x + w) & mask;
-+        const unsigned int w2 = w - (w1 + w3);
-+
-+        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
-+            unsigned int j;
-+            const uint8_t * p = p2;
-+            uint8_t * d = dst;
-+            memcpy(d, p1, w1);
-+            d += w1;
-+            for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
-+                memcpy(d, p, stride1);
-+            }
-+            memcpy(d, p, w3);
-+        }
-+    }
-+}
-+
-+// De-interleaves a rectangle of sand chroma (alternating U,V samples) into separate U and V planes.
-+// _x & _w are in bytes of one component, not of the interleave (i.e. offset = x*2 for U&V)
-+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x = _x * 2;  // byte offset within the interleaved U/V row
-+    const unsigned int w = _w * 2;  // byte width of the interleaved U/V span
-+    const unsigned int mask = stride1 - 1;  // selects the offset inside a stripe row (assumes stride1 is a power of 2 - TODO confirm)
-+
-+#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
-+    if (_x == 0) {  // left-aligned patches of the PW == 1 variant are handed to the asm routine
-+        ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
-+                                     src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if ((x & ~mask) == ((x + w) & ~mask)) {
-+        // All in one sand stripe
-+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;  // first source byte of the first row
-+        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
-+            pixel * du = (pixel *)dst_u;
-+            pixel * dv = (pixel *)dst_v;
-+            const pixel * p = (const pixel *)p1;
-+            for (unsigned int k = 0; k < w; k += 2 * PW) {  // k counts interleaved bytes: one U and one V per step
-+                *du++ = *p++;
-+                *dv++ = *p++;
-+            }
-+        }
-+    }
-+    else
-+    {
-+        // Two+ stripe
-+        const unsigned int sstride = stride1 * stride2;  // bytes from one stripe to the next
-+        const unsigned int sstride_p = (sstride - stride1) / PW;  // pixel step from the end of a stripe row to the same row of the next stripe
-+
-+        const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        const uint8_t * p2 = p1 + sstride - (x & mask);  // start of the same row in the following stripe
-+        const unsigned int w1 = stride1 - (x & mask);  // bytes taken from the first (partial) stripe
-+        const unsigned int w3 = (x + w) & mask;  // bytes taken from the last (partial) stripe
-+        const unsigned int w2 = w - (w1 + w3);  // bytes covered by the whole stripes in between
-+
-+        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
-+            unsigned int j;
-+            const pixel * p = (const pixel *)p1;
-+            pixel * du = (pixel *)dst_u;
-+            pixel * dv = (pixel *)dst_v;
-+            for (unsigned int k = 0; k < w1; k += 2 * PW) {  // leading partial stripe
-+                *du++ = *p++;
-+                *dv++ = *p++;
-+            }
-+            for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {  // whole stripes
-+                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
-+                    *du++ = *p++;
-+                    *dv++ = *p++;
-+                }
-+            }
-+            for (unsigned int k = 0; k < w3; k += 2 * PW) {  // trailing partial stripe, p already points at it
-+                *du++ = *p++;
-+                *dv++ = *p++;
-+            }
-+        }
-+    }
-+}
-+// Interleaves separate U and V planes into a rectangle of sand chroma; _x & _w are in bytes of one component
-+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
-+                             unsigned int stride1, unsigned int stride2,
-+                             const uint8_t * src_u, const unsigned int src_stride_u,
-+                             const uint8_t * src_v, const unsigned int src_stride_v,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x = _x * 2;  // byte offset within the interleaved U/V row
-+    const unsigned int w = _w * 2;  // byte width of the interleaved U/V span
-+    const unsigned int mask = stride1 - 1;  // selects the offset inside a stripe row (assumes stride1 is a power of 2 - TODO confirm)
-+    if ((x & ~mask) == ((x + w) & ~mask)) {
-+        // All in one sand stripe
-+        uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
-+            const pixel * su = (const pixel *)src_u;
-+            const pixel * sv = (const pixel *)src_v;
-+            pixel * p = (pixel *)p1;
-+            for (unsigned int k = 0; k < w; k += 2 * PW) {  // k counts interleaved bytes: one U and one V per step
-+                *p++ = *su++;
-+                *p++ = *sv++;
-+            }
-+        }
-+    }
-+    else
-+    {
-+        // Two+ stripe
-+        const unsigned int sstride = stride1 * stride2;  // bytes from one stripe to the next
-+        const unsigned int sstride_p = (sstride - stride1) / PW;  // pixel step from the end of a stripe row to the same row of the next stripe
-+
-+        // p1/p2 point into the destination and are written through, so they must not be pointers to const
-+        uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+        uint8_t * p2 = p1 + sstride - (x & mask);  // start of the same row in the following stripe
-+        const unsigned int w1 = stride1 - (x & mask);  // bytes written to the first (partial) stripe
-+        const unsigned int w3 = (x + w) & mask;  // bytes written to the last (partial) stripe
-+        const unsigned int w2 = w - (w1 + w3);  // bytes covered by the whole stripes in between
-+        for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
-+            unsigned int j;
-+            const pixel * su = (const pixel *)src_u;
-+            const pixel * sv = (const pixel *)src_v;
-+            pixel * p = (pixel *)p1;
-+            for (unsigned int k = 0; k < w1; k += 2 * PW) {  // leading partial stripe
-+                *p++ = *su++;
-+                *p++ = *sv++;
-+            }
-+            for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {  // whole stripes
-+                for (unsigned int k = 0; k < stride1; k += 2 * PW) {
-+                    *p++ = *su++;
-+                    *p++ = *sv++;
-+                }
-+            }
-+            for (unsigned int k = 0; k < w3; k += 2 * PW) {  // trailing partial stripe, p already points at it
-+                *p++ = *su++;
-+                *p++ = *sv++;
-+            }
-+        }
-+    }
-+}
-+
-+
-+#undef pixel
-+#undef STRCAT
-+#undef FUNC
-+
-diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
-new file mode 100644
-index 0000000000..ed0261b02f
---- /dev/null
-+++ b/libavutil/rpi_sand_fns.c
-@@ -0,0 +1,353 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#include "config.h"
-+#include <stdint.h>
-+#include <string.h>
-+#include "rpi_sand_fns.h"
-+#include "avassert.h"
-+#include "frame.h"
-+
-+#if ARCH_ARM && HAVE_NEON
-+#include "arm/rpi_sand_neon.h"
-+#define HAVE_SAND_ASM 1
-+#else
-+#define HAVE_SAND_ASM 0
-+#endif
-+
-+#define PW 1
-+#include "rpi_sand_fn_pw.h"
-+#undef PW
-+
-+#define PW 2
-+#include "rpi_sand_fn_pw.h"
-+#undef PW
-+
-+#if 1
-+// Simple round: dst[i] = min(255, (src[i] + half) >> shr) for n 16-bit samples; saturates instead of wrapping
-+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
-+{
-+    const unsigned int rnd = (1 << shr) >> 1;  // half of the discarded range, 0 when shr == 0
-+    const uint16_t * src = (const uint16_t *)_src;
-+    for (; n != 0; --n) {
-+        const unsigned int v = (*src++ + rnd) >> shr;  // rounding can carry past 255, e.g. (1023 + 2) >> 2 == 256
-+        *dst++ = v > 255 ? 255 : v;
-+    }
-+}
-+#else
-+// Dithered variation: the bits dropped from one sample are carried into the next; also saturates at 255
-+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
-+{
-+    unsigned int rnd = (1 << shr) >> 1;
-+    const unsigned int mask = ((1 << shr) - 1);  // the low bits that the shift discards
-+    const uint16_t * src = (const uint16_t *)_src;
-+
-+    for (; n != 0; --n) {
-+        rnd = *src++ + (rnd & mask);
-+        *dst++ = (rnd >> shr) > 255 ? 255 : (rnd >> shr);
-+    }
-+}
-+#endif
-+
-+// Fetches a single patch - offscreen fixup not done here
-+// w <= stride1, unclipped
-+// _x & _w in pixels, strides in bytes; each 32-bit word packs 3 10-bit samples (bits 0-9, 10-19, 20-29)
-+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
-+    const unsigned int xskip0 = _x - (x0 >> 2) * 3;  // samples of the first word that lie left of _x
-+    const unsigned int x1 = ((_x + _w) / 3) * 4;  // byte offset of the word holding the first sample past the patch
-+    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;  // samples of that last word that are still inside the patch
-+    const unsigned int mask = stride1 - 1;
-+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
-+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
-+
-+#if HAVE_SAND_ASM
-+    if (_x == 0) {
-+        ff_rpi_sand30_lines_to_planar_y16(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if (x0 == x1) {  // Partial single word xfer: only samples xskip0..xrem1-1 of one word per row
-+        for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
-+            for (unsigned int k = xskip0; k < xrem1; ++k)
-+                ((uint16_t *)dst)[k - xskip0] = (*(const uint32_t *)p0 >> (k * 10)) & 0x3ff;
-+        return;
-+    }
-+
-+    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
-+    {
-+        unsigned int x = x0;
-+        const uint32_t * p = (const uint32_t *)p0;
-+        uint16_t * d = (uint16_t *)dst;
-+
-+        if (xskip0 != 0) {  // leading partial word
-+            const uint32_t p3 = *p++;
-+
-+            if (xskip0 == 1)
-+                *d++ = (p3 >> 10) & 0x3ff;
-+            *d++ = (p3 >> 20) & 0x3ff;
-+
-+            if (((x += 4) & mask) == 0)  // crossed into the next stripe
-+                p += slice_inc;
-+        }
-+
-+        while (x != x1) {  // whole words
-+            const uint32_t p3 = *p++;
-+            *d++ = p3 & 0x3ff;
-+            *d++ = (p3 >> 10) & 0x3ff;
-+            *d++ = (p3 >> 20) & 0x3ff;
-+
-+            if (((x += 4) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        if (xrem1 != 0) {  // trailing partial word
-+            const uint32_t p3 = *p;
-+
-+            *d++ = p3 & 0x3ff;
-+            if (xrem1 == 2)
-+                *d++ = (p3 >> 10) & 0x3ff;
-+        }
-+    }
-+}
-+
-+// Chroma counterpart: 3 U/V pairs are packed in 2 words as u0 v0 u1 | v1 u2 v2; _x & _w count U/V pairs
-+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x0 = (_x / 3) * 8; // Byte offset of the word
-+    const unsigned int xskip0 = _x - (x0 >> 3) * 3;  // pairs of the first word pair that lie left of _x
-+    const unsigned int x1 = ((_x + _w) / 3) * 8;  // byte offset of the word pair holding the first pair past the patch
-+    const unsigned int xrem1 = _x + _w - (x1 >> 3) * 3;  // pairs of that last word pair that are still inside the patch
-+    const unsigned int mask = stride1 - 1;
-+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
-+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
-+
-+#if HAVE_SAND_ASM
-+    if (_x == 0) {
-+        ff_rpi_sand30_lines_to_planar_c16(dst_u, dst_stride_u, dst_v, dst_stride_v,
-+                                       src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if (x0 == x1) {  // Partial single word-pair xfer: sample s of the pair is bits (s % 3) * 10.. of word s / 3
-+        for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
-+            for (unsigned int s = xskip0 * 2; s < xrem1 * 2; s += 2) {  // s = index of the U sample, V is s + 1
-+                ((uint16_t *)dst_u)[s / 2 - xskip0] = (((const uint32_t *)p0)[s / 3] >> ((s % 3) * 10)) & 0x3ff;
-+                ((uint16_t *)dst_v)[s / 2 - xskip0] = (((const uint32_t *)p0)[(s + 1) / 3] >> (((s + 1) % 3) * 10)) & 0x3ff;
-+            }
-+        return;
-+    }
-+
-+    for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p0 += stride1)
-+    {
-+        unsigned int x = x0;
-+        const uint32_t * p = (const uint32_t *)p0;
-+        uint16_t * du = (uint16_t *)dst_u;
-+        uint16_t * dv = (uint16_t *)dst_v;
-+        if (xskip0 != 0) {  // leading partial word pair
-+            const uint32_t p3a = *p++;
-+            const uint32_t p3b = *p++;
-+            if (xskip0 == 1)
-+            {
-+                *du++ = (p3a >> 20) & 0x3ff;
-+                *dv++ = (p3b >>  0) & 0x3ff;
-+            }
-+            *du++ = (p3b >> 10) & 0x3ff;
-+            *dv++ = (p3b >> 20) & 0x3ff;
-+
-+            if (((x += 8) & mask) == 0)  // crossed into the next stripe
-+                p += slice_inc;
-+        }
-+
-+        while (x != x1) {  // whole word pairs
-+            const uint32_t p3a = *p++;
-+            const uint32_t p3b = *p++;
-+
-+            *du++ = p3a & 0x3ff;
-+            *dv++ = (p3a >> 10) & 0x3ff;
-+            *du++ = (p3a >> 20) & 0x3ff;
-+            *dv++ = p3b & 0x3ff;
-+            *du++ = (p3b >> 10) & 0x3ff;
-+            *dv++ = (p3b >> 20) & 0x3ff;
-+
-+            if (((x += 8) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        if (xrem1 != 0) {  // trailing partial word pair
-+            const uint32_t p3a = *p++;
-+            const uint32_t p3b = *p++;
-+            *du++ = p3a & 0x3ff;
-+            *dv++ = (p3a >> 10) & 0x3ff;
-+            if (xrem1 == 2)
-+            {
-+                *du++ = (p3a >> 20) & 0x3ff;
-+                *dv++ = p3b & 0x3ff;
-+            }
-+        }
-+    }
-+}
-+
-+
-+// Narrows a sand plane of 16-bit samples to 8-bit samples (each sample shifted right by shr); w/h in pixels
-+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-+                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
-+                         unsigned int w, unsigned int h, const unsigned int shr)
-+{
-+    const unsigned int n = dst_stride1 / 2;  // samples converted per cpy16_to_8 call: half a dst stripe row
-+    unsigned int j;  // dst x position in pixels, advanced one dst stripe at a time
-+
-+    // This is true for our current layouts
-+    av_assert0(dst_stride1 == src_stride1);
-+
-+    // As we have the same stride1 for src & dest and src is wider than dest
-+    // then if we loop on src we can always write contiguously to dest
-+    // We make no effort to copy an exact width - round up to nearest src stripe
-+    // as we will always have storage in dest for that
-+
-+#if ARCH_ARM && HAVE_NEON
-+    if (shr == 3 && src_stride1 == 128) {  // only this shift/stride combination is handed to the NEON routine
-+        for (j = 0; j + n < w; j += dst_stride1) {
-+            uint8_t * d = dst + j * dst_stride2;
-+            const uint8_t * s1 = src + j * 2 * src_stride2;
-+            const uint8_t * s2 = s1 + src_stride1 * src_stride2;
-+
-+            ff_rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
-+        }
-+    }
-+    else
-+#endif
-+    {
-+        for (j = 0; j + n < w; j += dst_stride1) {  // runs while more than half a dst stripe is still needed
-+            uint8_t * d = dst + j * dst_stride2;  // start of the dst stripe containing pixel j
-+            const uint8_t * s1 = src + j * 2 * src_stride2;  // src stripe feeding the left half of the dst stripe
-+            const uint8_t * s2 = s1 + src_stride1 * src_stride2;  // the following src stripe feeds the right half
-+
-+            for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
-+                cpy16_to_8(d, s1, n, shr);
-+                cpy16_to_8(d + n, s2, n, shr);
-+            }
-+        }
-+    }
-+
-+    // Fix up a trailing dest half stripe
-+    if (j < w) {  // j is wherever the loop above (either variant) stopped
-+        uint8_t * d = dst + j * dst_stride2;
-+        const uint8_t * s1 = src + j * 2 * src_stride2;
-+
-+        for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
-+            cpy16_to_8(d, s1, n, shr);
-+        }
-+    }
-+}
-+// Converts the cropped area of sand frame src into planar frame dst; returns -1 for unsupported format pairs
-+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
-+{
-+    const int w = av_frame_cropped_width(src);
-+    const int h = av_frame_cropped_height(src);
-+    const int x = src->crop_left;
-+    const int y = src->crop_top;
-+
-+    // We will crop as part of the conversion
-+    dst->crop_top = 0;
-+    dst->crop_left = 0;
-+    dst->crop_bottom = 0;
-+    dst->crop_right = 0;
-+
-+    switch (src->format){  // outer switch: source layout, inner switches: requested destination format
-+        case AV_PIX_FMT_SAND128:
-+        case AV_PIX_FMT_RPI4_8:
-+            switch (dst->format){
-+                case AV_PIX_FMT_YUV420P:
-+                    av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                             src->data[0],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x, y, w, h);
-+                    av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
-+                                             dst->data[2], dst->linesize[2],
-+                                             src->data[1],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x/2, y/2,  w/2, h/2);  // chroma is passed at half the luma position and size (integer division)
-+                    break;
-+                default:
-+                    return -1;
-+            }
-+            break;
-+        case AV_PIX_FMT_SAND64_10:
-+            switch (dst->format){
-+                case AV_PIX_FMT_YUV420P10:
-+                    av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
-+                                             src->data[0],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x*2, y, w*2, h);  // x and w doubled: the 16-bit functions take them scaled by the sample size
-+                    av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
-+                                             dst->data[2], dst->linesize[2],
-+                                             src->data[1],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x, y/2,  w, h/2);  // x and w unchanged here: halved for chroma, doubled for 16 bits
-+                    break;
-+                default:
-+                    return -1;
-+            }
-+            break;
-+        case AV_PIX_FMT_RPI4_10:
-+            switch (dst->format){
-+                case AV_PIX_FMT_YUV420P10:
-+                    av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
-+                                             src->data[0],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x, y, w, h);  // the sand30 functions take x and w in pixels
-+                    av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
-+                                             dst->data[2], dst->linesize[2],
-+                                             src->data[1],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x/2, y/2, w/2, h/2);
-+                    break;
-+                default:
-+                    return -1;
-+            }
-+            break;
-+        default:
-+            return -1;
-+    }
-+
-+    return av_frame_copy_props(dst, src);  // the result of the property copy is the function's return value
-+}
-diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
-new file mode 100644
-index 0000000000..634b55e800
---- /dev/null
-+++ b/libavutil/rpi_sand_fns.h
-@@ -0,0 +1,183 @@
-+/*
-+Copyright (c) 2018 Raspberry Pi (Trading) Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: John Cox
-+*/
-+
-+#ifndef AVUTIL_RPI_SAND_FNS
-+#define AVUTIL_RPI_SAND_FNS
-+
-+#include "libavutil/frame.h"
-+
-+// For all these fns _x & _w are measured as coord * PW
-+// For the chroma (_c) fns coords are in chroma pels (so luma / 2)
-+// Strides are in bytes
-+
-+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
-+                             unsigned int stride1, unsigned int stride2,
-+                             const uint8_t * src_u, const unsigned int src_stride_u,
-+                             const uint8_t * src_v, const unsigned int src_stride_v,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
-+                             unsigned int stride1, unsigned int stride2,
-+                             const uint8_t * src_u, const unsigned int src_stride_u,
-+                             const uint8_t * src_v, const unsigned int src_stride_v,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+void av_rpi_sand30_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
-+                             uint8_t * dst_v, const unsigned int dst_stride_v,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
-+
-+
-+// w/h in pixels
-+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-+                         const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
-+                         unsigned int w, unsigned int h, const unsigned int shr);
-+
-+
-+// dst must contain required pixel format & allocated data buffers
-+// Cropping on the src buffer will be honoured and dst crop will be set to zero
-+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
-+
-+// First-level stride of a sand frame: linesize[0], or the constant 128 when RPI_ZC_SAND128_ONLY is defined
-+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
-+{
-+#ifdef RPI_ZC_SAND128_ONLY
-+    // If we are sure we only support 128 byte sand formats replace the
-+    // var with a constant which should allow for better optimisation
-+    return 128;
-+#else
-+    return frame->linesize[0];
-+#endif
-+}
-+// Second-level stride of a sand frame, which these frames carry in linesize[3]
-+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
-+{
-+    return frame->linesize[3];
-+}
-+
-+// Non-zero if format lies in the pixel format range AV_PIX_FMT_SAND128..AV_PIX_FMT_RPI4_10 (inclusive)
-+static inline int av_rpi_is_sand_format(const int format)
-+{
-+    return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_RPI4_10);  // NOTE(review): relies on the sand formats being contiguous in the enum - confirm
-+}
-+// Non-zero if the frame's format passes av_rpi_is_sand_format()
-+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
-+{
-+    return av_rpi_is_sand_format(frame->format);
-+}
-+// Non-zero if the frame's format is AV_PIX_FMT_SAND128 or AV_PIX_FMT_RPI4_8
-+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
-+{
-+    return (frame->format == AV_PIX_FMT_SAND128 || frame->format == AV_PIX_FMT_RPI4_8);
-+}
-+// Non-zero if the frame's format lies in the range AV_PIX_FMT_SAND64_10..AV_PIX_FMT_SAND64_16 (inclusive)
-+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
-+{
-+    return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);  // NOTE(review): relies on enum ordering - confirm
-+}
-+// Non-zero if the frame's format is AV_PIX_FMT_RPI4_10
-+static inline int av_rpi_is_sand30_frame(const AVFrame * const frame)
-+{
-+    return (frame->format == AV_PIX_FMT_RPI4_10);
-+}
-+// Left shift applied to an x coordinate by the offset helpers: 0 for sand8 frames, 1 for everything else
-+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
-+{
-+    return av_rpi_is_sand8_frame(frame) ? 0 : 1;  // NOTE(review): also 1 for AV_PIX_FMT_RPI4_10 - confirm callers never rely on it for that format
-+}
-+
-+// If x is measured in bytes (not pixels) then this works for sand64_16 as
-+// well as sand128 - but in the general case we work that out
-+// Returns the byte offset of luma position (x_y, y), with x_y in pixels
-+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
-+{
-+    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+    const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);  // pixel x shifted by 0 (sand8) or 1 (otherwise)
-+    const unsigned int x1 = x & (stride1 - 1);  // low bits of x (assumes stride1 is a power of 2 - TODO confirm)
-+    const unsigned int x2 = x ^ x1;  // x with those low bits cleared
-+
-+    return x1 + stride1 * y + stride2 * x2;
-+}
-+// Returns the byte offset of chroma position (x_c, y_c); x gets one extra shift compared with the luma helper
-+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
-+{
-+    const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+    const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+    const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);  // presumably the +1 covers the U/V interleave - confirm
-+    const unsigned int x1 = x & (stride1 - 1);  // low bits of x (assumes stride1 is a power of 2 - TODO confirm)
-+    const unsigned int x2 = x ^ x1;  // x with those low bits cleared
-+
-+    return x1 + stride1 * y_c + stride2 * x2;
-+}
-+// Pointer into data[0] at the offset av_rpi_sand_frame_off_y() gives for (x, y)
-+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
-+{
-+    return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
-+}
-+// Pointer into data[1] at the offset av_rpi_sand_frame_off_c() gives for (x, y)
-+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
-+{
-+    return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
-+}
-+
-+#endif
-+
-
-From 89b8d6ac2a886749d4594656083753e682de05a7 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 11:36:47 +0100
-Subject: [PATCH 003/136] Add aarch64 asm sand conv functions
-
-Many thanks to eiler.mike@gmail.com (Michael Eiler) for these
-optimizations
----
- libavutil/aarch64/Makefile        |   2 +
- libavutil/aarch64/rpi_sand_neon.S | 676 ++++++++++++++++++++++++++++++
- libavutil/aarch64/rpi_sand_neon.h |  55 +++
- libavutil/rpi_sand_fn_pw.h        |   4 +-
- libavutil/rpi_sand_fns.c          |   3 +
- 5 files changed, 738 insertions(+), 2 deletions(-)
- create mode 100644 libavutil/aarch64/rpi_sand_neon.S
- create mode 100644 libavutil/aarch64/rpi_sand_neon.h
-
-diff --git a/libavutil/aarch64/Makefile b/libavutil/aarch64/Makefile
-index eba0151337..1b44beab39 100644
---- a/libavutil/aarch64/Makefile
-+++ b/libavutil/aarch64/Makefile
-@@ -4,3 +4,5 @@ OBJS += aarch64/cpu.o                                                 \
- 
- NEON-OBJS += aarch64/float_dsp_neon.o                                 \
-              aarch64/tx_float_neon.o                                  \
-+             aarch64/rpi_sand_neon.o                                  \
-+
-diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
-new file mode 100644
-index 0000000000..cdcf71ee67
---- /dev/null
-+++ b/libavutil/aarch64/rpi_sand_neon.S
-@@ -0,0 +1,676 @@
-+/*
-+Copyright (c) 2021 Michael Eiler
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: Michael Eiler <eiler.mike@gmail.com>
-+*/
-+
-+#include "asm.S"
-+
-+// void ff_rpi_sand8_lines_to_planar_y8(
-+//   uint8_t * dest,            : x0
-+//   unsigned int dst_stride,   : w1
-+//   const uint8_t * src,       : x2
-+//   unsigned int src_stride1,  : w3, always 128
-+//   unsigned int src_stride2,  : w4
-+//   unsigned int _x,           : w5, not read (register reused as a counter)
-+//   unsigned int y,            : w6
-+//   unsigned int _w,           : w7
-+//   unsigned int h);           : [sp, #0]
-+// Copies h rows of _w bytes, starting at row y and column 0 of the sand plane, to dest
-+function ff_rpi_sand8_lines_to_planar_y8, export=1
-+    // w15 contains the number of rows we need to process
-+    ldr w15, [sp, #0]
-+
-+    // w8 will contain the number of full blocks per row
-+    // w8 = floor(_w/stride1), derived from _w (w7) so that it agrees with w9
-+    // stride1 is assumed to always be 128
-+    mov w8, w7
-+    lsr w8, w8, #7
-+
-+    // in case the width of the image is not a multiple of 128, there will
-+    // be an incomplete block at the end of every row
-+    // w9 contains the number of pixels stored within this block
-+    // w9 = _w - w8 * 128
-+    lsl w9, w8, #7
-+    sub w9, w7, w9
-+    // w1 = dst_stride - _w, the dst bytes skipped at the end of every row
-+    sub w1, w1, w7
-+
-+    // value added to the src pointer after reading a complete block; it moves
-+    // the address to the start of the next block: w10 = stride2 * stride1 - stride1
-+    lsl w10, w4, #7
-+    sub w10, w10, #128
-+
-+    // w11 is the row offset, meaning the start offset of the first block of every column
-+    // it starts at y * stride1 and is increased with stride1 within every iteration of the row_loop
-+    lsl w11, w6, #7
-+
-+    // w12 = 0, processed row count
-+    eor w12, w12, w12
-+row_loop:
-+    // start of the first block within the current row
-+    // x13 = row offset + src
-+    mov x13, x2
-+    add x13, x13, x11
-+
-+    // w14 = 0, processed block count
-+    eor w14, w14, w14
-+
-+    cmp w8, #0
-+    beq no_main_y8
-+
-+block_loop:
-+    // copy 128 bytes (a full block) into the vector registers v0-v7 and increase the src address by 128
-+    // fortunately these aren't callee saved ones, meaning we don't need to backup them
-+    ld1 { v0.16b,  v1.16b,  v2.16b,  v3.16b}, [x13], #64
-+    ld1 { v4.16b,  v5.16b,  v6.16b,  v7.16b}, [x13], #64
-+
-+    // write these registers back to the destination vector and increase the dst address by 128
-+    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
-+    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x0], #64
-+
-+    // move the source register to the beginning of the next block (x13 = src + block offset)
-+    add x13, x13, x10
-+    // increase the block counter
-+    add w14, w14, #1
-+
-+    // continue with the block_loop if we haven't copied all full blocks yet
-+    cmp w8, w14
-+    bgt block_loop
-+
-+    // handle the last block at the end of each row
-+    // at most 127 byte values copied from src to dst
-+no_main_y8:
-+    eor w5, w5, w5 // i = 0
-+incomplete_block_loop_y8:
-+    cmp w5, w9
-+    bge incomplete_block_loop_end_y8
-+
-+    ldrb w6, [x13]
-+    strb w6, [x0]
-+    add x13, x13, #1
-+    add x0, x0, #1
-+
-+    add w5, w5, #1
-+    b incomplete_block_loop_y8
-+incomplete_block_loop_end_y8:
-+    // skip the dst padding so that the next row starts dst_stride after this one
-+    add x0, x0, w1, sxtw
-+    // increase the row offset by 128 (stride1)
-+    add w11, w11, #128
-+    // increment the row counter
-+    add w12, w12, #1
-+
-+    // process the next row if we haven't finished yet
-+    cmp w15, w12
-+    bgt row_loop
-+
-+    ret
-+endfunc
-+
-+
-+
-+// void ff_rpi_sand8_lines_to_planar_c8(
-+//   uint8_t * dst_u,           : x0
-+//   unsigned int dst_stride_u, : w1 == width
-+//   uint8_t * dst_v,           : x2
-+//   unsigned int dst_stride_v, : w3 == width
-+//   const uint8_t * src,       : x4
-+//   unsigned int stride1,      : w5 == 128
-+//   unsigned int stride2,      : w6
-+//   unsigned int _x,           : w7
-+//   unsigned int y,            : [sp, #0]
-+//   unsigned int _w,           : [sp, #8]
-+//   unsigned int h);           : [sp, #16]
-+
-+function ff_rpi_sand8_lines_to_planar_c8, export=1
-+    // w7 = width
-+    ldr w7, [sp, #8]
-+
-+    // w15 contains the number of rows we need to process
-+    // counts down
-+    ldr w15, [sp, #16]
-+
-+    // number of full blocks, w8 = _w / (stride1 >> 1) == _w / 64 == _w >> 6
-+    mov w8, w7
-+    lsr w8, w8, #6
-+
-+    // number of pixels in block at the end of every row
-+    // w9 = _w - (w8 * 64)
-+    lsl w9, w8, #6
-+    sub w9, w7, w9
-+
-+    // Skip at the end of the line to account for stride
-+    sub w12, w1, w7
-+
-+    // address delta to the beginning of the next block
-+    // w10 = (stride2 * stride1 - stride1) = stride2 * 128 - 128
-+    lsl w10, w6, #7
-+    sub w10, w10, #128
-+
-+    // w11 = row address start offset = 0
-+    eor w11, w11, w11
-+
-+row_loop_c8:
-+    // start of the first block within the current row
-+    // x13 = row offset + src
-+    mov x13, x4
-+    add x13, x13, x11
-+
-+    // w14 = 0, processed block count
-+    eor w14, w14, w14
-+
-+    cmp w8, #0
-+    beq no_main_c8
-+
-+block_loop_c8:
-+    // load the full block -> 128 bytes, the block contains 64 interleaved U and V values 
-+    ld2 { v0.16b,  v1.16b }, [x13], #32
-+    ld2 { v2.16b,  v3.16b }, [x13], #32
-+    ld2 { v4.16b,  v5.16b }, [x13], #32
-+    ld2 { v6.16b,  v7.16b }, [x13], #32
-+
-+    // swap register so that we can write them out with a single instruction
-+    mov v16.16b, v1.16b
-+    mov v17.16b, v3.16b
-+    mov v18.16b, v5.16b
-+    mov v1.16b, v2.16b
-+    mov v2.16b, v4.16b
-+    mov v3.16b, v6.16b
-+    mov v4.16b, v16.16b
-+    mov v5.16b, v17.16b
-+    mov v6.16b, v18.16b
-+
-+    st1 { v0.16b,  v1.16b,  v2.16b,  v3.16b }, [x0], #64
-+    st1 { v4.16b,  v5.16b,  v6.16b,  v7.16b }, [x2], #64
-+
-+    // increment the block counter and move src to the beginning of the next block
-+    add w14, w14, #1
-+    add x13, x13, x10
-+    
-+    // jump to block_loop_c8 iff the block count is smaller than the number of full blocks
-+    cmp w8, w14
-+    bgt block_loop_c8
-+
-+no_main_c8:
-+    // handle incomplete block at the end of every row
-+    eor w5, w5, w5 // w5 = 0: count of U/V pairs copied from the incomplete block
-+incomplete_block_loop_c8:
-+    cmp w5, w9
-+    bge incomplete_block_loop_end_c8
-+
-+    ldrb w1, [x13]
-+    strb w1, [x0]
-+    add x13, x13, #1
-+
-+    ldrb w1, [x13]
-+    strb w1, [x2]
-+    add x13, x13, #1
-+
-+    add x0, x0, #1
-+    add x2, x2, #1
-+
-+    add w5, w5, #1
-+    b incomplete_block_loop_c8
-+incomplete_block_loop_end_c8:
-+
-+    // increase row_offset by stride1
-+    add w11, w11, #128
-+    add x0, x0, w12, sxtw
-+    add x2, x2, w12, sxtw
-+
-+    // jump to row_loop_c8 iff rows remain (w15 counts down from the height)
-+    subs w15, w15, #1
-+    bgt row_loop_c8
-+
-+    ret
-+endfunc
-+
-+//void ff_rpi_sand30_lines_to_planar_y16(
-+//  uint8_t * dest,             // [x0]
-+//  unsigned int dst_stride,    // [w1] -> assumed to be equal to _w
-+//  const uint8_t * src,        // [x2]
-+//  unsigned int src_stride1,   // [w3] -> 128
-+//  unsigned int src_stride2,   // [w4]
-+//  unsigned int _x,            // [w5]
-+//  unsigned int y,             // [w6]
-+//  unsigned int _w,            // [w7]
-+//  unsigned int h);            // [sp, #0]
-+
-+function ff_rpi_sand30_lines_to_planar_y16, export=1
-+    stp x19, x20, [sp, #-48]!
-+    stp x21, x22, [sp, #16]
-+    stp x23, x24, [sp, #32]
-+
-+    // w6 = argument h
-+    ldr w6, [sp, #48]
-+
-+    // slice_inc = ((stride2 - 1) * stride1)
-+    mov w5, w4
-+    sub w5, w5, #1
-+    lsl w5, w5, #7
-+
-+    // total number of bytes per row = (width / 3) * 4
-+    mov w8, w7
-+    mov w9, #3
-+    udiv w8, w8, w9
-+    lsl w8, w8, #2
-+
-+    // number of full 128 byte blocks to be processed
-+    mov w9, #96
-+    udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96
-+
-+    // w10 = number of full integers to process (4 bytes)
-+    // w11 = remaining zero to two 10bit values still to copy over
-+    mov w12, #96
-+    mul w12, w9, w12
-+    sub w12, w7, w12  // width - blocks*96 = remaining points per row
-+    mov w11, #3
-+    udiv w10, w12, w11 // full integers to process = w12 / 3 
-+    mul w11, w10, w11  // #integers *3
-+    sub w11, w12, w11  // remaining 0-2 points = remaining points - integers*3
-+
-+    // increase w9 by one if w10+w11 is not zero, and decrease the row count by one
-+    // this is to efficiently copy incomplete blocks at the end of the rows
-+    // the last row is handled explicitly to avoid writing out of bounds
-+    add w22, w10, w11
-+    cmp w22, #0
-+    cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise
-+    add w9, w9, w22
-+    sub w6, w6, #1
-+
-+    // store the number of bytes in w20 which we copy too much for every row
-+    // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values)
-+    mov w20, #96*2
-+    mul w20, w20, w9
-+    sub w20, w1, w20
-+
-+    mov w23, #0 // flag to check whether the last line had already been processed
-+    
-+    // bitmask to clear the upper 6 bits of the result values
-+    mov x19, #0x03ff03ff03ff03ff
-+    dup v22.2d, x19
-+
-+    // row counter = 0
-+    eor w12, w12, w12
-+row_loop_y16:
-+    cmp w12, w6               // jump to row_loop_y16_fin if we processed all rows
-+    bge row_loop_y16_fin
-+
-+    mov x13, x2               // row src
-+    eor w14, w14, w14         // full block counter
-+block_loop_y16:
-+    cmp w14, w9
-+    bge block_loop_y16_fin
-+
-+    // load 64 bytes
-+    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
-+   
-+    // process v0 and v1
-+    xtn v16.4h, v0.4s
-+    ushr v0.4s, v0.4s, #10
-+    xtn v17.4h, v0.4s
-+    ushr v0.4s, v0.4s, #10
-+    xtn v18.4h, v0.4s
-+   
-+    xtn2 v16.8h, v1.4s
-+    and v16.16b, v16.16b, v22.16b
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v17.8h, v1.4s
-+    and v17.16b, v17.16b, v22.16b
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v18.8h, v1.4s
-+    and v18.16b, v18.16b, v22.16b
-+
-+    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
-+
-+    // process v2 and v3
-+    xtn v23.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v24.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v25.4h, v2.4s
-+    
-+    xtn2 v23.8h, v3.4s
-+    and v23.16b, v23.16b, v22.16b
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v24.8h, v3.4s
-+    and v24.16b, v24.16b, v22.16b
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v25.8h, v3.4s
-+    and v25.16b, v25.16b, v22.16b
-+
-+    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
-+
-+    // load the second half of the block -> 64 bytes into registers v4-v7
-+    ld1 { v4.4s,  v5.4s,  v6.4s,  v7.4s }, [x13], #64
-+    
-+    // process v4 and v5
-+    xtn v16.4h, v4.4s
-+    ushr v4.4s, v4.4s, #10
-+    xtn v17.4h, v4.4s
-+    ushr v4.4s, v4.4s, #10
-+    xtn v18.4h, v4.4s
-+   
-+    xtn2 v16.8h, v5.4s 
-+    and v16.16b, v16.16b, v22.16b
-+    ushr v5.4s, v5.4s, #10
-+    xtn2 v17.8h, v5.4s
-+    and v17.16b, v17.16b, v22.16b
-+    ushr v5.4s, v5.4s, #10
-+    xtn2 v18.8h, v5.4s
-+    and v18.16b, v18.16b, v22.16b
-+
-+    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
-+
-+    // v6 and v7
-+    xtn v23.4h, v6.4s
-+    ushr v6.4s, v6.4s, #10
-+    xtn v24.4h, v6.4s
-+    ushr v6.4s, v6.4s, #10
-+    xtn v25.4h, v6.4s
-+   
-+    xtn2 v23.8h, v7.4s 
-+    and v23.16b, v23.16b, v22.16b
-+    ushr v7.4s, v7.4s, #10
-+    xtn2 v24.8h, v7.4s
-+    and v24.16b, v24.16b, v22.16b
-+    ushr v7.4s, v7.4s, #10
-+    xtn2 v25.8h, v7.4s
-+    and v25.16b, v25.16b, v22.16b
-+
-+    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
-+ 
-+    add x13, x13, x5          // row src += slice_inc
-+    add w14, w14, #1
-+    b block_loop_y16
-+block_loop_y16_fin:
-+
-+    
-+
-+
-+    add x2, x2, #128          // src += stride1 (start of the next row)
-+    add x0, x0, w20, sxtw     // subtract the bytes we copied too much from dst
-+    add w12, w12, #1
-+    b row_loop_y16
-+row_loop_y16_fin:
-+
-+    // check whether we have incomplete blocks at the end of every row
-+    // in that case decrease row block count by one
-+    // change height back to its original value (meaning increase it by 1)
-+    // and jump back to another iteration of row_loop_y16
-+
-+    cmp w23, #1
-+    beq row_loop_y16_fin2 // don't continue here if we already processed the last row
-+    add w6, w6, #1    // increase height to the original value
-+    sub w9, w9, w22   // block count - 1 or 0, depending on the remaining bytes count
-+    mov w23, #1
-+    b row_loop_y16
-+row_loop_y16_fin2:
-+
-+    sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr too far ahead, therefore re-add the difference
-+
-+    // now we've got to handle the last block in the last row
-+    eor w12, w12, w12 // w12 = 0 = counter
-+integer_loop_y16:
-+    cmp w12, w10
-+    bge integer_loop_y16_fin
-+    ldr w14, [x13], #4
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    lsr w14, w14, #10
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    lsr w14, w14, #10
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    add w12, w12, #1
-+    b integer_loop_y16
-+integer_loop_y16_fin:
-+
-+final_values_y16:
-+    // remaining point count = w11
-+    ldr w14, [x13], #4
-+    cmp w11, #0
-+    beq final_values_y16_fin
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+    cmp w11, #1
-+    beq final_values_y16_fin
-+    lsr w14, w14, #10
-+    and w15, w14, #0x3ff
-+    strh w15, [x0], #2
-+final_values_y16_fin:
-+
-+    ldp x23, x24, [sp, #32]
-+    ldp x21, x22, [sp, #16]
-+    ldp x19, x20, [sp], #48
-+    ret
-+endfunc
-+
-+//void ff_rpi_sand30_lines_to_planar_c16(
-+//  uint8_t * dst_u,            // [x0]
-+//  unsigned int dst_stride_u,  // [w1] == _w*2
-+//  uint8_t * dst_v,            // [x2]
-+//  unsigned int dst_stride_v,  // [w3] == _w*2
-+//  const uint8_t * src,        // [x4]
-+//  unsigned int stride1,       // [w5] == 128
-+//  unsigned int stride2,       // [w6] 
-+//  unsigned int _x,            // [w7] == 0
-+//  unsigned int y,             // [sp, #0] == 0
-+//  unsigned int _w,            // [sp, #8] -> w3
-+//  unsigned int h);            // [sp, #16] -> w7
-+
-+.macro rpi_sand30_lines_to_planar_c16_block_half
-+    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
-+
-+    xtn v4.4h, v0.4s
-+    ushr v0.4s, v0.4s, #10
-+    xtn v5.4h, v0.4s
-+    ushr v0.4s, v0.4s, #10
-+    xtn v6.4h, v0.4s
-+    xtn2 v4.8h, v1.4s
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v5.8h, v1.4s
-+    ushr v1.4s, v1.4s, #10
-+    xtn2 v6.8h, v1.4s
-+    and v4.16b, v4.16b, v16.16b
-+    and v5.16b, v5.16b, v16.16b
-+    and v6.16b, v6.16b, v16.16b
-+    st3 { v4.8h, v5.8h, v6.8h }, [sp], #48
-+    
-+    xtn v4.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v5.4h, v2.4s
-+    ushr v2.4s, v2.4s, #10
-+    xtn v6.4h, v2.4s
-+    xtn2 v4.8h, v3.4s
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v5.8h, v3.4s
-+    ushr v3.4s, v3.4s, #10
-+    xtn2 v6.8h, v3.4s
-+    and v4.16b, v4.16b, v16.16b
-+    and v5.16b, v5.16b, v16.16b
-+    and v6.16b, v6.16b, v16.16b
-+    st3 { v4.8h, v5.8h, v6.8h }, [sp]
-+    sub sp, sp, #48
-+.endm
-+
-+function ff_rpi_sand30_lines_to_planar_c16, export=1
-+    stp x19, x20, [sp, #-48]!   // save the callee-saved registers used below
-+    stp x21, x22, [sp, #16]
-+    stp x23, x24, [sp, #32]
-+
-+    ldr w3, [sp, #48+8]    // w3 = width (stack argument _w, offset past the 48 bytes just pushed)
-+    ldr w7, [sp, #48+16]   // w7 = height (stack argument h)
-+
-+    // reserve space on the stack for intermediate results (interleaved U/V halfwords)
-+    sub sp, sp, #256
-+
-+    // number of 128byte blocks per row, w8 = width / 48
-+    mov w9, #48
-+    udiv w8, w3, w9
-+
-+    // remaining pixels (rem_pix) per row, w9 = width - w8 * 48
-+    mul w9, w8, w9
-+    sub w9, w3, w9
-+
-+    // row offset, the beginning of the next row to process
-+    eor w10, w10, w10
-+
-+    // offset to the beginning of the next block, w11 = stride2 * 128 - 128
-+    lsl w11, w6, #7
-+    sub w11, w11, #128
-+
-+    // decrease the height by one and in case of remaining pixels increase the block count by one
-+    sub w7, w7, #1
-+    cmp w9, #0
-+    cset w19, ne    // w19 == 1 iff remaining pixels != 0
-+    add w8, w8, w19
-+
-+    // bytes we have to move dst back by at the end of every row (w21 = dst_stride_u - 96 * blocks)
-+    mov w21, #48*2
-+    mul w21, w21, w8
-+    sub w21, w1, w21
-+
-+    mov w20, #0     // w20 = flag, last row processed
-+
-+    mov x12, #0x03ff03ff03ff03ff   // mask keeping the low 10 bits of every halfword
-+    dup v16.2d, x12
-+
-+    // iterate through rows, row counter = w12 = 0
-+    eor w12, w12, w12
-+row_loop_c16:
-+    cmp w12, w7
-+    bge row_loop_c16_fin
-+
-+    // address of row data = src + row_offset
-+    mov x13, x4
-+    add x13, x13, x10
-+
-+    eor w14, w14, w14      // w14 = 0, processed block count
-+block_loop_c16:
-+    cmp w14, w8
-+    bge block_loop_c16_fin
-+
-+    rpi_sand30_lines_to_planar_c16_block_half
-+
-+    ld2 { v0.8h, v1.8h }, [sp], #32   // deinterleave: even halfwords (U) / odd halfwords (V)
-+    ld2 { v2.8h, v3.8h }, [sp], #32
-+    ld2 { v4.8h, v5.8h }, [sp]
-+    sub sp, sp, #64
-+
-+    st1 { v0.8h }, [x0], #16
-+    st1 { v2.8h }, [x0], #16
-+    st1 { v4.8h }, [x0], #16
-+    st1 { v1.8h }, [x2], #16
-+    st1 { v3.8h }, [x2], #16
-+    st1 { v5.8h }, [x2], #16
-+
-+    rpi_sand30_lines_to_planar_c16_block_half
-+
-+    ld2 { v0.8h, v1.8h }, [sp], #32
-+    ld2 { v2.8h, v3.8h }, [sp], #32
-+    ld2 { v4.8h, v5.8h }, [sp]
-+    sub sp, sp, #64
-+
-+    st1 { v0.8h }, [x0], #16
-+    st1 { v2.8h }, [x0], #16
-+    st1 { v4.8h }, [x0], #16
-+    st1 { v1.8h }, [x2], #16
-+    st1 { v3.8h }, [x2], #16
-+    st1 { v5.8h }, [x2], #16
-+
-+    add x13, x13, x11 // offset to next block
-+    add w14, w14, #1
-+    b block_loop_c16
-+block_loop_c16_fin:
-+
-+    add w10, w10, #128     // row offset += stride1
-+    add w12, w12, #1
-+    add x0, x0, w21, sxtw  // adjust dst pointers by w21 (negative when a row was over-copied)
-+    add x2, x2, w21, sxtw
-+    b row_loop_c16
-+row_loop_c16_fin:
-+
-+    cmp w20, #1
-+    beq row_loop_c16_fin2
-+    mov w20, #1
-+    sub w8, w8, w19 // decrease block count by w19
-+    add w7, w7, #1 // increase height
-+    b row_loop_c16
-+
-+row_loop_c16_fin2:
-+    sub x0, x0, w21, sxtw // undo the end-of-row adjustment applied after the last row
-+    sub x2, x2, w21, sxtw // so that we can write out the few remaining pixels
-+
-+    // last incomplete block to be finished
-+    // read operations are fine, stride2 is more than large enough even if rem_pix is 0
-+    rpi_sand30_lines_to_planar_c16_block_half
-+    ld2 { v0.8h, v1.8h }, [sp], #32   // loaded values are unused; the post-increments
-+    ld2 { v2.8h, v3.8h }, [sp], #32   // only advance sp past the 96 bytes just written
-+    ld2 { v4.8h, v5.8h }, [sp], #32
-+    rpi_sand30_lines_to_planar_c16_block_half
-+    ld2 { v0.8h, v1.8h }, [sp], #32
-+    ld2 { v2.8h, v3.8h }, [sp], #32
-+    ld2 { v4.8h, v5.8h }, [sp]
-+    sub sp, sp, #160       // back to the start of the scratch area
-+
-+    mov x4, sp             // x4 walks the interleaved U/V halfwords on the stack
-+    eor w20, w20, w20      // w20 = 0, remaining-pixel counter
-+rem_pix_c16_loop:
-+    cmp w20, w9
-+    bge rem_pix_c16_fin
-+
-+    ldr w22, [x4], #4      // w22 = one pair: U in bits 0-15, V in bits 16-31
-+    strh w22, [x0], #2     // halfword store: a 32-bit str would write 2 bytes past the U sample
-+    lsr w22, w22, #16
-+    strh w22, [x2], #2     // halfword store: a 32-bit str would write 2 bytes past the V sample
-+
-+    add w20, w20, #1
-+    b rem_pix_c16_loop
-+rem_pix_c16_fin:
-+
-+    add sp, sp, #256       // release the scratch area
-+
-+    ldp x23, x24, [sp, #32]
-+    ldp x21, x22, [sp, #16]
-+    ldp x19, x20, [sp], #48
-+    ret
-+endfunc
-+
-+
-+
-+//void ff_rpi_sand30_lines_to_planar_p010(
-+//  uint8_t * dest,
-+//  unsigned int dst_stride,
-+//  const uint8_t * src,
-+//  unsigned int src_stride1,
-+//  unsigned int src_stride2,
-+//  unsigned int _x,
-+//  unsigned int y,
-+//  unsigned int _w,
-+//  unsigned int h);
-+
-diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
-new file mode 100644
-index 0000000000..b3aa481ea4
---- /dev/null
-+++ b/libavutil/aarch64/rpi_sand_neon.h
-@@ -0,0 +1,55 @@
-+/*
-+Copyright (c) 2021 Michael Eiler
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+    * Redistributions of source code must retain the above copyright
-+      notice, this list of conditions and the following disclaimer.
-+    * Redistributions in binary form must reproduce the above copyright
-+      notice, this list of conditions and the following disclaimer in the
-+      documentation and/or other materials provided with the distribution.
-+    * Neither the name of the copyright holder nor the
-+      names of its contributors may be used to endorse or promote products
-+      derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+
-+Authors: Michael Eiler <eiler.mike@gmail.com>
-+*/
-+
-+#pragma once
-+
-+#ifdef __cplusplus
-+extern "C" {
-+#endif
-+
-+void ff_rpi_sand8_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
-+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
-+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+
-+void ff_rpi_sand8_lines_to_planar_c8(uint8_t * dst_u, unsigned int dst_stride_u,
-+  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src,
-+  unsigned int stride1, unsigned int stride2, unsigned int _x, unsigned int y,
-+  unsigned int _w, unsigned int h);
-+
-+void ff_rpi_sand30_lines_to_planar_y16(uint8_t * dest, unsigned int dst_stride,
-+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
-+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+
-+void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_u,
-+  uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
-+  unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+
-+#ifdef __cplusplus
-+}
-+#endif
-+
-diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
-index 0324f6826d..0d5d203dc3 100644
---- a/libavutil/rpi_sand_fn_pw.h
-+++ b/libavutil/rpi_sand_fn_pw.h
-@@ -54,7 +54,7 @@ void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
-     const unsigned int w = _w;
-     const unsigned int mask = stride1 - 1;
- 
--#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
-+#if PW == 1 && HAVE_SAND_ASM
-     if (_x == 0) {
-         ff_rpi_sand8_lines_to_planar_y8(dst, dst_stride,
-                                      src, stride1, stride2, _x, y, _w, h);
-@@ -106,7 +106,7 @@ void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_strid
-     const unsigned int w = _w * 2;
-     const unsigned int mask = stride1 - 1;
- 
--#if PW == 1 && (HAVE_SAND_ASM || HAVE_SAND_ASM64)
-+#if PW == 1 && HAVE_SAND_ASM
-     if (_x == 0) {
-         ff_rpi_sand8_lines_to_planar_c8(dst_u, dst_stride_u, dst_v, dst_stride_v,
-                                      src, stride1, stride2, _x, y, _w, h);
-diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
-index ed0261b02f..1f543e9357 100644
---- a/libavutil/rpi_sand_fns.c
-+++ b/libavutil/rpi_sand_fns.c
-@@ -37,6 +37,9 @@ Authors: John Cox
- #if ARCH_ARM && HAVE_NEON
- #include "arm/rpi_sand_neon.h"
- #define HAVE_SAND_ASM 1
-+#elif ARCH_AARCH64 && HAVE_NEON
-+#include "aarch64/rpi_sand_neon.h"
-+#define HAVE_SAND_ASM 1
- #else
- #define HAVE_SAND_ASM 0
- #endif
-
-From 247025a42ae09d6c9c5d4128a5e4b288b7b3047c Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 11:56:02 +0100
-Subject: [PATCH 004/136] Add raw encoding for sand
-
----
- libavcodec/raw.c    |  6 +++
- libavcodec/rawenc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++-
- 2 files changed, 96 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/raw.c b/libavcodec/raw.c
-index 1e5b48d1e0..1e689f9ee0 100644
---- a/libavcodec/raw.c
-+++ b/libavcodec/raw.c
-@@ -295,6 +295,12 @@ static const PixelFormatTag raw_pix_fmt_tags[] = {
-     { AV_PIX_FMT_RGB565LE,MKTAG( 3 ,  0 ,  0 ,  0 ) }, /* flipped RGB565LE */
-     { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
- 
-+    /* RPI (Might as well define for everything) */
-+    { AV_PIX_FMT_SAND128,     MKTAG('S', 'A', 'N', 'D') },
-+    { AV_PIX_FMT_RPI4_8,      MKTAG('S', 'A', 'N', 'D') },
-+    { AV_PIX_FMT_SAND64_10,   MKTAG('S', 'N', 'D', 'A') },
-+    { AV_PIX_FMT_RPI4_10,     MKTAG('S', 'N', 'D', 'B') },
-+
-     { AV_PIX_FMT_NONE, 0 },
- };
- 
-diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
-index 8c577006d9..594a77c42a 100644
---- a/libavcodec/rawenc.c
-+++ b/libavcodec/rawenc.c
-@@ -24,6 +24,7 @@
-  * Raw Video Encoder
-  */
- 
-+#include "config.h"
- #include "avcodec.h"
- #include "codec_internal.h"
- #include "encode.h"
-@@ -33,6 +34,10 @@
- #include "libavutil/intreadwrite.h"
- #include "libavutil/imgutils.h"
- #include "libavutil/internal.h"
-+#include "libavutil/avassert.h"
-+#if CONFIG_SAND
-+#include "libavutil/rpi_sand_fns.h"
-+#endif
- 
- static av_cold int raw_encode_init(AVCodecContext *avctx)
- {
-@@ -46,12 +51,95 @@ static av_cold int raw_encode_init(AVCodecContext *avctx)
-     return 0;
- }
- 
-+#if CONFIG_SAND
-+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+                      const AVFrame *frame)
-+{
-+    const int width = av_frame_cropped_width(frame);
-+    const int height = av_frame_cropped_height(frame);
-+    const int x0 = frame->crop_left;
-+    const int y0 = frame->crop_top;
-+    const int size = width * height * 3 / 2;
-+    uint8_t * dst;
-+    int ret;
-+
-+    if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
-+        return ret;
-+
-+    dst = pkt->data;
-+
-+    av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
-+    dst += width * height;
-+    av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
-+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
-+    return 0;
-+}
-+
-+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+                      const AVFrame *frame)
-+{
-+    const int width = av_frame_cropped_width(frame);
-+    const int height = av_frame_cropped_height(frame);
-+    const int x0 = frame->crop_left;
-+    const int y0 = frame->crop_top;
-+    const int size = width * height * 3;
-+    uint8_t * dst;
-+    int ret;
-+
-+    if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
-+        return ret;
-+
-+    dst = pkt->data;
-+
-+    av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
-+    dst += width * height * 2;
-+    av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
-+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
-+    return 0;
-+}
-+
-+static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+                      const AVFrame *frame)
-+{
-+    const int width = av_frame_cropped_width(frame);
-+    const int height = av_frame_cropped_height(frame);
-+    const int x0 = frame->crop_left;
-+    const int y0 = frame->crop_top;
-+    const int size = width * height * 3;
-+    uint8_t * dst;
-+    int ret;
-+
-+    if ((ret = ff_get_encode_buffer(avctx, pkt, size, 0)) < 0)
-+        return ret;
-+
-+    dst = pkt->data;
-+
-+    av_rpi_sand30_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
-+    dst += width * height * 2;
-+    av_rpi_sand30_to_planar_c16(dst, width, dst + width * height / 2, width,
-+                          frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0/2, y0 / 2, width/2, height / 2);
-+    return 0;
-+}
-+#endif
-+
-+
- static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
-                       const AVFrame *frame, int *got_packet)
- {
--    int ret = av_image_get_buffer_size(frame->format,
--                                       frame->width, frame->height, 1);
-+    int ret;
- 
-+#if CONFIG_SAND
-+    if (av_rpi_is_sand_frame(frame)) {
-+        ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) :
-+            av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) :
-+            av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1;
-+        *got_packet = (ret == 0);
-+        return ret;
-+    }
-+#endif
-+
-+    ret = av_image_get_buffer_size(frame->format,
-+                                       frame->width, frame->height, 1);
-     if (ret < 0)
-         return ret;
- 
-
-From ac6961f424b56563dc793b6bc002a8c04cb1bc36 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 12:02:09 +0100
-Subject: [PATCH 005/136] Deal with the lack of trivial sand cropping
-
----
- fftools/ffmpeg.c        |  4 ++--
- fftools/ffmpeg_filter.c |  4 ++--
- libavutil/frame.c       | 11 +++++++++++
- libavutil/frame.h       | 10 ++++++++++
- 4 files changed, 25 insertions(+), 4 deletions(-)
-
-diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
-index d721a5e721..15e084f0b2 100644
---- a/fftools/ffmpeg.c
-+++ b/fftools/ffmpeg.c
-@@ -1993,8 +1993,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref
-                        av_channel_layout_compare(&ifilter->ch_layout, &frame->ch_layout);
-         break;
-     case AVMEDIA_TYPE_VIDEO:
--        need_reinit |= ifilter->width  != frame->width ||
--                       ifilter->height != frame->height;
-+        need_reinit |= ifilter->width  != av_frame_cropped_width(frame) ||
-+                       ifilter->height != av_frame_cropped_height(frame);
-         break;
-     }
- 
-diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
-index 1f5bbf6c4d..f888307762 100644
---- a/fftools/ffmpeg_filter.c
-+++ b/fftools/ffmpeg_filter.c
-@@ -1281,8 +1281,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame)
- 
-     ifilter->format = frame->format;
- 
--    ifilter->width               = frame->width;
--    ifilter->height              = frame->height;
-+    ifilter->width               = av_frame_cropped_width(frame);
-+    ifilter->height              = av_frame_cropped_height(frame);
-     ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
- 
-     ifilter->sample_rate         = frame->sample_rate;
-diff --git a/libavutil/frame.c b/libavutil/frame.c
-index 9545477acc..48621e4098 100644
---- a/libavutil/frame.c
-+++ b/libavutil/frame.c
-@@ -16,6 +16,8 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#include "config.h"
-+
- #include "channel_layout.h"
- #include "avassert.h"
- #include "buffer.h"
-@@ -27,6 +29,9 @@
- #include "mem.h"
- #include "samplefmt.h"
- #include "hwcontext.h"
-+#if CONFIG_SAND
-+#include "rpi_sand_fns.h"
-+#endif
- 
- #if FF_API_OLD_CHANNEL_LAYOUT
- #define CHECK_CHANNELS_CONSISTENCY(frame) \
-@@ -874,6 +879,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
-         (frame->crop_top + frame->crop_bottom) >= frame->height)
-         return AVERROR(ERANGE);
- 
-+#if CONFIG_SAND
-+    // Sand cannot be cropped - do not try
-+    if (av_rpi_is_sand_format(frame->format))
-+        return 0;
-+#endif
-+
-     desc = av_pix_fmt_desc_get(frame->format);
-     if (!desc)
-         return AVERROR_BUG;
-diff --git a/libavutil/frame.h b/libavutil/frame.h
-index 2580269549..3a9d323325 100644
---- a/libavutil/frame.h
-+++ b/libavutil/frame.h
-@@ -957,6 +957,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags);
-  */
- const char *av_frame_side_data_name(enum AVFrameSideDataType type);
- 
-+
-+static inline int av_frame_cropped_width(const AVFrame * const frame)
-+{
-+    return frame->width - (frame->crop_left + frame->crop_right);
-+}
-+static inline int av_frame_cropped_height(const AVFrame * const frame)
-+{
-+    return frame->height - (frame->crop_top + frame->crop_bottom);
-+}
-+
- /**
-  * @}
-  */
-
-From 9a08431f7790507b0374d9585dfc736000c1bd42 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 12:31:16 +0100
-Subject: [PATCH 006/136] Add an unsand filter
-
----
- configure                |   1 +
- libavfilter/Makefile     |   1 +
- libavfilter/allfilters.c |   1 +
- libavfilter/buffersrc.c  |   2 +-
- libavfilter/vf_unsand.c  | 228 +++++++++++++++++++++++++++++++++++++++
- 5 files changed, 232 insertions(+), 1 deletion(-)
- create mode 100644 libavfilter/vf_unsand.c
-
-diff --git a/configure b/configure
-index 27112ced58..7712482bd5 100755
---- a/configure
-+++ b/configure
-@@ -3754,6 +3754,7 @@ tonemap_opencl_filter_deps="opencl const_nan"
- transpose_opencl_filter_deps="opencl"
- transpose_vaapi_filter_deps="vaapi VAProcPipelineCaps_rotation_flags"
- transpose_vulkan_filter_deps="vulkan spirv_compiler"
-+unsand_filter_select="sand"
- unsharp_opencl_filter_deps="opencl"
- uspp_filter_deps="gpl avcodec"
- vaguedenoiser_filter_deps="gpl"
-diff --git a/libavfilter/Makefile b/libavfilter/Makefile
-index b3d3d981dd..c14fc995a0 100644
---- a/libavfilter/Makefile
-+++ b/libavfilter/Makefile
-@@ -518,6 +518,7 @@ OBJS-$(CONFIG_TRANSPOSE_VAAPI_FILTER)        += vf_transpose_vaapi.o vaapi_vpp.o
- OBJS-$(CONFIG_TRANSPOSE_VULKAN_FILTER)       += vf_transpose_vulkan.o vulkan.o vulkan_filter.o
- OBJS-$(CONFIG_TRIM_FILTER)                   += trim.o
- OBJS-$(CONFIG_UNPREMULTIPLY_FILTER)          += vf_premultiply.o framesync.o
-+OBJS-$(CONFIG_UNSAND_FILTER)                 += vf_unsand.o
- OBJS-$(CONFIG_UNSHARP_FILTER)                += vf_unsharp.o
- OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER)         += vf_unsharp_opencl.o opencl.o \
-                                                 opencl/unsharp.o
-diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
-index d7db46c2af..b990a00152 100644
---- a/libavfilter/allfilters.c
-+++ b/libavfilter/allfilters.c
-@@ -490,6 +490,7 @@ extern const AVFilter ff_vf_trim;
- extern const AVFilter ff_vf_unpremultiply;
- extern const AVFilter ff_vf_unsharp;
- extern const AVFilter ff_vf_unsharp_opencl;
-+extern const AVFilter ff_vf_unsand;
- extern const AVFilter ff_vf_untile;
- extern const AVFilter ff_vf_uspp;
- extern const AVFilter ff_vf_v360;
-diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c
-index ba17450b93..0dbe5d2335 100644
---- a/libavfilter/buffersrc.c
-+++ b/libavfilter/buffersrc.c
-@@ -201,7 +201,7 @@ FF_ENABLE_DEPRECATION_WARNINGS
- 
-         switch (ctx->outputs[0]->type) {
-         case AVMEDIA_TYPE_VIDEO:
--            CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
-+            CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
-                                      frame->format, frame->pts);
-             break;
-         case AVMEDIA_TYPE_AUDIO:
-diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c
-new file mode 100644
-index 0000000000..7100f2fc9b
---- /dev/null
-+++ b/libavfilter/vf_unsand.c
-@@ -0,0 +1,228 @@
-+/*
-+ * Copyright (c) 2007 Bobby Bingham
++ * Copyright (c) 2022 Ben Avison
 + *
 + * This file is part of FFmpeg.
 + *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
 + *
 + * FFmpeg is distributed in the hope that it will be useful,
 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
 + *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * format and noformat video filters
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 + */
 +
 +#include <string.h>
 +
-+#include "libavutil/internal.h"
-+#include "libavutil/mem.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/rpi_sand_fns.h"
++#include "checkasm.h"
 +
-+#include "avfilter.h"
-+#include "formats.h"
-+#include "internal.h"
-+#include "video.h"
++#include "libavcodec/idctdsp.h"
 +
-+typedef struct UnsandContext {
-+    const AVClass *class;
-+} UnsandContext;
-+
-+static av_cold void uninit(AVFilterContext *ctx)
-+{
-+//    UnsandContext *s = ctx->priv;
-+}
-+
-+static av_cold int init(AVFilterContext *ctx)
-+{
-+//    UnsandContext *s = ctx->priv;
-+
-+    return 0;
-+}
-+
-+
-+static int filter_frame(AVFilterLink *link, AVFrame *in)
-+{
-+    AVFilterLink * const outlink = link->dst->outputs[0];
-+    AVFrame *out = NULL;
-+    int rv = 0;
-+
-+    if (outlink->format == in->format) {
-+        // If nothing to do then do nothing
-+        out = in;
-+    }
-+    else
-+    {
-+        if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
-+        {
-+            rv = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-+        if (av_rpi_sand_to_planar_frame(out, in) != 0)
-+        {
-+            rv = -1;
-+            goto fail;
-+        }
-+
-+        av_frame_free(&in);
-+    }
-+
-+    return ff_filter_frame(outlink, out);
-+
-+fail:
-+    av_frame_free(&out);
-+    av_frame_free(&in);
-+    return rv;
-+}
-+
-+#if 0
-+static void dump_fmts(const AVFilterFormats * fmts)
-+{
-+    int i;
-+    if (fmts== NULL) {
-+        printf("NULL\n");
-+        return;
-+    }
-+    for (i = 0; i < fmts->nb_formats; ++i) {
-+        printf(" %d", fmts->formats[i]);
-+    }
-+    printf("\n");
-+}
-+#endif
-+
-+static int query_formats(AVFilterContext *ctx)
-+{
-+//    UnsandContext *s = ctx->priv;
-+    int ret;
-+
-+    // If we aren't connected at both ends then just do nothing
-+    if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
-+        return 0;
-+
-+    // Our output formats depend on our input formats and we can't/don't
-+    // want to convert between bit depths so we need to wait for the source
-+    // to have an opinion before we do
-+    if (ctx->inputs[0]->incfg.formats == NULL)
-+        return AVERROR(EAGAIN);
-+
-+    // Accept anything
-+    if (ctx->inputs[0]->outcfg.formats == NULL &&
-+        (ret = ff_formats_ref(ctx->inputs[0]->incfg.formats, &ctx->inputs[0]->outcfg.formats)) < 0)
-+        return ret;
-+
-+    // Filter out sand formats
-+
-+    // Generate a container if we don't already have one
-+    if (ctx->outputs[0]->incfg.formats == NULL)
-+    {
-+        // Somewhat rubbish way of ensuring we have a good structure
-+        const static enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
-+        AVFilterFormats *formats = ff_make_format_list(out_fmts);
-+
-+        if (formats == NULL)
-+            return AVERROR(ENOMEM);
-+        if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->incfg.formats)) < 0)
-+            return ret;
-+    }
-+
-+    // Replace old format list with new filtered list derived from what our
-+    // input says it can do
-+    {
-+        const AVFilterFormats * const src_ff = ctx->inputs[0]->outcfg.formats;
-+        AVFilterFormats * const dst_ff = ctx->outputs[0]->incfg.formats;
-+        enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
-+        int i;
-+        int n = 0;
-+        int seen_420p = 0;
-+        int seen_420p10 = 0;
-+
-+        for (i = 0; i < src_ff->nb_formats; ++i) {
-+            const enum AVPixelFormat f = src_ff->formats[i];
-+
-+            switch (f){
-+                case AV_PIX_FMT_YUV420P:
-+                case AV_PIX_FMT_SAND128:
-+                case AV_PIX_FMT_RPI4_8:
-+                    if (!seen_420p) {
-+                        seen_420p = 1;
-+                        dst_fmts[n++] = AV_PIX_FMT_YUV420P;
-+                    }
-+                    break;
-+                case AV_PIX_FMT_SAND64_10:
-+                case AV_PIX_FMT_YUV420P10:
-+                case AV_PIX_FMT_RPI4_10:
-+                    if (!seen_420p10) {
-+                        seen_420p10 = 1;
-+                        dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
-+                    }
-+                    break;
-+                default:
-+                    dst_fmts[n++] = f;
-+                    break;
-+            }
-+        }
-+
-+        av_freep(&dst_ff->formats);
-+        dst_ff->formats = dst_fmts;
-+        dst_ff->nb_formats = n;
-+    }
-+
-+//    printf("Unsand: %s calc: ", __func__);
-+//    dump_fmts(ctx->outputs[0]->incfg.formats);
-+
-+    return 0;
-+}
-+
-+
-+#define OFFSET(x) offsetof(UnsandContext, x)
-+static const AVOption unsand_options[] = {
-+    { NULL }
-+};
-+
-+
-+AVFILTER_DEFINE_CLASS(unsand);
-+
-+static const AVFilterPad avfilter_vf_unsand_inputs[] = {
-+    {
-+        .name             = "default",
-+        .type             = AVMEDIA_TYPE_VIDEO,
-+        .filter_frame = filter_frame,
-+    },
-+    { NULL }
-+};
-+
-+static const AVFilterPad avfilter_vf_unsand_outputs[] = {
-+    {
-+        .name = "default",
-+        .type = AVMEDIA_TYPE_VIDEO
-+    },
-+};
-+
-+AVFilter ff_vf_unsand = {
-+    .name          = "unsand",
-+    .description   = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
-+
-+    .init          = init,
-+    .uninit        = uninit,
-+
-+    FILTER_QUERY_FUNC(query_formats),
-+
-+    .priv_size     = sizeof(UnsandContext),
-+    .priv_class    = &unsand_class,
-+
-+    FILTER_INPUTS(avfilter_vf_unsand_inputs),
-+    FILTER_OUTPUTS(avfilter_vf_unsand_outputs),
-+};
-+
-
-From 6e61007b19544c573f1c2a4c6060d3d24b8d500e Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 12:37:07 +0100
-Subject: [PATCH 007/136] Reduce mmal compile warnings
-
----
- libavcodec/mmaldec.c | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
-index 3092f58510..6f41b41ac4 100644
---- a/libavcodec/mmaldec.c
-+++ b/libavcodec/mmaldec.c
-@@ -24,6 +24,9 @@
-  * MMAL Video Decoder
-  */
- 
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
- #include <bcm_host.h>
- #include <interface/mmal/mmal.h>
- #include <interface/mmal/mmal_parameters_video.h>
-@@ -31,6 +34,7 @@
- #include <interface/mmal/util/mmal_util_params.h>
- #include <interface/mmal/util/mmal_default_components.h>
- #include <interface/mmal/vc/mmal_vc_api.h>
-+#pragma GCC diagnostic pop
- #include <stdatomic.h>
- 
- #include "avcodec.h"
-
-From 01aff455665e8f889330519096912ad0005add3c Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 17:56:16 +0100
-Subject: [PATCH 008/136] Add chroma location to hevc parse
-
----
- libavcodec/hevc_parser.c | 13 +++++++++++++
- libavcodec/hevcdec.c     | 13 +++++++++++++
- 2 files changed, 26 insertions(+)
-
-diff --git a/libavcodec/hevc_parser.c b/libavcodec/hevc_parser.c
-index 59f9a0ff3e..4ae7222e8b 100644
---- a/libavcodec/hevc_parser.c
-+++ b/libavcodec/hevc_parser.c
-@@ -97,6 +97,19 @@ static int hevc_parse_slice_header(AVCodecParserContext *s, H2645NAL *nal,
-     avctx->profile  = ps->sps->ptl.general_ptl.profile_idc;
-     avctx->level    = ps->sps->ptl.general_ptl.level_idc;
- 
-+    if (ps->sps->chroma_format_idc == 1) {
-+        avctx->chroma_sample_location = ps->sps->vui.common.chroma_loc_info_present_flag ?
-+            ps->sps->vui.common.chroma_sample_loc_type_top_field + 1 :
-+            AVCHROMA_LOC_LEFT;
-+    }
-+    else if (ps->sps->chroma_format_idc == 2 ||
-+             ps->sps->chroma_format_idc == 3) {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
-+    }
-+    else {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
-+    }
-+
-     if (ps->vps->vps_timing_info_present_flag) {
-         num = ps->vps->vps_num_units_in_tick;
-         den = ps->vps->vps_time_scale;
-diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
-index 567e8d81d4..b6cfea64d3 100644
---- a/libavcodec/hevcdec.c
-+++ b/libavcodec/hevcdec.c
-@@ -347,6 +347,19 @@ static void export_stream_params(HEVCContext *s, const HEVCSPS *sps)
-     else
-         avctx->color_range = AVCOL_RANGE_MPEG;
- 
-+    if (sps->chroma_format_idc == 1) {
-+        avctx->chroma_sample_location = sps->vui.common.chroma_loc_info_present_flag ?
-+            sps->vui.common.chroma_sample_loc_type_top_field + 1 :
-+            AVCHROMA_LOC_LEFT;
-+    }
-+    else if (sps->chroma_format_idc == 2 ||
-+             sps->chroma_format_idc == 3) {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;;
-+    }
-+    else {
-+        avctx->chroma_sample_location = AVCHROMA_LOC_UNSPECIFIED;
-+    }
-+
-     if (sps->vui.common.colour_description_present_flag) {
-         avctx->color_primaries = sps->vui.common.colour_primaries;
-         avctx->color_trc       = sps->vui.common.transfer_characteristics;
-
-From c80aad5d2fb373f7564e4257b1272f2decb06dd0 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 26 Sep 2022 18:20:50 +0100
-Subject: [PATCH 009/136] hwaccel: Add .abort_frame & use in hevcdec
-
----
- libavcodec/avcodec.h | 11 +++++++++++
- libavcodec/hevcdec.c |  7 ++++++-
- 2 files changed, 17 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index 39881a1d2b..32bc78e2be 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -2221,6 +2221,17 @@ typedef struct AVHWAccel {
-      * that avctx->hwaccel_priv_data is invalid.
-      */
-     int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
-+
-+    /**
-+     * Called if parsing fails
-+     *
-+     * An error has occured, end_frame will not be called
-+     * start_frame & decode_slice may or may not have been called
-+     * Optional
-+     *
-+     * @param avctx the codec context
-+     */
-+    void (*abort_frame)(AVCodecContext *avctx);
- } AVHWAccel;
- 
- /**
-diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
-index b6cfea64d3..8a0246fa21 100644
---- a/libavcodec/hevcdec.c
-+++ b/libavcodec/hevcdec.c
-@@ -3375,8 +3375,13 @@ static int hevc_decode_frame(AVCodecContext *avctx, AVFrame *rframe,
- 
-     s->ref = NULL;
-     ret    = decode_nal_units(s, avpkt->data, avpkt->size);
--    if (ret < 0)
-+    if (ret < 0) {
-+        // Ensure that hwaccel knows this frame is over
-+        if (s->avctx->hwaccel && s->avctx->hwaccel->abort_frame)
-+            s->avctx->hwaccel->abort_frame(s->avctx);
-+
-         return ret;
-+    }
- 
-     if (avctx->hwaccel) {
-         if (s->ref && (ret = avctx->hwaccel->end_frame(avctx)) < 0) {
-
-From 317722fd652d9a1c1700319c80fc71acf68ddde6 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 26 Sep 2022 18:26:17 +0100
-Subject: [PATCH 010/136] hwaccel: Add CAP_MT_SAFE for accels that can use
- multi-thread
-
----
- libavcodec/hwconfig.h      | 1 +
- libavcodec/pthread_frame.c | 7 +++++--
- 2 files changed, 6 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
-index 721424912c..c43ad55245 100644
---- a/libavcodec/hwconfig.h
-+++ b/libavcodec/hwconfig.h
-@@ -24,6 +24,7 @@
- 
- 
- #define HWACCEL_CAP_ASYNC_SAFE      (1 << 0)
-+#define HWACCEL_CAP_MT_SAFE         (1 << 1)
- 
- 
- typedef struct AVCodecHWConfigInternal {
-diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
-index d9d5afaa82..2cc89a41f5 100644
---- a/libavcodec/pthread_frame.c
-+++ b/libavcodec/pthread_frame.c
-@@ -204,7 +204,8 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
- 
-         /* if the previous thread uses hwaccel then we take the lock to ensure
-          * the threads don't run concurrently */
--        if (avctx->hwaccel) {
-+        if (avctx->hwaccel &&
-+            !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
-             pthread_mutex_lock(&p->parent->hwaccel_mutex);
-             p->hwaccel_serializing = 1;
-         }
-@@ -590,7 +591,9 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
- 
-     if (!(avctx->active_thread_type&FF_THREAD_FRAME)) return;
- 
--    if (avctx->hwaccel && !p->hwaccel_serializing) {
-+    if (avctx->hwaccel &&
-+        !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
-+        !p->hwaccel_serializing) {
-         pthread_mutex_lock(&p->parent->hwaccel_mutex);
-         p->hwaccel_serializing = 1;
-     }
-
-From 9005b263450e154a5ec5258fda17d5998fe7896b Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 17:59:08 +0100
-Subject: [PATCH 011/136] Weak link utils
-
----
- libavcodec/weak_link.c | 102 +++++++++++++++++++++++++++++++++++++++++
- libavcodec/weak_link.h |  23 ++++++++++
- 2 files changed, 125 insertions(+)
- create mode 100644 libavcodec/weak_link.c
- create mode 100644 libavcodec/weak_link.h
-
-diff --git a/libavcodec/weak_link.c b/libavcodec/weak_link.c
-new file mode 100644
-index 0000000000..f234a985b9
---- /dev/null
-+++ b/libavcodec/weak_link.c
-@@ -0,0 +1,102 @@
-+#include <stdlib.h>
-+#include <pthread.h>
-+#include <stdatomic.h>
-+#include "weak_link.h"
-+
-+struct ff_weak_link_master {
-+    atomic_int ref_count;    /* 0 is single ref for easier atomics */
-+    pthread_rwlock_t lock;
-+    void * ptr;
-+};
-+
-+static inline struct ff_weak_link_master * weak_link_x(struct ff_weak_link_client * c)
-+{
-+    return (struct ff_weak_link_master *)c;
-+}
-+
-+struct ff_weak_link_master * ff_weak_link_new(void * p)
-+{
-+    struct ff_weak_link_master * w = malloc(sizeof(*w));
-+    if (!w)
-+        return NULL;
-+    w->ptr = p;
-+    if (pthread_rwlock_init(&w->lock, NULL)) {
-+        free(w);
-+        return NULL;
-+    }
-+    return w;
-+}
-+
-+static void weak_link_do_unref(struct ff_weak_link_master * const w)
-+{
-+    int n = atomic_fetch_sub(&w->ref_count, 1);
-+    if (n)
-+        return;
-+
-+    pthread_rwlock_destroy(&w->lock);
-+    free(w);
-+}
-+
-+// Unref & break link
-+void ff_weak_link_break(struct ff_weak_link_master ** ppLink)
-+{
-+    struct ff_weak_link_master * const w = *ppLink;
-+    if (!w)
-+        return;
-+
-+    *ppLink = NULL;
-+    pthread_rwlock_wrlock(&w->lock);
-+    w->ptr = NULL;
-+    pthread_rwlock_unlock(&w->lock);
-+
-+    weak_link_do_unref(w);
-+}
-+
-+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w)
-+{
-+    if (!w)
-+        return NULL;
-+    atomic_fetch_add(&w->ref_count, 1);
-+    return (struct ff_weak_link_client*)w;
-+}
-+
-+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink)
-+{
-+    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
-+    if (!w)
-+        return;
-+
-+    *ppLink = NULL;
-+    weak_link_do_unref(w);
-+}
-+
-+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink)
-+{
-+    struct ff_weak_link_master * const w = weak_link_x(*ppLink);
-+
-+    if (!w)
-+        return NULL;
-+
-+    if (pthread_rwlock_rdlock(&w->lock))
-+        goto broken;
-+
-+    if (w->ptr)
-+        return w->ptr;
-+
-+    pthread_rwlock_unlock(&w->lock);
-+
-+broken:
-+    *ppLink = NULL;
-+    weak_link_do_unref(w);
-+    return NULL;
-+}
-+
-+// Ignores a NULL c (so can be on the return path of both broken & live links)
-+void ff_weak_link_unlock(struct ff_weak_link_client * c)
-+{
-+    struct ff_weak_link_master * const w = weak_link_x(c);
-+    if (w)
-+        pthread_rwlock_unlock(&w->lock);
-+}
-+
-+
-diff --git a/libavcodec/weak_link.h b/libavcodec/weak_link.h
-new file mode 100644
-index 0000000000..415b6a27a0
---- /dev/null
-+++ b/libavcodec/weak_link.h
-@@ -0,0 +1,23 @@
-+struct ff_weak_link_master;
-+struct ff_weak_link_client;
-+
-+struct ff_weak_link_master * ff_weak_link_new(void * p);
-+void ff_weak_link_break(struct ff_weak_link_master ** ppLink);
-+
-+struct ff_weak_link_client* ff_weak_link_ref(struct ff_weak_link_master * w);
-+void ff_weak_link_unref(struct ff_weak_link_client ** ppLink);
-+
-+// Returns NULL if link broken - in this case it will also zap
-+//   *ppLink and unref the weak_link.
-+// Returns NULL if *ppLink is NULL (so a link once broken stays broken)
-+//
-+// The above does mean that there is a race if this is called simultainiously
-+// by two threads using the same weak_link_client (so don't do that)
-+void * ff_weak_link_lock(struct ff_weak_link_client ** ppLink);
-+void ff_weak_link_unlock(struct ff_weak_link_client * c);
-+
-+
-+
-+
-+
-+
-
-From 824be1710ca96d97c86836fdac0e7dcd28a4b92e Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 19:23:26 +0100
-Subject: [PATCH 012/136] Add v4l2_req V4L2 request H265 drm_prime decode
-
-Has the abiliy to switch between kernel API versions at runtime. This
-could be removed later once teher is no chance of usage on an old
-kernel.
----
- configure                       |   14 +
- libavcodec/Makefile             |    4 +
- libavcodec/hevc-ctrls-v1.h      |  229 +++++
- libavcodec/hevc-ctrls-v2.h      |  257 +++++
- libavcodec/hevcdec.c            |   10 +
- libavcodec/hwaccels.h           |    1 +
- libavcodec/hwconfig.h           |    2 +
- libavcodec/v4l2_req_decode_q.c  |   84 ++
- libavcodec/v4l2_req_decode_q.h  |   25 +
- libavcodec/v4l2_req_devscan.c   |  449 +++++++++
- libavcodec/v4l2_req_devscan.h   |   23 +
- libavcodec/v4l2_req_dmabufs.c   |  266 ++++++
- libavcodec/v4l2_req_dmabufs.h   |   40 +
- libavcodec/v4l2_req_hevc_v1.c   |    3 +
- libavcodec/v4l2_req_hevc_v2.c   |    3 +
- libavcodec/v4l2_req_hevc_vx.c   | 1213 +++++++++++++++++++++++
- libavcodec/v4l2_req_media.c     | 1596 +++++++++++++++++++++++++++++++
- libavcodec/v4l2_req_media.h     |  151 +++
- libavcodec/v4l2_req_pollqueue.c |  361 +++++++
- libavcodec/v4l2_req_pollqueue.h |   18 +
- libavcodec/v4l2_req_utils.h     |   27 +
- libavcodec/v4l2_request_hevc.c  |  297 ++++++
- libavcodec/v4l2_request_hevc.h  |  102 ++
- 23 files changed, 5175 insertions(+)
- create mode 100644 libavcodec/hevc-ctrls-v1.h
- create mode 100644 libavcodec/hevc-ctrls-v2.h
- create mode 100644 libavcodec/v4l2_req_decode_q.c
- create mode 100644 libavcodec/v4l2_req_decode_q.h
- create mode 100644 libavcodec/v4l2_req_devscan.c
- create mode 100644 libavcodec/v4l2_req_devscan.h
- create mode 100644 libavcodec/v4l2_req_dmabufs.c
- create mode 100644 libavcodec/v4l2_req_dmabufs.h
- create mode 100644 libavcodec/v4l2_req_hevc_v1.c
- create mode 100644 libavcodec/v4l2_req_hevc_v2.c
- create mode 100644 libavcodec/v4l2_req_hevc_vx.c
- create mode 100644 libavcodec/v4l2_req_media.c
- create mode 100644 libavcodec/v4l2_req_media.h
- create mode 100644 libavcodec/v4l2_req_pollqueue.c
- create mode 100644 libavcodec/v4l2_req_pollqueue.h
- create mode 100644 libavcodec/v4l2_req_utils.h
- create mode 100644 libavcodec/v4l2_request_hevc.c
- create mode 100644 libavcodec/v4l2_request_hevc.h
-
-diff --git a/configure b/configure
-index 7712482bd5..199aa2b3d5 100755
---- a/configure
-+++ b/configure
-@@ -281,6 +281,7 @@ External library support:
-                            if openssl, gnutls or mbedtls is not used [no]
-   --enable-libtwolame      enable MP2 encoding via libtwolame [no]
-   --enable-libuavs3d       enable AVS3 decoding via libuavs3d [no]
-+  --enable-libudev         enable libudev [no]
-   --enable-libv4l2         enable libv4l2/v4l-utils [no]
-   --enable-libvidstab      enable video stabilization using vid.stab [no]
-   --enable-libvmaf         enable vmaf filter via libvmaf [no]
-@@ -351,6 +352,7 @@ External library support:
-   --enable-omx-rpi         enable OpenMAX IL code for Raspberry Pi [no]
-   --enable-rkmpp           enable Rockchip Media Process Platform code [no]
-   --disable-v4l2-m2m       disable V4L2 mem2mem code [autodetect]
-+  --enable-v4l2-request    enable V4L2 request API code [no]
-   --disable-vaapi          disable Video Acceleration API (mainly Unix/Intel) code [autodetect]
-   --disable-vdpau          disable Nvidia Video Decode and Presentation API for Unix code [autodetect]
-   --disable-videotoolbox   disable VideoToolbox code [autodetect]
-@@ -1858,6 +1860,7 @@ EXTERNAL_LIBRARY_LIST="
-     libtheora
-     libtwolame
-     libuavs3d
-+    libudev
-     libv4l2
-     libvmaf
-     libvorbis
-@@ -1914,6 +1917,7 @@ HWACCEL_LIBRARY_LIST="
-     mmal
-     omx
-     opencl
-+    v4l2_request
- "
- 
- DOCUMENT_LIST="
-@@ -3002,6 +3006,7 @@ d3d11va_deps="dxva_h ID3D11VideoDecoder ID3D11VideoContext"
- dxva2_deps="dxva2api_h DXVA2_ConfigPictureDecode ole32 user32"
- ffnvcodec_deps_any="libdl LoadLibrary"
- nvdec_deps="ffnvcodec"
-+v4l2_request_deps="linux_videodev2_h linux_media_h v4l2_timeval_to_ns libdrm libudev"
- vaapi_x11_deps="xlib_x11"
- videotoolbox_hwaccel_deps="videotoolbox pthreads"
- videotoolbox_hwaccel_extralibs="-framework QuartzCore"
-@@ -3045,6 +3050,8 @@ hevc_dxva2_hwaccel_deps="dxva2 DXVA_PicParams_HEVC"
- hevc_dxva2_hwaccel_select="hevc_decoder"
- hevc_nvdec_hwaccel_deps="nvdec"
- hevc_nvdec_hwaccel_select="hevc_decoder"
-+hevc_v4l2request_hwaccel_deps="v4l2_request"
-+hevc_v4l2request_hwaccel_select="hevc_decoder"
- hevc_vaapi_hwaccel_deps="vaapi VAPictureParameterBufferHEVC"
- hevc_vaapi_hwaccel_select="hevc_decoder"
- hevc_vdpau_hwaccel_deps="vdpau VdpPictureInfoHEVC"
-@@ -6696,6 +6703,7 @@ enabled libtwolame        && require libtwolame twolame.h twolame_init -ltwolame
-                              { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame ||
-                                die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; }
- enabled libuavs3d         && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode
-+enabled libudev           && require_pkg_config libudev libudev libudev.h udev_new
- enabled libv4l2           && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl
- enabled libvidstab        && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit
- enabled libvmaf           && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init
-@@ -6798,6 +6806,10 @@ enabled rkmpp             && { require_pkg_config rkmpp rockchip_mpp  rockchip/r
-                                { enabled libdrm ||
-                                  die "ERROR: rkmpp requires --enable-libdrm"; }
-                              }
-+enabled v4l2_request      && { enabled libdrm ||
-+                               die "ERROR: v4l2-request requires --enable-libdrm"; } &&
-+                             { enabled libudev ||
-+                               die "ERROR: v4l2-request requires --enable-libudev"; }
- enabled vapoursynth       && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
- 
- 
-@@ -6880,6 +6892,8 @@ if enabled v4l2_m2m; then
-     check_cc vp9_v4l2_m2m linux/videodev2.h "int i = V4L2_PIX_FMT_VP9;"
- fi
- 
-+check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
-+check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
- check_headers sys/videoio.h
- test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
- 
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 389253f5d0..2d440b5648 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -170,6 +170,8 @@ OBJS-$(CONFIG_VP3DSP)                  += vp3dsp.o
- OBJS-$(CONFIG_VP56DSP)                 += vp56dsp.o
- OBJS-$(CONFIG_VP8DSP)                  += vp8dsp.o
- OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
-+OBJS-$(CONFIG_V4L2_REQUEST)            += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
-+					  v4l2_req_devscan.o weak_link.o
- OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
- OBJS-$(CONFIG_WMV2DSP)                 += wmv2dsp.o
- 
-@@ -996,6 +998,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)       += dxva2_hevc.o
- OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
- OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
- OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec.o
-+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o\
-+                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o
- OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
-diff --git a/libavcodec/hevc-ctrls-v1.h b/libavcodec/hevc-ctrls-v1.h
-new file mode 100644
-index 0000000000..72cbba0953
---- /dev/null
-+++ b/libavcodec/hevc-ctrls-v1.h
-@@ -0,0 +1,229 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/videodev2.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed. */
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_MPEG_BASE + 1008)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_MPEG_BASE + 1009)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_MPEG_BASE + 1010)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_MPEG_BASE + 1011)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_MPEG_BASE + 1015)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_MPEG_BASE + 1016)
-+
-+/* enum v4l2_ctrl_type type values */
-+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-+
-+enum v4l2_mpeg_video_hevc_decode_mode {
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_mpeg_video_hevc_start_code {
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B	0
-+#define V4L2_HEVC_SLICE_TYPE_P	1
-+#define V4L2_HEVC_SLICE_TYPE_I	2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-+
-+/* The controls are not stable at the moment and will likely be reworked. */
-+struct v4l2_ctrl_hevc_sps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+	__u16	pic_width_in_luma_samples;
-+	__u16	pic_height_in_luma_samples;
-+	__u8	bit_depth_luma_minus8;
-+	__u8	bit_depth_chroma_minus8;
-+	__u8	log2_max_pic_order_cnt_lsb_minus4;
-+	__u8	sps_max_dec_pic_buffering_minus1;
-+	__u8	sps_max_num_reorder_pics;
-+	__u8	sps_max_latency_increase_plus1;
-+	__u8	log2_min_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_luma_coding_block_size;
-+	__u8	log2_min_luma_transform_block_size_minus2;
-+	__u8	log2_diff_max_min_luma_transform_block_size;
-+	__u8	max_transform_hierarchy_depth_inter;
-+	__u8	max_transform_hierarchy_depth_intra;
-+	__u8	pcm_sample_bit_depth_luma_minus1;
-+	__u8	pcm_sample_bit_depth_chroma_minus1;
-+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-+	__u8	num_short_term_ref_pic_sets;
-+	__u8	num_long_term_ref_pics_sps;
-+	__u8	chroma_format_idc;
-+	__u8	sps_max_sub_layers_minus1;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 0)
-+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-+
-+struct v4l2_ctrl_hevc_pps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+	__u8	num_extra_slice_header_bits;
-+	__s8	init_qp_minus26;
-+	__u8	diff_cu_qp_delta_depth;
-+	__s8	pps_cb_qp_offset;
-+	__s8	pps_cr_qp_offset;
-+	__u8	num_tile_columns_minus1;
-+	__u8	num_tile_rows_minus1;
-+	__u8	column_width_minus1[20];
-+	__u8	row_height_minus1[22];
-+	__s8	pps_beta_offset_div2;
-+	__s8	pps_tc_offset_div2;
-+	__u8	log2_parallel_merge_level_minus2;
-+
-+	__u8	padding[4];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
-+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
-+
-+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-+
-+struct v4l2_hevc_dpb_entry {
-+	__u64	timestamp;
-+	__u8	rps;
-+	__u8	field_pic;
-+	__u16	pic_order_cnt[2];
-+	__u8	padding[2];
-+};
-+
-+struct v4l2_hevc_pred_weight_table {
-+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__u8	padding[6];
-+
-+	__u8	luma_log2_weight_denom;
-+	__s8	delta_chroma_log2_weight_denom;
-+};
-+
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT		(1ULL << 9)
-+
-+struct v4l2_ctrl_hevc_slice_params {
-+	__u32	bit_size;
-+	__u32	data_bit_offset;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u32	slice_segment_addr;
-+	__u32	num_entry_point_offsets;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+	__u8	nal_unit_type;
-+	__u8	nuh_temporal_id_plus1;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	slice_type;
-+	__u8	colour_plane_id;
-+	__u16	slice_pic_order_cnt;
-+	__u8	num_ref_idx_l0_active_minus1;
-+	__u8	num_ref_idx_l1_active_minus1;
-+	__u8	collocated_ref_idx;
-+	__u8	five_minus_max_num_merge_cand;
-+	__s8	slice_qp_delta;
-+	__s8	slice_cb_qp_offset;
-+	__s8	slice_cr_qp_offset;
-+	__s8	slice_act_y_qp_offset;
-+	__s8	slice_act_cb_qp_offset;
-+	__s8	slice_act_cr_qp_offset;
-+	__s8	slice_beta_offset_div2;
-+	__s8	slice_tc_offset_div2;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+	__u8	pic_struct;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	num_active_dpb_entries;
-+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	__u8	num_rps_poc_st_curr_before;
-+	__u8	num_rps_poc_st_curr_after;
-+	__u8	num_rps_poc_lt_curr;
-+
-+	__u8	padding;
-+
-+	__u32	entry_point_offset_minus1[256];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	struct v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-+	struct v4l2_hevc_pred_weight_table pred_weight_table;
-+
-+	__u64	flags;
-+};
-+
-+struct v4l2_ctrl_hevc_scaling_matrix {
-+	__u8	scaling_list_4x4[6][16];
-+	__u8	scaling_list_8x8[6][64];
-+	__u8	scaling_list_16x16[6][64];
-+	__u8	scaling_list_32x32[2][64];
-+	__u8	scaling_list_dc_coef_16x16[6];
-+	__u8	scaling_list_dc_coef_32x32[2];
-+};
-+
-+#endif
-diff --git a/libavcodec/hevc-ctrls-v2.h b/libavcodec/hevc-ctrls-v2.h
-new file mode 100644
-index 0000000000..7cbbbf055f
---- /dev/null
-+++ b/libavcodec/hevc-ctrls-v2.h
-@@ -0,0 +1,257 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/videodev2.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed. */
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
-+
-+/* enum v4l2_ctrl_type type values */
-+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
-+
-+enum v4l2_mpeg_video_hevc_decode_mode {
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_mpeg_video_hevc_start_code {
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B	0
-+#define V4L2_HEVC_SLICE_TYPE_P	1
-+#define V4L2_HEVC_SLICE_TYPE_I	2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-+
-+/* The controls are not stable at the moment and will likely be reworked. */
-+struct v4l2_ctrl_hevc_sps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+	__u16	pic_width_in_luma_samples;
-+	__u16	pic_height_in_luma_samples;
-+	__u8	bit_depth_luma_minus8;
-+	__u8	bit_depth_chroma_minus8;
-+	__u8	log2_max_pic_order_cnt_lsb_minus4;
-+	__u8	sps_max_dec_pic_buffering_minus1;
-+	__u8	sps_max_num_reorder_pics;
-+	__u8	sps_max_latency_increase_plus1;
-+	__u8	log2_min_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_luma_coding_block_size;
-+	__u8	log2_min_luma_transform_block_size_minus2;
-+	__u8	log2_diff_max_min_luma_transform_block_size;
-+	__u8	max_transform_hierarchy_depth_inter;
-+	__u8	max_transform_hierarchy_depth_intra;
-+	__u8	pcm_sample_bit_depth_luma_minus1;
-+	__u8	pcm_sample_bit_depth_chroma_minus1;
-+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-+	__u8	num_short_term_ref_pic_sets;
-+	__u8	num_long_term_ref_pics_sps;
-+	__u8	chroma_format_idc;
-+	__u8	sps_max_sub_layers_minus1;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
-+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
-+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
-+
-+struct v4l2_ctrl_hevc_pps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+	__u8	num_extra_slice_header_bits;
-+	__u8	num_ref_idx_l0_default_active_minus1;
-+	__u8	num_ref_idx_l1_default_active_minus1;
-+	__s8	init_qp_minus26;
-+	__u8	diff_cu_qp_delta_depth;
-+	__s8	pps_cb_qp_offset;
-+	__s8	pps_cr_qp_offset;
-+	__u8	num_tile_columns_minus1;
-+	__u8	num_tile_rows_minus1;
-+	__u8	column_width_minus1[20];
-+	__u8	row_height_minus1[22];
-+	__s8	pps_beta_offset_div2;
-+	__s8	pps_tc_offset_div2;
-+	__u8	log2_parallel_merge_level_minus2;
-+
-+	__u8	padding[4];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE	0x01
-+#define V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER	0x02
-+#define V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR		0x03
-+
-+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-+
-+struct v4l2_hevc_dpb_entry {
-+	__u64	timestamp;
-+	__u8	rps;
-+	__u8	field_pic;
-+	__u16	pic_order_cnt[2];
-+	__u8	padding[2];
-+};
-+
-+struct v4l2_hevc_pred_weight_table {
-+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__u8	padding[6];
-+
-+	__u8	luma_log2_weight_denom;
-+	__s8	delta_chroma_log2_weight_denom;
-+};
-+
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
-+
-+struct v4l2_ctrl_hevc_slice_params {
-+	__u32	bit_size;
-+	__u32	data_bit_offset;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u32	slice_segment_addr;
-+	__u32	num_entry_point_offsets;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+	__u8	nal_unit_type;
-+	__u8	nuh_temporal_id_plus1;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	slice_type;
-+	__u8	colour_plane_id;
-+	__u16	slice_pic_order_cnt;
-+	__u8	num_ref_idx_l0_active_minus1;
-+	__u8	num_ref_idx_l1_active_minus1;
-+	__u8	collocated_ref_idx;
-+	__u8	five_minus_max_num_merge_cand;
-+	__s8	slice_qp_delta;
-+	__s8	slice_cb_qp_offset;
-+	__s8	slice_cr_qp_offset;
-+	__s8	slice_act_y_qp_offset;
-+	__s8	slice_act_cb_qp_offset;
-+	__s8	slice_act_cr_qp_offset;
-+	__s8	slice_beta_offset_div2;
-+	__s8	slice_tc_offset_div2;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+	__u8	pic_struct;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	__u8	padding[5];
-+
-+	__u32	entry_point_offset_minus1[256];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-+	struct v4l2_hevc_pred_weight_table pred_weight_table;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
-+
-+struct v4l2_ctrl_hevc_decode_params {
-+	__s32	pic_order_cnt_val;
-+	__u8	num_active_dpb_entries;
-+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	num_poc_st_curr_before;
-+	__u8	num_poc_st_curr_after;
-+	__u8	num_poc_lt_curr;
-+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u64	flags;
-+};
-+
-+/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
-+#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
-+/*
-+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
-+ * the number of data (in bits) to skip in the
-+ * slice segment header.
-+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
-+ * to before syntax element "slice_temporal_mvp_enabled_flag".
-+ * If IDR, the skipped bits are just "pic_output_flag"
-+ * (separate_colour_plane_flag is not supported).
-+ */
-+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
-+
-+struct v4l2_ctrl_hevc_scaling_matrix {
-+	__u8	scaling_list_4x4[6][16];
-+	__u8	scaling_list_8x8[6][64];
-+	__u8	scaling_list_16x16[6][64];
-+	__u8	scaling_list_32x32[2][64];
-+	__u8	scaling_list_dc_coef_16x16[6];
-+	__u8	scaling_list_dc_coef_32x32[2];
-+};
-+
-+#endif
-diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
-index 8a0246fa21..2867cb2e16 100644
---- a/libavcodec/hevcdec.c
-+++ b/libavcodec/hevcdec.c
-@@ -416,6 +416,7 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
- #define HWACCEL_MAX (CONFIG_HEVC_DXVA2_HWACCEL + \
-                      CONFIG_HEVC_D3D11VA_HWACCEL * 2 + \
-                      CONFIG_HEVC_NVDEC_HWACCEL + \
-+                     CONFIG_HEVC_V4L2REQUEST_HWACCEL + \
-                      CONFIG_HEVC_VAAPI_HWACCEL + \
-                      CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL + \
-                      CONFIG_HEVC_VDPAU_HWACCEL)
-@@ -442,6 +443,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
- #endif
- #if CONFIG_HEVC_VIDEOTOOLBOX_HWACCEL
-         *fmt++ = AV_PIX_FMT_VIDEOTOOLBOX;
-+#endif
-+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
-+        *fmt++ = AV_PIX_FMT_DRM_PRIME;
- #endif
-         break;
-     case AV_PIX_FMT_YUV420P10:
-@@ -463,6 +467,9 @@ static enum AVPixelFormat get_format(HEVCContext *s, const HEVCSPS *sps)
- #endif
- #if CONFIG_HEVC_NVDEC_HWACCEL
-         *fmt++ = AV_PIX_FMT_CUDA;
-+#endif
-+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
-+        *fmt++ = AV_PIX_FMT_DRM_PRIME;
- #endif
-         break;
-     case AV_PIX_FMT_YUV444P:
-@@ -3749,6 +3756,9 @@ const FFCodec ff_hevc_decoder = {
- #if CONFIG_HEVC_NVDEC_HWACCEL
-                                HWACCEL_NVDEC(hevc),
- #endif
-+#if CONFIG_HEVC_V4L2REQUEST_HWACCEL
-+                               HWACCEL_V4L2REQUEST(hevc),
-+#endif
- #if CONFIG_HEVC_VAAPI_HWACCEL
-                                HWACCEL_VAAPI(hevc),
- #endif
-diff --git a/libavcodec/hwaccels.h b/libavcodec/hwaccels.h
-index aca55831f3..f32d1c4ec4 100644
---- a/libavcodec/hwaccels.h
-+++ b/libavcodec/hwaccels.h
-@@ -40,6 +40,7 @@ extern const AVHWAccel ff_hevc_d3d11va_hwaccel;
- extern const AVHWAccel ff_hevc_d3d11va2_hwaccel;
- extern const AVHWAccel ff_hevc_dxva2_hwaccel;
- extern const AVHWAccel ff_hevc_nvdec_hwaccel;
-+extern const AVHWAccel ff_hevc_v4l2request_hwaccel;
- extern const AVHWAccel ff_hevc_vaapi_hwaccel;
- extern const AVHWAccel ff_hevc_vdpau_hwaccel;
- extern const AVHWAccel ff_hevc_videotoolbox_hwaccel;
-diff --git a/libavcodec/hwconfig.h b/libavcodec/hwconfig.h
-index c43ad55245..b8aa383071 100644
---- a/libavcodec/hwconfig.h
-+++ b/libavcodec/hwconfig.h
-@@ -71,6 +71,8 @@ typedef struct AVCodecHWConfigInternal {
-     HW_CONFIG_HWACCEL(1, 1, 0, D3D11,        D3D11VA,      ff_ ## codec ## _d3d11va2_hwaccel)
- #define HWACCEL_NVDEC(codec) \
-     HW_CONFIG_HWACCEL(1, 1, 0, CUDA,         CUDA,         ff_ ## codec ## _nvdec_hwaccel)
-+#define HWACCEL_V4L2REQUEST(codec) \
-+    HW_CONFIG_HWACCEL(1, 0, 0, DRM_PRIME,    DRM,          ff_ ## codec ## _v4l2request_hwaccel)
- #define HWACCEL_VAAPI(codec) \
-     HW_CONFIG_HWACCEL(1, 1, 1, VAAPI,        VAAPI,        ff_ ## codec ## _vaapi_hwaccel)
- #define HWACCEL_VDPAU(codec) \
-diff --git a/libavcodec/v4l2_req_decode_q.c b/libavcodec/v4l2_req_decode_q.c
-new file mode 100644
-index 0000000000..5b3fb958fa
---- /dev/null
-+++ b/libavcodec/v4l2_req_decode_q.c
-@@ -0,0 +1,84 @@
-+#include <memory.h>
-+#include <semaphore.h>
-+#include <pthread.h>
-+
-+#include "v4l2_req_decode_q.h"
-+
-+int decode_q_in_q(const req_decode_ent * const d)
-+{
-+    return d->in_q;
-+}
-+
-+void decode_q_add(req_decode_q * const q, req_decode_ent * const d)
-+{
-+    pthread_mutex_lock(&q->q_lock);
-+    if (!q->head) {
-+        q->head = d;
-+        q->tail = d;
-+        d->prev = NULL;
-+    }
-+    else {
-+        q->tail->next = d;
-+        d->prev = q->tail;
-+        q->tail = d;
-+    }
-+    d->next = NULL;
-+    d->in_q = 1;
-+    pthread_mutex_unlock(&q->q_lock);
-+}
-+
-+// Remove entry from Q - if head wake-up anything that was waiting
-+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d)
-+{
-+    int try_signal = 0;
-+
-+    if (!d->in_q)
-+        return;
-+
-+    pthread_mutex_lock(&q->q_lock);
-+    if (d->prev)
-+        d->prev->next = d->next;
-+    else {
-+        try_signal = 1;  // Only need to signal if we were head
-+        q->head = d->next;
-+    }
-+
-+    if (d->next)
-+        d->next->prev = d->prev;
-+    else
-+        q->tail = d->prev;
-+
-+    // Not strictly needed but makes debug easier
-+    d->next = NULL;
-+    d->prev = NULL;
-+    d->in_q = 0;
-+    pthread_mutex_unlock(&q->q_lock);
-+
-+    if (try_signal)
-+        pthread_cond_broadcast(&q->q_cond);
-+}
-+
-+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d)
-+{
-+    pthread_mutex_lock(&q->q_lock);
-+
-+    while (q->head != d)
-+        pthread_cond_wait(&q->q_cond, &q->q_lock);
-+
-+    pthread_mutex_unlock(&q->q_lock);
-+}
-+
-+void decode_q_uninit(req_decode_q * const q)
-+{
-+    pthread_mutex_destroy(&q->q_lock);
-+    pthread_cond_destroy(&q->q_cond);
-+}
-+
-+void decode_q_init(req_decode_q * const q)
-+{
-+    memset(q, 0, sizeof(*q));
-+    pthread_mutex_init(&q->q_lock, NULL);
-+    pthread_cond_init(&q->q_cond, NULL);
-+}
-+
-+
-diff --git a/libavcodec/v4l2_req_decode_q.h b/libavcodec/v4l2_req_decode_q.h
-new file mode 100644
-index 0000000000..af7bbe1de4
---- /dev/null
-+++ b/libavcodec/v4l2_req_decode_q.h
-@@ -0,0 +1,25 @@
-+#ifndef AVCODEC_V4L2_REQ_DECODE_Q_H
-+#define AVCODEC_V4L2_REQ_DECODE_Q_H
-+
-+typedef struct req_decode_ent {
-+    struct req_decode_ent * next;
-+    struct req_decode_ent * prev;
-+    int in_q;
-+} req_decode_ent;
-+
-+typedef struct req_decode_q {
-+    pthread_mutex_t q_lock;
-+    pthread_cond_t q_cond;
-+    req_decode_ent * head;
-+    req_decode_ent * tail;
-+} req_decode_q;
-+
-+int decode_q_in_q(const req_decode_ent * const d);
-+void decode_q_add(req_decode_q * const q, req_decode_ent * const d);
-+void decode_q_remove(req_decode_q * const q, req_decode_ent * const d);
-+void decode_q_wait(req_decode_q * const q, req_decode_ent * const d);
-+void decode_q_uninit(req_decode_q * const q);
-+void decode_q_init(req_decode_q * const q);
-+
-+#endif
-+
-diff --git a/libavcodec/v4l2_req_devscan.c b/libavcodec/v4l2_req_devscan.c
-new file mode 100644
-index 0000000000..cfa94d55c4
---- /dev/null
-+++ b/libavcodec/v4l2_req_devscan.c
-@@ -0,0 +1,449 @@
-+#include <errno.h>
-+#include <fcntl.h>
-+#include <libudev.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <unistd.h>
-+
-+#include <sys/ioctl.h>
-+#include <sys/sysmacros.h>
-+
-+#include <linux/media.h>
-+#include <linux/videodev2.h>
-+
-+#include "v4l2_req_devscan.h"
-+#include "v4l2_req_utils.h"
-+
-+struct decdev {
-+    enum v4l2_buf_type src_type;
-+    uint32_t src_fmt_v4l2;
-+    const char * vname;
-+    const char * mname;
-+};
-+
-+struct devscan {
-+    struct decdev env;
-+    unsigned int dev_size;
-+    unsigned int dev_count;
-+    struct decdev *devs;
-+};
-+
-+static int video_src_pixfmt_supported(uint32_t fmt)
-+{
-+    return 1;
-+}
-+
-+static void v4l2_setup_format(struct v4l2_format *format, unsigned int type,
-+                  unsigned int width, unsigned int height,
-+                  unsigned int pixelformat)
-+{
-+    unsigned int sizeimage;
-+
-+    memset(format, 0, sizeof(*format));
-+    format->type = type;
-+
-+    sizeimage = V4L2_TYPE_IS_OUTPUT(type) ? 4 * 1024 * 1024 : 0;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(type)) {
-+        format->fmt.pix_mp.width = width;
-+        format->fmt.pix_mp.height = height;
-+        format->fmt.pix_mp.plane_fmt[0].sizeimage = sizeimage;
-+        format->fmt.pix_mp.pixelformat = pixelformat;
-+    } else {
-+        format->fmt.pix.width = width;
-+        format->fmt.pix.height = height;
-+        format->fmt.pix.sizeimage = sizeimage;
-+        format->fmt.pix.pixelformat = pixelformat;
-+    }
-+}
-+
-+static int v4l2_set_format(int video_fd, unsigned int type, unsigned int pixelformat,
-+            unsigned int width, unsigned int height)
-+{
-+    struct v4l2_format format;
-+
-+    v4l2_setup_format(&format, type, width, height, pixelformat);
-+
-+    return ioctl(video_fd, VIDIOC_S_FMT, &format) ? -errno : 0;
-+}
-+
-+static int v4l2_query_capabilities(int video_fd, unsigned int *capabilities)
-+{
-+    struct v4l2_capability capability = { 0 };
-+    int rc;
-+
-+    rc = ioctl(video_fd, VIDIOC_QUERYCAP, &capability);
-+    if (rc < 0)
-+        return -errno;
-+
-+    if (capabilities != NULL) {
-+        if ((capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
-+            *capabilities = capability.device_caps;
-+        else
-+            *capabilities = capability.capabilities;
-+    }
-+
-+    return 0;
-+}
-+
-+static int devscan_add(struct devscan *const scan,
-+                       enum v4l2_buf_type src_type,
-+                       uint32_t src_fmt_v4l2,
-+                       const char * vname,
-+                       const char * mname)
-+{
-+    struct decdev *d;
-+
-+    if (scan->dev_size <= scan->dev_count) {
-+        unsigned int n = !scan->dev_size ? 4 : scan->dev_size * 2;
-+        d = realloc(scan->devs, n * sizeof(*d));
-+        if (!d)
-+            return -ENOMEM;
-+        scan->devs = d;
-+        scan->dev_size = n;
-+    }
-+
-+    d = scan->devs + scan->dev_count;
-+    d->src_type = src_type;
-+    d->src_fmt_v4l2 = src_fmt_v4l2;
-+    d->vname = strdup(vname);
-+    if (!d->vname)
-+        return -ENOMEM;
-+    d->mname = strdup(mname);
-+    if (!d->mname) {
-+        free((char *)d->vname);
-+        return -ENOMEM;
-+    }
-+    ++scan->dev_count;
-+    return 0;
-+}
-+
-+void devscan_delete(struct devscan **const pScan)
-+{
-+    unsigned int i;
-+    struct devscan * const scan = *pScan;
-+
-+    if (!scan)
-+        return;
-+    *pScan = NULL;
-+
-+    for (i = 0; i < scan->dev_count; ++i) {
-+        free((char*)scan->devs[i].mname);
-+        free((char*)scan->devs[i].vname);
-+    }
-+    free(scan->devs);
-+    free(scan);
-+}
-+
-+#define REQ_BUF_CAPS (\
-+    V4L2_BUF_CAP_SUPPORTS_DMABUF |\
-+    V4L2_BUF_CAP_SUPPORTS_REQUESTS |\
-+    V4L2_BUF_CAP_SUPPORTS_M2M_HOLD_CAPTURE_BUF)
-+
-+static void probe_formats(void * const dc,
-+              struct devscan *const scan,
-+              const int fd,
-+              const unsigned int type_v4l2,
-+              const char *const mpath,
-+              const char *const vpath)
-+{
-+    unsigned int i;
-+    for (i = 0;; ++i) {
-+        struct v4l2_fmtdesc fmtdesc = {
-+            .index = i,
-+            .type = type_v4l2
-+        };
-+        struct v4l2_requestbuffers rbufs = {
-+            .count = 0,
-+            .type = type_v4l2,
-+            .memory = V4L2_MEMORY_MMAP
-+        };
-+        while (ioctl(fd, VIDIOC_ENUM_FMT, &fmtdesc)) {
-+            if (errno == EINTR)
-+                continue;
-+            if (errno != EINVAL)
-+                request_err(dc, "Enum[%d] failed for type=%d\n", i, type_v4l2);
-+            return;
-+        }
-+        if (!video_src_pixfmt_supported(fmtdesc.pixelformat))
-+            continue;
-+
-+        if (v4l2_set_format(fd, type_v4l2, fmtdesc.pixelformat, 720, 480)) {
-+            request_debug(dc, "Set failed for type=%d, pf=%.4s\n", type_v4l2, (char*)&fmtdesc.pixelformat);
-+            continue;
-+        }
-+
-+        while (ioctl(fd, VIDIOC_REQBUFS, &rbufs)) {
-+            if (errno != EINTR) {
-+                request_debug(dc, "%s: Reqbufs failed\n", vpath);
-+                continue;
-+            }
-+        }
-+
-+        if ((rbufs.capabilities & REQ_BUF_CAPS) != REQ_BUF_CAPS) {
-+            request_debug(dc, "%s: Buf caps %#x insufficient\n", vpath, rbufs.capabilities);
-+            continue;
-+        }
-+
-+        request_debug(dc, "Adding: %s,%s pix=%#x, type=%d\n",
-+                 mpath, vpath, fmtdesc.pixelformat, type_v4l2);
-+        devscan_add(scan, type_v4l2, fmtdesc.pixelformat, vpath, mpath);
-+    }
-+}
-+
-+
-+static int probe_video_device(void * const dc,
-+                   struct udev_device *const device,
-+                   struct devscan *const scan,
-+                   const char *const mpath)
-+{
-+    int ret;
-+    unsigned int capabilities = 0;
-+    int video_fd = -1;
-+
-+    const char *path = udev_device_get_devnode(device);
-+    if (!path) {
-+        request_err(dc, "%s: get video device devnode failed\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    video_fd = open(path, O_RDWR, 0);
-+    if (video_fd == -1) {
-+        ret = -errno;
-+        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(errno), errno);
-+        goto fail;
-+    }
-+
-+    ret = v4l2_query_capabilities(video_fd, &capabilities);
-+    if (ret < 0) {
-+        request_err(dc, "%s: get video capability failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    request_debug(dc, "%s: path=%s capabilities=%#x\n", __func__, path, capabilities);
-+
-+    if (!(capabilities & V4L2_CAP_STREAMING)) {
-+        request_debug(dc, "%s: missing required streaming capability\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    if (!(capabilities & (V4L2_CAP_VIDEO_M2M_MPLANE | V4L2_CAP_VIDEO_M2M))) {
-+        request_debug(dc, "%s: missing required mem2mem capability\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    /* Should check capture formats too... */
-+    if ((capabilities & V4L2_CAP_VIDEO_M2M) != 0)
-+        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT, mpath, path);
-+    if ((capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) != 0)
-+        probe_formats(dc, scan, video_fd, V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, mpath, path);
-+
-+    close(video_fd);
-+    return 0;
-+
-+fail:
-+    if (video_fd >= 0)
-+        close(video_fd);
-+    return ret;
-+}
-+
-+static int probe_media_device(void * const dc,
-+                   struct udev_device *const device,
-+                   struct devscan *const scan)
-+{
-+    int ret;
-+    int rv;
-+    struct media_device_info device_info = { 0 };
-+    struct media_v2_topology topology = { 0 };
-+    struct media_v2_interface *interfaces = NULL;
-+    struct udev *udev = udev_device_get_udev(device);
-+    struct udev_device *video_device;
-+    dev_t devnum;
-+    int media_fd = -1;
-+
-+    const char *path = udev_device_get_devnode(device);
-+    if (!path) {
-+        request_err(dc, "%s: get media device devnode failed\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    media_fd = open(path, O_RDWR, 0);
-+    if (media_fd < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: opening %s failed, %s (%d)\n", __func__, path, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    rv = ioctl(media_fd, MEDIA_IOC_DEVICE_INFO, &device_info);
-+    if (rv < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: get media device info failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
-+    if (rv < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    if (topology.num_interfaces <= 0) {
-+        request_err(dc, "%s: media device has no interfaces\n", __func__);
-+        ret = -EINVAL;
-+        goto fail;
-+    }
-+
-+    interfaces = calloc(topology.num_interfaces, sizeof(*interfaces));
-+    if (!interfaces) {
-+        request_err(dc, "%s: allocating media interface struct failed\n", __func__);
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    topology.ptr_interfaces = (__u64)(uintptr_t)interfaces;
-+    rv = ioctl(media_fd, MEDIA_IOC_G_TOPOLOGY, &topology);
-+    if (rv < 0) {
-+        ret = -errno;
-+        request_err(dc, "%s: get media topology failed, %s (%d)\n", __func__, strerror(-ret), -ret);
-+        goto fail;
-+    }
-+
-+    for (int i = 0; i < topology.num_interfaces; i++) {
-+        if (interfaces[i].intf_type != MEDIA_INTF_T_V4L_VIDEO)
-+            continue;
-+
-+        devnum = makedev(interfaces[i].devnode.major, interfaces[i].devnode.minor);
-+        video_device = udev_device_new_from_devnum(udev, 'c', devnum);
-+        if (!video_device) {
-+            ret = -errno;
-+            request_err(dc, "%s: video_device[%d]=%p\n", __func__, i, video_device);
-+            continue;
-+        }
-+
-+        ret = probe_video_device(dc, video_device, scan, path);
-+        udev_device_unref(video_device);
-+
-+        if (ret != 0)
-+            goto fail;
-+    }
-+
-+fail:
-+    free(interfaces);
-+    if (media_fd != -1)
-+        close(media_fd);
-+    return ret;
-+}
-+
-+const char *decdev_media_path(const struct decdev *const dev)
-+{
-+    return !dev ? NULL : dev->mname;
-+}
-+
-+const char *decdev_video_path(const struct decdev *const dev)
-+{
-+    return !dev ? NULL : dev->vname;
-+}
-+
-+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev)
-+{
-+    return !dev ? 0 : dev->src_type;
-+}
-+
-+uint32_t decdev_src_pixelformat(const struct decdev *const dev)
-+{
-+    return !dev ? 0 : dev->src_fmt_v4l2;
-+}
-+
-+
-+const struct decdev *devscan_find(struct devscan *const scan,
-+                  const uint32_t src_fmt_v4l2)
-+{
-+    unsigned int i;
-+
-+    if (scan->env.mname && scan->env.vname)
-+        return &scan->env;
-+
-+    if (!src_fmt_v4l2)
-+        return scan->dev_count ? scan->devs + 0 : NULL;
-+
-+    for (i = 0; i != scan->dev_count; ++i) {
-+        if (scan->devs[i].src_fmt_v4l2 == src_fmt_v4l2)
-+            return scan->devs + i;
-+    }
-+    return NULL;
-+}
-+
-+int devscan_build(void * const dc, struct devscan **pscan)
-+{
-+    int ret;
-+    struct udev *udev;
-+    struct udev_enumerate *enumerate;
-+    struct udev_list_entry *devices;
-+    struct udev_list_entry *entry;
-+    struct udev_device *device;
-+    struct devscan * scan;
-+
-+    *pscan = NULL;
-+
-+    scan = calloc(1, sizeof(*scan));
-+    if (!scan) {
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    scan->env.mname = getenv("LIBVA_V4L2_REQUEST_MEDIA_PATH");
-+    scan->env.vname = getenv("LIBVA_V4L2_REQUEST_VIDEO_PATH");
-+    if (scan->env.mname && scan->env.vname) {
-+        request_info(dc, "Media/video device env overrides found: %s,%s\n",
-+                 scan->env.mname, scan->env.vname);
-+        *pscan = scan;
-+        return 0;
-+    }
-+
-+    udev = udev_new();
-+    if (!udev) {
-+        request_err(dc, "%s: allocating udev context failed\n", __func__);
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    enumerate = udev_enumerate_new(udev);
-+    if (!enumerate) {
-+        request_err(dc, "%s: allocating udev enumerator failed\n", __func__);
-+        ret = -ENOMEM;
-+        goto fail;
-+    }
-+
-+    udev_enumerate_add_match_subsystem(enumerate, "media");
-+    udev_enumerate_scan_devices(enumerate);
-+
-+    devices = udev_enumerate_get_list_entry(enumerate);
-+    udev_list_entry_foreach(entry, devices) {
-+        const char *path = udev_list_entry_get_name(entry);
-+        if (!path)
-+            continue;
-+
-+        device = udev_device_new_from_syspath(udev, path);
-+        if (!device)
-+            continue;
-+
-+        probe_media_device(dc, device, scan);
-+        udev_device_unref(device);
-+    }
-+
-+    udev_enumerate_unref(enumerate);
-+
-+    *pscan = scan;
-+    return 0;
-+
-+fail:
-+    udev_unref(udev);
-+    devscan_delete(&scan);
-+    return ret;
-+}
-+
-diff --git a/libavcodec/v4l2_req_devscan.h b/libavcodec/v4l2_req_devscan.h
-new file mode 100644
-index 0000000000..956d9234f1
---- /dev/null
-+++ b/libavcodec/v4l2_req_devscan.h
-@@ -0,0 +1,23 @@
-+#ifndef _DEVSCAN_H_
-+#define _DEVSCAN_H_
-+
-+#include <stdint.h>
-+
-+struct devscan;
-+struct decdev;
-+enum v4l2_buf_type;
-+
-+/* These return pointers to data in the devscan structure and so are vaild
-+ * for the lifetime of that
-+ */
-+const char *decdev_media_path(const struct decdev *const dev);
-+const char *decdev_video_path(const struct decdev *const dev);
-+enum v4l2_buf_type decdev_src_type(const struct decdev *const dev);
-+uint32_t decdev_src_pixelformat(const struct decdev *const dev);
-+
-+const struct decdev *devscan_find(struct devscan *const scan, const uint32_t src_fmt_v4l2);
-+
-+int devscan_build(void * const dc, struct devscan **pscan);
-+void devscan_delete(struct devscan **const pScan);
-+
-+#endif
-diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
-new file mode 100644
-index 0000000000..ae6c648369
---- /dev/null
-+++ b/libavcodec/v4l2_req_dmabufs.c
-@@ -0,0 +1,266 @@
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <unistd.h>
-+#include <inttypes.h>
-+#include <fcntl.h>
-+#include <errno.h>
-+#include <string.h>
-+#include <sys/ioctl.h>
-+#include <sys/mman.h>
-+#include <linux/mman.h>
-+#include <linux/dma-buf.h>
-+#include <linux/dma-heap.h>
-+
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_utils.h"
-+
-+#define DMABUF_NAME1  "/dev/dma_heap/linux,cma"
-+#define DMABUF_NAME2  "/dev/dma_heap/reserved"
-+
-+#define TRACE_ALLOC 0
-+
-+struct dmabufs_ctl {
-+    int fd;
-+    size_t page_size;
-+};
-+
-+struct dmabuf_h {
-+    int fd;
-+    size_t size;
-+    size_t len;
-+    void * mapptr;
-+};
-+
-+#if TRACE_ALLOC
-+static unsigned int total_bufs = 0;
-+static size_t total_size = 0;
-+#endif
-+
-+struct dmabuf_h * dmabuf_import(int fd, size_t size)
-+{
-+    struct dmabuf_h *dh;
-+
-+    fd = dup(fd);
-+    if (fd < 0  || size == 0)
-+        return NULL;
-+
-+    dh = malloc(sizeof(*dh));
-+    if (!dh) {
-+        close(fd);
-+        return NULL;
-+    }
-+
-+    *dh = (struct dmabuf_h) {
-+        .fd = fd,
-+        .size = size,
-+        .mapptr = MAP_FAILED
-+    };
-+
-+#if TRACE_ALLOC
-+    ++total_bufs;
-+    total_size += dh->size;
-+    request_log("%s: Import: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
-+#endif
-+
-+    return dh;
-+}
-+
-+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
-+{
-+    struct dmabuf_h * dh;
-+    struct dma_heap_allocation_data data = {
-+        .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
-+        .fd = 0,
-+        .fd_flags = O_RDWR,
-+        .heap_flags = 0
-+    };
-+
-+    if (old != NULL) {
-+        if (old->size == data.len) {
-+            return old;
-+        }
-+        dmabuf_free(old);
-+    }
-+
-+    if (size == 0 ||
-+        (dh = malloc(sizeof(*dh))) == NULL)
-+        return NULL;
-+
-+    while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
-+        int err = errno;
-+        request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
-+                (uint64_t)data.len,
-+                dbsc->fd,
-+                err,
-+                strerror(err));
-+        if (err == EINTR)
-+            continue;
-+        goto fail;
-+    }
-+
-+    *dh = (struct dmabuf_h){
-+        .fd = data.fd,
-+        .size = (size_t)data.len,
-+        .mapptr = MAP_FAILED
-+    };
-+
-+#if TRACE_ALLOC
-+    ++total_bufs;
-+    total_size += dh->size;
-+    request_log("%s: Alloc: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
-+#endif
-+
-+    return dh;
-+
-+fail:
-+    free(dh);
-+    return NULL;
-+}
-+
-+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
-+{
-+    struct dma_buf_sync sync = {
-+        .flags = flags
-+    };
-+    while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
-+        const int err = errno;
-+        if (errno == EINTR)
-+            continue;
-+        request_log("%s: ioctl failed: flags=%#x\n", __func__, flags);
-+        return -err;
-+    }
-+    return 0;
-+}
-+
-+int dmabuf_write_start(struct dmabuf_h * const dh)
-+{
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_WRITE);
-+}
-+
-+int dmabuf_write_end(struct dmabuf_h * const dh)
-+{
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_WRITE);
-+}
-+
-+int dmabuf_read_start(struct dmabuf_h * const dh)
-+{
-+    if (!dmabuf_map(dh))
-+        return -1;
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_START | DMA_BUF_SYNC_READ);
-+}
-+
-+int dmabuf_read_end(struct dmabuf_h * const dh)
-+{
-+    return dmabuf_sync(dh, DMA_BUF_SYNC_END | DMA_BUF_SYNC_READ);
-+}
-+
-+
-+void * dmabuf_map(struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return NULL;
-+    if (dh->mapptr != MAP_FAILED)
-+        return dh->mapptr;
-+    dh->mapptr = mmap(NULL, dh->size,
-+              PROT_READ | PROT_WRITE,
-+              MAP_SHARED | MAP_POPULATE,
-+              dh->fd, 0);
-+    if (dh->mapptr == MAP_FAILED) {
-+        request_log("%s: Map failed\n", __func__);
-+        return NULL;
-+    }
-+    return dh->mapptr;
-+}
-+
-+int dmabuf_fd(const struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return -1;
-+    return dh->fd;
-+}
-+
-+size_t dmabuf_size(const struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return 0;
-+    return dh->size;
-+}
-+
-+size_t dmabuf_len(const struct dmabuf_h * const dh)
-+{
-+    if (!dh)
-+        return 0;
-+    return dh->len;
-+}
-+
-+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
-+{
-+    dh->len = len;
-+}
-+
-+
-+
-+void dmabuf_free(struct dmabuf_h * dh)
-+{
-+    if (!dh)
-+        return;
-+
-+#if TRACE_ALLOC
-+    --total_bufs;
-+    total_size -= dh->size;
-+    request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
-+#endif
-+
-+    if (dh->mapptr != MAP_FAILED)
-+        munmap(dh->mapptr, dh->size);
-+    while (close(dh->fd) == -1 && errno == EINTR)
-+        /* loop */;
-+    free(dh);
-+}
-+
-+struct dmabufs_ctl * dmabufs_ctl_new(void)
-+{
-+    struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
-+
-+    if (!dbsc)
-+        return NULL;
-+
-+    while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
-+           errno == EINTR)
-+        /* Loop */;
-+
-+    if (dbsc->fd == -1) {
-+        while ((dbsc->fd = open(DMABUF_NAME2, O_RDWR)) == -1 &&
-+               errno == EINTR)
-+            /* Loop */;
-+        if (dbsc->fd == -1) {
-+            request_log("Unable to open either %s or %s\n",
-+                    DMABUF_NAME1, DMABUF_NAME2);
-+            goto fail;
-+        }
-+    }
-+
-+    dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
-+
-+    return dbsc;
-+
-+fail:
-+    free(dbsc);
-+    return NULL;
-+}
-+
-+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
-+{
-+    struct dmabufs_ctl * const dbsc = *pDbsc;
-+
-+    if (!dbsc)
-+        return;
-+    *pDbsc = NULL;
-+
-+    while (close(dbsc->fd) == -1 && errno == EINTR)
-+        /* loop */;
-+
-+    free(dbsc);
-+}
-+
-+
-diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
-new file mode 100644
-index 0000000000..cfb17e801d
---- /dev/null
-+++ b/libavcodec/v4l2_req_dmabufs.h
-@@ -0,0 +1,40 @@
-+#ifndef DMABUFS_H
-+#define DMABUFS_H
-+
-+#include <stddef.h>
-+
-+struct dmabufs_ctl;
-+struct dmabuf_h;
-+
-+struct dmabufs_ctl * dmabufs_ctl_new(void);
-+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
-+
-+// Need not preserve old contents
-+// On NULL return old buffer is freed
-+struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h *, size_t size);
-+
-+static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t size) {
-+    return dmabuf_realloc(dbsc, NULL, size);
-+}
-+/* Create from existing fd - dups(fd) */
-+struct dmabuf_h * dmabuf_import(int fd, size_t size);
-+void * dmabuf_map(struct dmabuf_h * const dh);
-+
-+/* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */
-+int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags);
-+
-+int dmabuf_write_start(struct dmabuf_h * const dh);
-+int dmabuf_write_end(struct dmabuf_h * const dh);
-+int dmabuf_read_start(struct dmabuf_h * const dh);
-+int dmabuf_read_end(struct dmabuf_h * const dh);
-+
-+int dmabuf_fd(const struct dmabuf_h * const dh);
-+/* Allocated size */
-+size_t dmabuf_size(const struct dmabuf_h * const dh);
-+/* Bytes in use */
-+size_t dmabuf_len(const struct dmabuf_h * const dh);
-+/* Set bytes in use */
-+void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len);
-+void dmabuf_free(struct dmabuf_h * dh);
-+
-+#endif
-diff --git a/libavcodec/v4l2_req_hevc_v1.c b/libavcodec/v4l2_req_hevc_v1.c
-new file mode 100644
-index 0000000000..169b532832
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_v1.c
-@@ -0,0 +1,3 @@
-+#define HEVC_CTRLS_VERSION 1
-+#include "v4l2_req_hevc_vx.c"
-+
-diff --git a/libavcodec/v4l2_req_hevc_v2.c b/libavcodec/v4l2_req_hevc_v2.c
-new file mode 100644
-index 0000000000..42af98e156
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_v2.c
-@@ -0,0 +1,3 @@
-+#define HEVC_CTRLS_VERSION 2
-+#include "v4l2_req_hevc_vx.c"
-+
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-new file mode 100644
-index 0000000000..0ae03b10c4
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -0,0 +1,1213 @@
-+// File included by v4l2_req_hevc_v* - not compiled on its own
-+
-+#include "decode.h"
-+#include "hevcdec.h"
-+#include "hwconfig.h"
-+#include "internal.h"
-+#include "thread.h"
-+
-+#include "v4l2_request_hevc.h"
-+
-+#if HEVC_CTRLS_VERSION == 1
-+#include "hevc-ctrls-v1.h"
-+
-+// Fixup renamed entries
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT
-+
-+#elif HEVC_CTRLS_VERSION == 2
-+#include "hevc-ctrls-v2.h"
-+#else
-+#error Unknown HEVC_CTRLS_VERSION
-+#endif
-+
-+#include "libavutil/hwcontext_drm.h"
-+
-+#include <semaphore.h>
-+#include <pthread.h>
-+
-+#include "v4l2_req_devscan.h"
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_media.h"
-+#include "v4l2_req_utils.h"
-+
-+// Attached to buf[0] in frame
-+// Pooled in hwcontext so generally create once - 1/frame
-+typedef struct V4L2MediaReqDescriptor {
-+    AVDRMFrameDescriptor drm;
-+
-+    // Media
-+    uint64_t timestamp;
-+    struct qent_dst * qe_dst;
-+
-+    // Decode only - should be NULL by the time we emit the frame
-+    struct req_decode_ent decode_ent;
-+
-+    struct media_request *req;
-+    struct qent_src *qe_src;
-+
-+#if HEVC_CTRLS_VERSION >= 2
-+    struct v4l2_ctrl_hevc_decode_params dec;
-+#endif
-+
-+    size_t num_slices;
-+    size_t alloced_slices;
-+    struct v4l2_ctrl_hevc_slice_params * slice_params;
-+    struct slice_info * slices;
-+
-+} V4L2MediaReqDescriptor;
-+
-+struct slice_info {
-+    const uint8_t * ptr;
-+    size_t len; // bytes
-+};
-+
-+// Handy container for accumulating controls before setting
-+struct req_controls {
-+    int has_scaling;
-+    struct timeval tv;
-+    struct v4l2_ctrl_hevc_sps sps;
-+    struct v4l2_ctrl_hevc_pps pps;
-+    struct v4l2_ctrl_hevc_scaling_matrix scaling_matrix;
-+};
-+
-+//static uint8_t nalu_slice_start_code[] = { 0x00, 0x00, 0x01 };
-+
-+
-+// Get an FFmpeg format from the v4l2 format
-+static enum AVPixelFormat pixel_format_from_format(const struct v4l2_format *const format)
-+{
-+    switch (V4L2_TYPE_IS_MULTIPLANAR(format->type) ?
-+            format->fmt.pix_mp.pixelformat : format->fmt.pix.pixelformat) {
-+    case V4L2_PIX_FMT_YUV420:
-+        return AV_PIX_FMT_YUV420P;
-+    case V4L2_PIX_FMT_NV12:
-+        return AV_PIX_FMT_NV12;
-+#if CONFIG_SAND
-+    case V4L2_PIX_FMT_NV12_COL128:
-+        return AV_PIX_FMT_RPI4_8;
-+    case V4L2_PIX_FMT_NV12_10_COL128:
-+        return AV_PIX_FMT_RPI4_10;
-+#endif
-+    default:
-+        break;
-+    }
-+    return AV_PIX_FMT_NONE;
-+}
-+
-+static inline uint64_t frame_capture_dpb(const AVFrame * const frame)
-+{
-+    const V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
-+    return rd->timestamp;
-+}
-+
-+static inline void frame_set_capture_dpb(AVFrame * const frame, const uint64_t dpb_stamp)
-+{
-+    V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)frame->data[0];
-+    rd->timestamp = dpb_stamp;
-+}
-+
-+static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_table *table)
-+{
-+    int32_t luma_weight_denom, chroma_weight_denom;
-+    const SliceHeader *sh = &h->sh;
-+
-+    if (sh->slice_type == HEVC_SLICE_I ||
-+        (sh->slice_type == HEVC_SLICE_P && !h->ps.pps->weighted_pred_flag) ||
-+        (sh->slice_type == HEVC_SLICE_B && !h->ps.pps->weighted_bipred_flag))
-+        return;
-+
-+    table->luma_log2_weight_denom = sh->luma_log2_weight_denom;
-+
-+    if (h->ps.sps->chroma_format_idc)
-+        table->delta_chroma_log2_weight_denom = sh->chroma_log2_weight_denom - sh->luma_log2_weight_denom;
-+
-+    luma_weight_denom = (1 << sh->luma_log2_weight_denom);
-+    chroma_weight_denom = (1 << sh->chroma_log2_weight_denom);
-+
-+    for (int i = 0; i < 15 && i < sh->nb_refs[L0]; i++) {
-+        table->delta_luma_weight_l0[i] = sh->luma_weight_l0[i] - luma_weight_denom;
-+        table->luma_offset_l0[i] = sh->luma_offset_l0[i];
-+        table->delta_chroma_weight_l0[i][0] = sh->chroma_weight_l0[i][0] - chroma_weight_denom;
-+        table->delta_chroma_weight_l0[i][1] = sh->chroma_weight_l0[i][1] - chroma_weight_denom;
-+        table->chroma_offset_l0[i][0] = sh->chroma_offset_l0[i][0];
-+        table->chroma_offset_l0[i][1] = sh->chroma_offset_l0[i][1];
-+    }
-+
-+    if (sh->slice_type != HEVC_SLICE_B)
-+        return;
-+
-+    for (int i = 0; i < 15 && i < sh->nb_refs[L1]; i++) {
-+        table->delta_luma_weight_l1[i] = sh->luma_weight_l1[i] - luma_weight_denom;
-+        table->luma_offset_l1[i] = sh->luma_offset_l1[i];
-+        table->delta_chroma_weight_l1[i][0] = sh->chroma_weight_l1[i][0] - chroma_weight_denom;
-+        table->delta_chroma_weight_l1[i][1] = sh->chroma_weight_l1[i][1] - chroma_weight_denom;
-+        table->chroma_offset_l1[i][0] = sh->chroma_offset_l1[i][0];
-+        table->chroma_offset_l1[i][1] = sh->chroma_offset_l1[i][1];
-+    }
-+}
-+
-+static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
-+{
-+    const HEVCFrame *frame;
-+    int i;
-+
-+    for (i = 0; i < h->rps[ST_CURR_BEF].nb_refs; i++) {
-+        frame = h->rps[ST_CURR_BEF].ref[i];
-+        if (frame && timestamp == frame_capture_dpb(frame->frame))
-+            return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_BEFORE;
-+    }
-+
-+    for (i = 0; i < h->rps[ST_CURR_AFT].nb_refs; i++) {
-+        frame = h->rps[ST_CURR_AFT].ref[i];
-+        if (frame && timestamp == frame_capture_dpb(frame->frame))
-+            return V4L2_HEVC_DPB_ENTRY_RPS_ST_CURR_AFTER;
-+    }
-+
-+    for (i = 0; i < h->rps[LT_CURR].nb_refs; i++) {
-+        frame = h->rps[LT_CURR].ref[i];
-+        if (frame && timestamp == frame_capture_dpb(frame->frame))
-+            return V4L2_HEVC_DPB_ENTRY_RPS_LT_CURR;
-+    }
-+
-+    return 0;
-+}
-+
-+static unsigned int
-+get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
-+                  const struct v4l2_hevc_dpb_entry * const entries,
-+                  const unsigned int num_entries)
-+{
-+    uint64_t timestamp;
-+
-+    if (!frame)
-+        return 0;
-+
-+    timestamp = frame_capture_dpb(frame->frame);
-+
-+    for (unsigned int i = 0; i < num_entries; i++) {
-+        if (entries[i].timestamp == timestamp)
-+            return i;
-+    }
-+
-+    return 0;
-+}
-+
-+static const uint8_t * ptr_from_index(const uint8_t * b, unsigned int idx)
-+{
-+    unsigned int z = 0;
-+    while (idx--) {
-+        if (*b++ == 0) {
-+            ++z;
-+            if (z >= 2 && *b == 3) {
-+                ++b;
-+                z = 0;
-+            }
-+        }
-+        else {
-+            z = 0;
-+        }
-+    }
-+    return b;
-+}
-+
-+static int slice_add(V4L2MediaReqDescriptor * const rd)
-+{
-+    if (rd->num_slices >= rd->alloced_slices) {
-+        struct v4l2_ctrl_hevc_slice_params * p2;
-+        struct slice_info * s2;
-+        size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2;
-+
-+        p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
-+        if (p2 == NULL)
-+            return AVERROR(ENOMEM);
-+        rd->slice_params = p2;
-+
-+        s2 = av_realloc_array(rd->slices, n2, sizeof(*s2));
-+        if (s2 == NULL)
-+            return AVERROR(ENOMEM);
-+        rd->slices = s2;
-+
-+        rd->alloced_slices = n2;
-+    }
-+    ++rd->num_slices;
-+    return 0;
-+}
-+
-+static unsigned int
-+fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
-+{
-+    unsigned int i;
-+    unsigned int n = 0;
-+    const HEVCFrame * const pic = h->ref;
-+
-+    for (i = 0; i < FF_ARRAY_ELEMS(h->DPB); i++) {
-+        const HEVCFrame * const frame = &h->DPB[i];
-+        if (frame != pic && (frame->flags & (HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF))) {
-+            struct v4l2_hevc_dpb_entry * const entry = entries + n++;
-+
-+            entry->timestamp = frame_capture_dpb(frame->frame);
-+            entry->rps = find_frame_rps_type(h, entry->timestamp);
-+            entry->field_pic = frame->frame->interlaced_frame;
-+
-+            /* TODO: Interleaved: Get the POC for each field. */
-+            entry->pic_order_cnt[0] = frame->poc;
-+            entry->pic_order_cnt[1] = frame->poc;
-+        }
-+    }
-+    return n;
-+}
-+
-+static void fill_slice_params(const HEVCContext * const h,
-+#if HEVC_CTRLS_VERSION >= 2
-+                              const struct v4l2_ctrl_hevc_decode_params * const dec,
-+#endif
-+                              struct v4l2_ctrl_hevc_slice_params *slice_params,
-+                              uint32_t bit_size, uint32_t bit_offset)
-+{
-+    const SliceHeader * const sh = &h->sh;
-+#if HEVC_CTRLS_VERSION >= 2
-+    const struct v4l2_hevc_dpb_entry *const dpb = dec->dpb;
-+    const unsigned int dpb_n = dec->num_active_dpb_entries;
-+#else
-+    struct v4l2_hevc_dpb_entry *const dpb = slice_params->dpb;
-+    unsigned int dpb_n;
-+#endif
-+    unsigned int i;
-+    RefPicList *rpl;
-+
-+    *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
-+        .bit_size = bit_size,
-+        .data_bit_offset = bit_offset,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+        .slice_segment_addr = sh->slice_segment_addr,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+        .nal_unit_type = h->nal_unit_type,
-+        .nuh_temporal_id_plus1 = h->temporal_id + 1,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+        .slice_type = sh->slice_type,
-+        .colour_plane_id = sh->colour_plane_id,
-+        .slice_pic_order_cnt = h->ref->poc,
-+        .num_ref_idx_l0_active_minus1 = sh->nb_refs[L0] ? sh->nb_refs[L0] - 1 : 0,
-+        .num_ref_idx_l1_active_minus1 = sh->nb_refs[L1] ? sh->nb_refs[L1] - 1 : 0,
-+        .collocated_ref_idx = sh->slice_temporal_mvp_enabled_flag ? sh->collocated_ref_idx : 0,
-+        .five_minus_max_num_merge_cand = sh->slice_type == HEVC_SLICE_I ? 0 : 5 - sh->max_num_merge_cand,
-+        .slice_qp_delta = sh->slice_qp_delta,
-+        .slice_cb_qp_offset = sh->slice_cb_qp_offset,
-+        .slice_cr_qp_offset = sh->slice_cr_qp_offset,
-+        .slice_act_y_qp_offset = 0,
-+        .slice_act_cb_qp_offset = 0,
-+        .slice_act_cr_qp_offset = 0,
-+        .slice_beta_offset_div2 = sh->beta_offset / 2,
-+        .slice_tc_offset_div2 = sh->tc_offset / 2,
-+
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+        .pic_struct = h->sei.picture_timing.picture_struct,
-+
-+#if HEVC_CTRLS_VERSION < 2
-+        /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+        .num_rps_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
-+        .num_rps_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
-+        .num_rps_poc_lt_curr = h->rps[LT_CURR].nb_refs,
-+#endif
-+    };
-+
-+    if (sh->slice_sample_adaptive_offset_flag[0])
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA;
-+
-+    if (sh->slice_sample_adaptive_offset_flag[1])
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA;
-+
-+    if (sh->slice_temporal_mvp_enabled_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED;
-+
-+    if (sh->mvd_l1_zero_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO;
-+
-+    if (sh->cabac_init_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT;
-+
-+    if (sh->collocated_list == L0)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0;
-+
-+    if (sh->disable_deblocking_filter_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED;
-+
-+    if (sh->slice_loop_filter_across_slices_enabled_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED;
-+
-+    if (sh->dependent_slice_segment_flag)
-+        slice_params->flags |= V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT;
-+
-+#if HEVC_CTRLS_VERSION < 2
-+    dpb_n = fill_dpb_entries(h, dpb);
-+    slice_params->num_active_dpb_entries = dpb_n;
-+#endif
-+
-+    if (sh->slice_type != HEVC_SLICE_I) {
-+        rpl = &h->ref->refPicList[0];
-+        for (i = 0; i < rpl->nb_refs; i++)
-+            slice_params->ref_idx_l0[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
-+    }
-+
-+    if (sh->slice_type == HEVC_SLICE_B) {
-+        rpl = &h->ref->refPicList[1];
-+        for (i = 0; i < rpl->nb_refs; i++)
-+            slice_params->ref_idx_l1[i] = get_ref_pic_index(h, rpl->ref[i], dpb, dpb_n);
-+    }
-+
-+    fill_pred_table(h, &slice_params->pred_weight_table);
-+
-+    slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
-+    if (slice_params->num_entry_point_offsets > 256) {
-+        slice_params->num_entry_point_offsets = 256;
-+        av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
-+    }
-+
-+    for (i = 0; i < slice_params->num_entry_point_offsets; i++)
-+        slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
-+}
-+
-+#if HEVC_CTRLS_VERSION >= 2
-+static void
-+fill_decode_params(const HEVCContext * const h,
-+                   struct v4l2_ctrl_hevc_decode_params * const dec)
-+{
-+    unsigned int i;
-+
-+    *dec = (struct v4l2_ctrl_hevc_decode_params){
-+        .pic_order_cnt_val = h->poc,
-+        .num_poc_st_curr_before = h->rps[ST_CURR_BEF].nb_refs,
-+        .num_poc_st_curr_after = h->rps[ST_CURR_AFT].nb_refs,
-+        .num_poc_lt_curr = h->rps[LT_CURR].nb_refs,
-+    };
-+
-+    dec->num_active_dpb_entries = fill_dpb_entries(h, dec->dpb);
-+
-+    // The docn does seem to ask that we fit our 32 bit signed POC into
-+    // a U8 so... (To be fair 16 bits would be enough)
-+    // Luckily we (Pi) don't use these fields
-+    for (i = 0; i != h->rps[ST_CURR_BEF].nb_refs; ++i)
-+        dec->poc_st_curr_before[i] = h->rps[ST_CURR_BEF].ref[i]->poc;
-+    for (i = 0; i != h->rps[ST_CURR_AFT].nb_refs; ++i)
-+        dec->poc_st_curr_after[i] = h->rps[ST_CURR_AFT].ref[i]->poc;
-+    for (i = 0; i != h->rps[LT_CURR].nb_refs; ++i)
-+        dec->poc_lt_curr[i] = h->rps[LT_CURR].ref[i]->poc;
-+
-+    if (IS_IRAP(h))
-+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC;
-+    if (IS_IDR(h))
-+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC;
-+    if (h->sh.no_output_of_prior_pics_flag)
-+        dec->flags |= V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR;
-+
-+}
-+#endif
-+
-+static void fill_sps(struct v4l2_ctrl_hevc_sps *ctrl, const HEVCSPS *sps)
-+{
-+    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+    *ctrl = (struct v4l2_ctrl_hevc_sps) {
-+        .chroma_format_idc = sps->chroma_format_idc,
-+        .pic_width_in_luma_samples = sps->width,
-+        .pic_height_in_luma_samples = sps->height,
-+        .bit_depth_luma_minus8 = sps->bit_depth - 8,
-+        .bit_depth_chroma_minus8 = sps->bit_depth - 8,
-+        .log2_max_pic_order_cnt_lsb_minus4 = sps->log2_max_poc_lsb - 4,
-+        .sps_max_dec_pic_buffering_minus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering - 1,
-+        .sps_max_num_reorder_pics = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics,
-+        .sps_max_latency_increase_plus1 = sps->temporal_layer[sps->max_sub_layers - 1].max_latency_increase + 1,
-+        .log2_min_luma_coding_block_size_minus3 = sps->log2_min_cb_size - 3,
-+        .log2_diff_max_min_luma_coding_block_size = sps->log2_diff_max_min_coding_block_size,
-+        .log2_min_luma_transform_block_size_minus2 = sps->log2_min_tb_size - 2,
-+        .log2_diff_max_min_luma_transform_block_size = sps->log2_max_trafo_size - sps->log2_min_tb_size,
-+        .max_transform_hierarchy_depth_inter = sps->max_transform_hierarchy_depth_inter,
-+        .max_transform_hierarchy_depth_intra = sps->max_transform_hierarchy_depth_intra,
-+        .pcm_sample_bit_depth_luma_minus1 = sps->pcm.bit_depth - 1,
-+        .pcm_sample_bit_depth_chroma_minus1 = sps->pcm.bit_depth_chroma - 1,
-+        .log2_min_pcm_luma_coding_block_size_minus3 = sps->pcm.log2_min_pcm_cb_size - 3,
-+        .log2_diff_max_min_pcm_luma_coding_block_size = sps->pcm.log2_max_pcm_cb_size - sps->pcm.log2_min_pcm_cb_size,
-+        .num_short_term_ref_pic_sets = sps->nb_st_rps,
-+        .num_long_term_ref_pics_sps = sps->num_long_term_ref_pics_sps,
-+        .chroma_format_idc = sps->chroma_format_idc,
-+        .sps_max_sub_layers_minus1 = sps->max_sub_layers - 1,
-+    };
-+
-+    if (sps->separate_colour_plane_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE;
-+
-+    if (sps->scaling_list_enable_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED;
-+
-+    if (sps->amp_enabled_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_AMP_ENABLED;
-+
-+    if (sps->sao_enabled)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET;
-+
-+    if (sps->pcm_enabled_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_ENABLED;
-+
-+    if (sps->pcm.loop_filter_disable_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED;
-+
-+    if (sps->long_term_ref_pics_present_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT;
-+
-+    if (sps->sps_temporal_mvp_enabled_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED;
-+
-+    if (sps->sps_strong_intra_smoothing_enable_flag)
-+        ctrl->flags |= V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED;
-+}
-+
-+static void fill_scaling_matrix(const ScalingList * const sl,
-+                                struct v4l2_ctrl_hevc_scaling_matrix * const sm)
-+{
-+    unsigned int i;
-+
-+    for (i = 0; i < 6; i++) {
-+        unsigned int j;
-+
-+        for (j = 0; j < 16; j++)
-+            sm->scaling_list_4x4[i][j] = sl->sl[0][i][j];
-+        for (j = 0; j < 64; j++) {
-+            sm->scaling_list_8x8[i][j]   = sl->sl[1][i][j];
-+            sm->scaling_list_16x16[i][j] = sl->sl[2][i][j];
-+            if (i < 2)
-+                sm->scaling_list_32x32[i][j] = sl->sl[3][i * 3][j];
-+        }
-+        sm->scaling_list_dc_coef_16x16[i] = sl->sl_dc[0][i];
-+        if (i < 2)
-+            sm->scaling_list_dc_coef_32x32[i] = sl->sl_dc[1][i * 3];
-+    }
-+}
-+
-+static void fill_pps(struct v4l2_ctrl_hevc_pps * const ctrl, const HEVCPPS * const pps)
-+{
-+    uint64_t flags = 0;
-+
-+    if (pps->dependent_slice_segments_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED;
-+
-+    if (pps->output_flag_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT;
-+
-+    if (pps->sign_data_hiding_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED;
-+
-+    if (pps->cabac_init_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT;
-+
-+    if (pps->constrained_intra_pred_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED;
-+
-+    if (pps->transform_skip_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED;
-+
-+    if (pps->cu_qp_delta_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED;
-+
-+    if (pps->pic_slice_level_chroma_qp_offsets_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT;
-+
-+    if (pps->weighted_pred_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED;
-+
-+    if (pps->weighted_bipred_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED;
-+
-+    if (pps->transquant_bypass_enable_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED;
-+
-+    if (pps->tiles_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_TILES_ENABLED;
-+
-+    if (pps->entropy_coding_sync_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED;
-+
-+    if (pps->loop_filter_across_tiles_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED;
-+
-+    if (pps->seq_loop_filter_across_slices_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED;
-+
-+    if (pps->deblocking_filter_override_enabled_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED;
-+
-+    if (pps->disable_dbf)
-+        flags |= V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER;
-+
-+    if (pps->lists_modification_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT;
-+
-+    if (pps->slice_header_extension_present_flag)
-+        flags |= V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT;
-+
-+    /* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+    *ctrl = (struct v4l2_ctrl_hevc_pps) {
-+        .num_extra_slice_header_bits = pps->num_extra_slice_header_bits,
-+        .init_qp_minus26 = pps->pic_init_qp_minus26,
-+        .diff_cu_qp_delta_depth = pps->diff_cu_qp_delta_depth,
-+        .pps_cb_qp_offset = pps->cb_qp_offset,
-+        .pps_cr_qp_offset = pps->cr_qp_offset,
-+        .pps_beta_offset_div2 = pps->beta_offset / 2,
-+        .pps_tc_offset_div2 = pps->tc_offset / 2,
-+        .log2_parallel_merge_level_minus2 = pps->log2_parallel_merge_level - 2,
-+        .flags = flags
-+    };
-+
-+
-+    if (pps->tiles_enabled_flag) {
-+        ctrl->num_tile_columns_minus1 = pps->num_tile_columns - 1;
-+        ctrl->num_tile_rows_minus1 = pps->num_tile_rows - 1;
-+
-+        for (int i = 0; i < pps->num_tile_columns; i++)
-+            ctrl->column_width_minus1[i] = pps->column_width[i] - 1;
-+
-+        for (int i = 0; i < pps->num_tile_rows; i++)
-+            ctrl->row_height_minus1[i] = pps->row_height[i] - 1;
-+    }
-+}
-+
-+/*
-+ * Post-process hook run just before the frame is finally returned to the user.
-+ * The corrupt flag is set here because this is the frame structure that is
-+ * actually going to the user (in MT land each thread has its own pool).
-+ * Always returns 0.
-+ */
-+static int frame_post_process(void *logctx, AVFrame *frame)
-+{
-+    V4L2MediaReqDescriptor *const desc = (V4L2MediaReqDescriptor *)frame->data[0];
-+
-+    /* Start clean; only a failed wait below marks the frame as corrupt */
-+    frame->flags &= ~AV_FRAME_FLAG_CORRUPT;
-+
-+    /* No destination buffer attached: nothing to wait for */
-+    if (!desc->qe_dst)
-+        return 0;
-+
-+    if (qent_dst_wait(desc->qe_dst) != MEDIABUFS_STATUS_SUCCESS) {
-+        av_log(logctx, AV_LOG_ERROR, "%s: Decode fail\n", __func__);
-+        frame->flags |= AV_FRAME_FLAG_CORRUPT;
-+    }
-+
-+    return 0;
-+}
-+
-+/*
-+ * Convert a DPB timestamp into a struct timeval.  The value is divided by
-+ * 1000 first and the quotient is then split into seconds and microseconds.
-+ */
-+static inline struct timeval cvt_dpb_to_tv(uint64_t t)
-+{
-+    const uint64_t usecs = t / 1000;
-+    struct timeval tv;
-+
-+    tv.tv_sec  = usecs / 1000000;
-+    tv.tv_usec = usecs % 1000000;
-+    return tv;
-+}
-+
-+/* Widen a frame counter to 64 bits and scale it by 1000 to form a DPB timestamp. */
-+static inline uint64_t cvt_timestamp_to_dpb(const unsigned int t)
-+{
-+    const uint64_t wide = t;
-+
-+    return wide * 1000;
-+}
-+
-+/*
-+ * hwaccel start_frame: put the frame on the decode queue, reset its slice
-+ * count, give it the next timestamp, install the post-process hook and make
-+ * sure a destination buffer is attached.  buffer and size are unused.
-+ * Returns 0 on success or AVERROR(ENOMEM) if no dst buffer could be obtained.
-+ */
-+static int v4l2_request_hevc_start_frame(AVCodecContext *avctx,
-+                                         av_unused const uint8_t *buffer,
-+                                         av_unused uint32_t size)
-+{
-+    const HEVCContext *h = avctx->priv_data;
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    V4L2MediaReqDescriptor *const rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
-+    FrameDecodeData *fdd;
-+
-+    decode_q_add(&ctx->decode_q, &rd->decode_ent);
-+
-+    /* New frame: no slices collected yet, next timestamp in sequence */
-+    rd->num_slices = 0;
-+    rd->timestamp = cvt_timestamp_to_dpb(++ctx->timestamp);
-+
-+    fdd = (FrameDecodeData*)h->ref->frame->private_ref->data;
-+    fdd->post_process = frame_post_process;
-+
-+    // qe_dst needs to be bound to the data buffer and only returned when that is
-+    if (rd->qe_dst == NULL) {
-+        rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs);
-+        if (rd->qe_dst == NULL) {
-+            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
-+            return AVERROR(ENOMEM);
-+        }
-+    }
-+
-+    // Allow next thread to enter rpi_hevc_start_frame
-+    ff_thread_finish_setup(avctx);
-+
-+    return 0;
-+}
-+
-+// Build a DRM PRIME frame descriptor (one object, one layer, two planes)
-+// from a V4L2 format description.
-+// Returns 0 on success, or -1 if the V4L2 pixel format has no mapping here;
-+// in the -1 case desc has not been written to.
-+// Object fd & size will be zapped by this & need setting later
-+static int drm_from_format(AVDRMFrameDescriptor * const desc, const struct v4l2_format * const format)
-+{
-+    AVDRMLayerDescriptor *layer = &desc->layers[0];
-+    unsigned int width;
-+    unsigned int height;
-+    unsigned int bpl;
-+    uint32_t pixelformat;
-+
-+    // Geometry lives in a different union member for multiplanar buffer types
-+    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
-+        width       = format->fmt.pix_mp.width;
-+        height      = format->fmt.pix_mp.height;
-+        pixelformat = format->fmt.pix_mp.pixelformat;
-+        bpl         = format->fmt.pix_mp.plane_fmt[0].bytesperline;
-+    }
-+    else {
-+        width       = format->fmt.pix.width;
-+        height      = format->fmt.pix.height;
-+        pixelformat = format->fmt.pix.pixelformat;
-+        bpl         = format->fmt.pix.bytesperline;
-+    }
-+
-+    // Map the V4L2 fourcc to a DRM fourcc + modifier; several mappings are
-+    // only compiled in when the relevant definitions are available
-+    switch (pixelformat) {
-+    case V4L2_PIX_FMT_NV12:
-+        layer->format = DRM_FORMAT_NV12;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#if CONFIG_SAND
-+    case V4L2_PIX_FMT_NV12_COL128:
-+        layer->format = DRM_FORMAT_NV12;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
-+        break;
-+    case V4L2_PIX_FMT_NV12_10_COL128:
-+        layer->format = DRM_FORMAT_P030;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl);
-+        break;
-+#endif
-+#ifdef DRM_FORMAT_MOD_ALLWINNER_TILED
-+    case V4L2_PIX_FMT_SUNXI_TILED_NV12:
-+        layer->format = DRM_FORMAT_NV12;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_ALLWINNER_TILED;
-+        break;
-+#endif
-+#if defined(V4L2_PIX_FMT_NV15) && defined(DRM_FORMAT_NV15)
-+    case V4L2_PIX_FMT_NV15:
-+        layer->format = DRM_FORMAT_NV15;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#endif
-+    case V4L2_PIX_FMT_NV16:
-+        layer->format = DRM_FORMAT_NV16;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#if defined(V4L2_PIX_FMT_NV20) && defined(DRM_FORMAT_NV20)
-+    case V4L2_PIX_FMT_NV20:
-+        layer->format = DRM_FORMAT_NV20;
-+        desc->objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        break;
-+#endif
-+    default:
-+        return -1;
-+    }
-+
-+    // Single backing object; fd/size are placeholders for the caller to fill
-+    desc->nb_objects = 1;
-+    desc->objects[0].fd = -1;
-+    desc->objects[0].size = 0;
-+
-+    desc->nb_layers = 1;
-+    layer->nb_planes = 2;
-+
-+    layer->planes[0].object_index = 0;
-+    layer->planes[0].offset = 0;
-+    layer->planes[0].pitch = bpl;
-+#if CONFIG_SAND
-+    // For the COL128 formats bpl was consumed by the modifier above, so the
-+    // plane pitches are derived from width and the second plane starts at
-+    // height * 128
-+    if (pixelformat == V4L2_PIX_FMT_NV12_COL128) {
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = height * 128;
-+        layer->planes[0].pitch = width;
-+        layer->planes[1].pitch = width;
-+    }
-+    else if (pixelformat == V4L2_PIX_FMT_NV12_10_COL128) {
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = height * 128;
-+        layer->planes[0].pitch = width * 2; // Lies but it keeps DRM import happy
-+        layer->planes[1].pitch = width * 2;
-+    }
-+    else
-+#endif
-+    {
-+        // All other formats: second plane follows the first in the same
-+        // object and shares its pitch
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = layer->planes[0].pitch * height;
-+        layer->planes[1].pitch = layer->planes[0].pitch;
-+    }
-+
-+    return 0;
-+}
-+
-+/*
-+ * Attach the per-request HEVC controls (SPS, PPS, decode params when
-+ * HEVC_CTRLS_VERSION >= 2, slice params and optionally the scaling matrix)
-+ * to mreq.  slice_count entries of slices[], starting at slice_no, are sent.
-+ * Returns whatever mediabufs_ctl_set_ext_ctrls() returns.
-+ */
-+static int
-+set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
-+    struct req_controls *const controls,
-+#if HEVC_CTRLS_VERSION >= 2
-+    struct v4l2_ctrl_hevc_decode_params * const dec,
-+#endif
-+    struct v4l2_ctrl_hevc_slice_params * const slices,
-+    const unsigned int slice_no,
-+    const unsigned int slice_count)
-+{
-+    struct v4l2_ext_control control[] = {
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS,
-+            .size = sizeof(controls->sps),
-+            .ptr = &controls->sps,
-+        },
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS,
-+            .size = sizeof(controls->pps),
-+            .ptr = &controls->pps,
-+        },
-+#if HEVC_CTRLS_VERSION >= 2
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS,
-+            .size = sizeof(*dec),
-+            .ptr = dec,
-+        },
-+#endif
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS,
-+            .size = sizeof(*slices) * slice_count,
-+            .ptr = slices + slice_no,
-+        },
-+        // Optional - must stay last so that it can be dropped by count alone
-+        {
-+            .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX,
-+            .size = sizeof(controls->scaling_matrix),
-+            .ptr = &controls->scaling_matrix,
-+        },
-+    };
-+    const unsigned int n_ctrls =
-+        FF_ARRAY_ELEMS(control) - (controls->has_scaling ? 0 : 1);
-+
-+    return mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n_ctrls);
-+}
-+
-+// hwaccel decode_slice: record one slice (data pointer, length and V4L2
-+// slice params) in the frame's request descriptor.  Nothing is sent to the
-+// device here; that happens in end_frame.
-+// Returns 0 on success or the error from slice_add().
-+static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
-+{
-+    const HEVCContext * const h = avctx->priv_data;
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
-+    // Bits of the slice header consumed by the software parser so far
-+    int bcount = get_bits_count(&h->HEVClc->gb);
-+    // Bit offset of the slice data in buffer: bcount plus 8 bits for every
-+    // byte by which ptr_from_index() lands beyond the plain byte position.
-+    // NOTE(review): presumably this compensates for emulation prevention
-+    // bytes skipped by the parser - confirm against ptr_from_index()
-+    uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
-+
-+    int rv;
-+    struct slice_info * si;
-+
-+    if ((rv = slice_add(rd)) != 0)
-+        return rv;
-+
-+    // slice_add() is expected to have bumped num_slices, so the new entry
-+    // is the last one
-+    si = rd->slices + rd->num_slices - 1;
-+    si->ptr = buffer;
-+    si->len = size;
-+
-+    // In multi-slice mode all slices are sent as one buffer starting at the
-+    // first slice, so offsets and sizes become relative to slice 0 and slice
-+    // 0's length is grown to cover everything up to the end of this slice
-+    if (ctx->multi_slice && rd->num_slices > 1) {
-+        struct slice_info *const si0 = rd->slices;
-+        const size_t offset = (buffer - si0->ptr);
-+        boff += offset * 8;
-+        size += offset;
-+        si0->len = si->len + offset;
-+    }
-+
-+#if HEVC_CTRLS_VERSION >= 2
-+    // Decode params are per frame so only fill them on the first slice
-+    if (rd->num_slices == 1)
-+        fill_decode_params(h, &rd->dec);
-+    fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff);
-+#else
-+    fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff);
-+#endif
-+
-+    return 0;
-+}
-+
-+/*
-+ * hwaccel abort_frame: if there is a current frame, abort its pending media
-+ * request and source buffer and take it off the decode queue.
-+ */
-+static void v4l2_request_hevc_abort_frame(AVCodecContext * const avctx)
-+{
-+    const HEVCContext * const h = avctx->priv_data;
-+    V4L2RequestContextHEVC *ctx;
-+    V4L2MediaReqDescriptor *rd;
-+
-+    /* Nothing to clean up without a current frame */
-+    if (h->ref == NULL)
-+        return;
-+
-+    ctx = avctx->internal->hwaccel_priv_data;
-+    rd = (V4L2MediaReqDescriptor *)h->ref->frame->data[0];
-+
-+    media_request_abort(&rd->req);
-+    mediabufs_src_qent_abort(ctx->mbufs, &rd->qe_src);
-+
-+    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
-+}
-+
-+// Build and queue one media request covering slices [i, j) of rd.
-+// The bitstream is copied from slice i only (si->ptr / si->len), so in
-+// multi-slice use slice 0's length must already cover all the slices sent.
-+// Returns 0 on success, AVERROR(ENOMEM) if no media request is available,
-+// otherwise AVERROR_UNKNOWN.
-+static int send_slice(AVCodecContext * const avctx,
-+                      V4L2MediaReqDescriptor * const rd,
-+                      struct req_controls *const controls,
-+                      const unsigned int i, const unsigned int j)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+
-+    struct slice_info *const si = rd->slices + i;
-+    struct media_request * req = NULL;
-+    struct qent_src * src = NULL;
-+    MediaBufsStatus stat;
-+
-+    if ((req = media_request_get(ctx->mpool)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    // Controls for slices i .. j-1
-+    if (set_req_ctls(ctx, req,
-+                     controls,
-+#if HEVC_CTRLS_VERSION >= 2
-+                     &rd->dec,
-+#endif
-+                     rd->slice_params,
-+                     i, j - i)) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
-+        goto fail1;
-+    }
-+
-+    if ((src = mediabufs_src_qent_get(ctx->mbufs)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to get src buffer\n", __func__);
-+        goto fail1;
-+    }
-+
-+    if (qent_src_data_copy(src, 0, si->ptr, si->len, ctx->dbufs) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed data copy\n", __func__);
-+        goto fail2;
-+    }
-+
-+    // Stamp the source buffer with the frame's timestamp
-+    if (qent_src_params_set(src, &controls->tv)) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed src param set\n", __func__);
-+        goto fail2;
-+    }
-+
-+#warning ANNEX_B start code
-+//        if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
-+//        }
-+
-+    // The destination buffer is only passed with the request that starts at
-+    // slice 0; the last argument is true for the request that ends the frame.
-+    // NOTE(review): req and src are passed by address and are not aborted
-+    // here on failure - presumably mediabufs_start_request() takes ownership
-+    // of both; confirm
-+    stat = mediabufs_start_request(ctx->mbufs, &req, &src,
-+                                   i == 0 ? rd->qe_dst : NULL,
-+                                   j == rd->num_slices);
-+
-+    if (stat != MEDIABUFS_STATUS_SUCCESS) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
-+        return AVERROR_UNKNOWN;
-+    }
-+    return 0;
-+
-+    // Unwind in reverse order of acquisition
-+fail2:
-+    mediabufs_src_qent_abort(ctx->mbufs, &src);
-+fail1:
-+    media_request_abort(&req);
-+    return AVERROR_UNKNOWN;
-+}
-+
-+// hwaccel end_frame: build the per-frame controls, wait for this frame's
-+// turn in the decode queue, submit the collected slices and fill in the
-+// DRM PRIME descriptor for the destination buffer.
-+// Returns 0 on success, AVERROR_INVALIDDATA if the frame was never started,
-+// or the error from buffer allocation / send_slice().
-+static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
-+{
-+    const HEVCContext * const h = avctx->priv_data;
-+    V4L2MediaReqDescriptor *rd = (V4L2MediaReqDescriptor*)h->ref->frame->data[0];
-+    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
-+    struct req_controls rc;
-+    unsigned int i;
-+    int rv;
-+
-+    // It is possible, though maybe a bug, to get an end_frame without
-+    // a previous start_frame.  If we do then give up.
-+    if (!decode_q_in_q(&rd->decode_ent)) {
-+        av_log(avctx, AV_LOG_DEBUG, "%s: Frame not in decode Q\n", __func__);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    {
-+        // PPS scaling list takes precedence over the SPS one; NULL if neither
-+        const ScalingList *sl = h->ps.pps->scaling_list_data_present_flag ?
-+                                    &h->ps.pps->scaling_list :
-+                                h->ps.sps->scaling_list_enable_flag ?
-+                                    &h->ps.sps->scaling_list : NULL;
-+
-+
-+        memset(&rc, 0, sizeof(rc));
-+        rc.tv = cvt_dpb_to_tv(rd->timestamp);
-+        fill_sps(&rc.sps, h->ps.sps);
-+        fill_pps(&rc.pps, h->ps.pps);
-+        if (sl) {
-+            rc.has_scaling = 1;
-+            fill_scaling_matrix(sl, &rc.scaling_matrix);
-+        }
-+    }
-+
-+    // Block until it is this frame's turn; every exit path below must
-+    // remove the entry from the queue again
-+    decode_q_wait(&ctx->decode_q, &rd->decode_ent);
-+
-+    // qe_dst needs to be bound to the data buffer and only returned when that is
-+    // Alloc almost certainly wants to be serialised if there is any chance of blocking
-+    // so we get the next frame to be free in the thread that needs it for decode first.
-+    //
-+    // In our current world this probably isn't a concern but put it here anyway
-+    if (!rd->qe_dst)
-+    {
-+        if ((rd->qe_dst = mediabufs_dst_qent_alloc(ctx->mbufs, ctx->dbufs)) == NULL) {
-+            av_log(avctx, AV_LOG_ERROR, "%s: Failed to get dst buffer\n", __func__);
-+            rv = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-+    }
-+
-+    // Send as slices
-+    // multi_slice: one request carries every slice; otherwise one request
-+    // per slice
-+    if (ctx->multi_slice)
-+    {
-+        if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0)
-+            goto fail;
-+    }
-+    else
-+    {
-+        for (i = 0; i != rd->num_slices; ++i) {
-+            if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0)
-+                goto fail;
-+        }
-+    }
-+
-+    // Set the drm_prime descriptor
-+    // NOTE(review): the return value of drm_from_format() is ignored, so an
-+    // unmapped destination format is not reported here
-+    drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
-+    rd->drm.objects[0].fd = dmabuf_fd(qent_dst_dmabuf(rd->qe_dst, 0));
-+    rd->drm.objects[0].size = dmabuf_size(qent_dst_dmabuf(rd->qe_dst, 0));
-+
-+    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
-+    return 0;
-+
-+fail:
-+    decode_q_remove(&ctx->decode_q, &rd->decode_ent);
-+    return rv;
-+}
-+
-+// Initial check & init
-+// Verifies that the driver exposes every HEVC control this build needs and
-+// that each control's element size matches the structure layout compiled in
-+// here (i.e. the same HEVC_CTRLS_VERSION), then sets an initial SPS and
-+// records whether the slice-params control is a dynamic array.
-+// Returns 0 on success or AVERROR(EINVAL) on any mismatch or failure.
-+static int
-+probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-+{
-+    const HEVCContext *h = avctx->priv_data;
-+    const HEVCSPS * const sps = h->ps.sps;
-+    struct v4l2_ctrl_hevc_sps ctrl_sps;
-+    unsigned int i;
-+
-+    // Check for var slice array
-+    struct v4l2_query_ext_ctrl qc[] = {
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX },
-+#if HEVC_CTRLS_VERSION >= 2
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS },
-+#endif
-+    };
-+    // Order & size must match!
-+    static const size_t ctrl_sizes[] = {
-+        sizeof(struct v4l2_ctrl_hevc_slice_params),
-+        sizeof(struct v4l2_ctrl_hevc_sps),
-+        sizeof(struct v4l2_ctrl_hevc_pps),
-+        sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
-+#if HEVC_CTRLS_VERSION >= 2
-+        sizeof(struct v4l2_ctrl_hevc_decode_params),
-+#endif
-+    };
-+    const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
-+
-+    if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
-+        av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
-+        return AVERROR(EINVAL);
-+    }
-+    for (i = 0; i != noof_ctrls; ++i) {
-+        if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
-+            // i is unsigned so it is printed with %u
-+            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %u size mismatch %zu != %zu\n",
-+                   HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+
-+    fill_sps(&ctrl_sps, sps);
-+
-+    if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // qc[0] is the slice-params control: a dynamic array means several
-+    // slices can be submitted in a single request
-+    ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0;
-+    return 0;
-+}
-+
-+// Final init
-+// Reads the driver's default decode mode, start code and maximum slice
-+// count, rejects values this code does not handle and then sets the decode
-+// mode and start code controls to those defaults.
-+// Returns 0 on success or a negative AVERROR code.
-+static int
-+set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-+{
-+    int ret;
-+
-+    struct v4l2_query_ext_ctrl querys[] = {
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, },
-+    };
-+
-+    struct v4l2_ext_control ctrls[] = {
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
-+        { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
-+    };
-+
-+    // NOTE(review): the return value is ignored; if the query fails the
-+    // checks below operate on whatever is left in querys[] (zero-initialised
-+    // apart from .id) - confirm this best-effort behaviour is intended
-+    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
-+
-+    ctx->decode_mode = querys[0].default_value;
-+
-+    if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED &&
-+        ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    ctx->start_code = querys[1].default_value;
-+    if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE &&
-+        ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // Number of elements the slice-params control reports
-+    ctx->max_slices = querys[2].elems;
-+    if (ctx->max_slices > MAX_SLICES) {
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    ctrls[0].value = ctx->decode_mode;
-+    ctrls[1].value = ctx->start_code;
-+
-+    // A NULL request means the controls are applied outside any media request
-+    ret = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, NULL, ctrls, FF_ARRAY_ELEMS(ctrls));
-+    return !ret ? 0 : AVERROR(-ret);
-+}
-+
-+// AVBuffer free callback for a V4L2MediaReqDescriptor created by
-+// v4l2_req_frame_alloc().  opaque is the AVCodecContext given at creation
-+// (only used for logging), data is the descriptor itself.
-+// Drops the destination buffer reference, frees the slice arrays and then
-+// the descriptor.
-+static void v4l2_req_frame_free(void *opaque, uint8_t *data)
-+{
-+    AVCodecContext *avctx = opaque;
-+    V4L2MediaReqDescriptor * const rd = (V4L2MediaReqDescriptor*)data;
-+
-+    av_log(NULL, AV_LOG_DEBUG, "%s: avctx=%p data=%p\n", __func__, avctx, data);
-+
-+    qent_dst_unref(&rd->qe_dst);
-+
-+    // We don't expect req or qe_src to be set
-+    // Arguments are given in the order the message names them (qe_src, req)
-+    if (rd->req || rd->qe_src)
-+        av_log(NULL, AV_LOG_ERROR, "%s: qe_src %p or req %p not NULL\n", __func__,
-+               (void *)rd->qe_src, (void *)rd->req);
-+
-+    av_freep(&rd->slices);
-+    av_freep(&rd->slice_params);
-+
-+    av_free(rd);
-+}
-+
-+/*
-+ * Allocate a zeroed block of size bytes and wrap it in an AVBufferRef whose
-+ * free callback is v4l2_req_frame_free (opaque is passed through to it).
-+ * Returns NULL if either allocation fails.
-+ */
-+static AVBufferRef *v4l2_req_frame_alloc(void *opaque, int size)
-+{
-+    AVCodecContext *avctx = opaque;
-+    uint8_t *data = av_mallocz(size);
-+    AVBufferRef *ref;
-+
-+    if (data == NULL)
-+        return NULL;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p size=%d data=%p\n", __func__, avctx, size, data);
-+
-+    ref = av_buffer_create(data, size, v4l2_req_frame_free, avctx, 0);
-+    if (ref == NULL)
-+        av_freep(&data);    /* wrapper failed: release the raw block ourselves */
-+
-+    return ref;
-+}
-+
-+// Compiled out: callbacks for a buffer-pool based frame allocation scheme.
-+// They are only referenced from the (also compiled out) #if 0 section of
-+// frame_params().
-+#if 0
-+static void v4l2_req_pool_free(void *opaque)
-+{
-+    av_log(NULL, AV_LOG_DEBUG, "%s: opaque=%p\n", __func__, opaque);
-+}
-+
-+static void v4l2_req_hwframe_ctx_free(AVHWFramesContext *hwfc)
-+{
-+    av_log(NULL, AV_LOG_DEBUG, "%s: hwfc=%p pool=%p\n", __func__, hwfc, hwfc->pool);
-+
-+    av_buffer_pool_uninit(&hwfc->pool);
-+}
-+#endif
-+
-+// hwaccel frame_params: describe the frames this decoder produces.
-+// The hw frames context is set to DRM PRIME with the software format and
-+// dimensions taken from the current V4L2 destination format.
-+// Always returns 0.
-+static int frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
-+{
-+    V4L2RequestContextHEVC *ctx = avctx->internal->hwaccel_priv_data;
-+    AVHWFramesContext *hwfc = (AVHWFramesContext*)hw_frames_ctx->data;
-+    const struct v4l2_format *vfmt = mediabufs_dst_fmt(ctx->mbufs);
-+
-+    hwfc->format = AV_PIX_FMT_DRM_PRIME;
-+    hwfc->sw_format = pixel_format_from_format(vfmt);
-+    // Width/height live in a different union member for multiplanar types
-+    if (V4L2_TYPE_IS_MULTIPLANAR(vfmt->type)) {
-+        hwfc->width = vfmt->fmt.pix_mp.width;
-+        hwfc->height = vfmt->fmt.pix_mp.height;
-+    } else {
-+        hwfc->width = vfmt->fmt.pix.width;
-+        hwfc->height = vfmt->fmt.pix.height;
-+    }
-+    // Compiled out: pool based allocation; frames are allocated one at a
-+    // time by alloc_frame() instead
-+#if 0
-+    hwfc->pool = av_buffer_pool_init2(sizeof(V4L2MediaReqDescriptor), avctx, v4l2_req_frame_alloc, v4l2_req_pool_free);
-+    if (!hwfc->pool)
-+        return AVERROR(ENOMEM);
-+
-+    hwfc->free = v4l2_req_hwframe_ctx_free;
-+
-+    hwfc->initial_pool_size = 1;
-+
-+    switch (avctx->codec_id) {
-+    case AV_CODEC_ID_VP9:
-+        hwfc->initial_pool_size += 8;
-+        break;
-+    case AV_CODEC_ID_VP8:
-+        hwfc->initial_pool_size += 3;
-+        break;
-+    default:
-+        hwfc->initial_pool_size += 2;
-+    }
-+#endif
-+    av_log(avctx, AV_LOG_DEBUG, "%s: avctx=%p ctx=%p hw_frames_ctx=%p hwfc=%p pool=%p width=%d height=%d initial_pool_size=%d\n", __func__, avctx, ctx, hw_frames_ctx, hwfc, hwfc->pool, hwfc->width, hwfc->height, hwfc->initial_pool_size);
-+
-+    return 0;
-+}
-+
-+// hwaccel alloc_frame: give frame a freshly allocated, zeroed
-+// V4L2MediaReqDescriptor as buf[0]/data[0], reference the codec's hw frames
-+// context and attach decode data.
-+// Returns 0 on success, AVERROR(ENOMEM) if an allocation fails, or the
-+// error from ff_attach_decode_data().  On failure the frame is unreferenced.
-+static int alloc_frame(AVCodecContext * avctx, AVFrame *frame)
-+{
-+    int rv;
-+
-+    frame->buf[0] = v4l2_req_frame_alloc(avctx, sizeof(V4L2MediaReqDescriptor));
-+    if (!frame->buf[0])
-+        return AVERROR(ENOMEM);
-+
-+    frame->data[0] = frame->buf[0]->data;
-+
-+    // av_buffer_ref() returns NULL on allocation failure; do not hand out a
-+    // hardware frame without its frames context
-+    frame->hw_frames_ctx = av_buffer_ref(avctx->hw_frames_ctx);
-+    if (!frame->hw_frames_ctx) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to reference hw frames context\n");
-+        av_frame_unref(frame);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    if ((rv = ff_attach_decode_data(frame)) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to attach decode data to frame\n");
-+        av_frame_unref(frame);
-+        return rv;
-+    }
-+
-+    return 0;
-+}
-+
-+// Function table exporting this file's HEVC implementation.  The symbol
-+// name and the .name string are both parameterised on HEVC_CTRLS_VERSION
-+// (via the V() and STR() macros), presumably so that one table can be built
-+// per control-API version - confirm against the macro definitions.
-+const v4l2_req_decode_fns V(ff_v4l2_req_hevc) = {
-+    .src_pix_fmt_v4l2 = V4L2_PIX_FMT_HEVC_SLICE,
-+    .name = "V4L2 HEVC stateless V" STR(HEVC_CTRLS_VERSION),
-+    .probe = probe,
-+    .set_controls = set_controls,
-+
-+    .start_frame    = v4l2_request_hevc_start_frame,
-+    .decode_slice   = v4l2_request_hevc_decode_slice,
-+    .end_frame      = v4l2_request_hevc_end_frame,
-+    .abort_frame    = v4l2_request_hevc_abort_frame,
-+    .frame_params   = frame_params,
-+    .alloc_frame    = alloc_frame,
-+};
-+
-diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
-new file mode 100644
-index 0000000000..eb00ecb406
---- /dev/null
-+++ b/libavcodec/v4l2_req_media.c
-@@ -0,0 +1,1596 @@
-+/*
-+ * Copyright (C) 2018 Paul Kocialkowski <paul.kocialkowski@bootlin.com>
-+ *
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sub license, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial portions
-+ * of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
-+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
-+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#include <errno.h>
-+#include <fcntl.h>
-+#include <poll.h>
-+#include <pthread.h>
-+#include <semaphore.h>
-+#include <stdatomic.h>
-+#include <stdbool.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <unistd.h>
-+#include <linux/media.h>
-+#include <sys/ioctl.h>
-+#include <sys/select.h>
-+#include <sys/ioctl.h>
-+
-+#include <linux/videodev2.h>
-+
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_media.h"
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_utils.h"
-+#include "weak_link.h"
-+
-+
-+/*
-+ * floor(log2(x)); returns 0 for x == 0 and x == 1.
-+ *
-+ * Binary search over the full width of size_t: the shift starts at half the
-+ * type's width and halves each step, so 64-bit values above 4GiB are handled
-+ * as well as 32-bit ones.  Assumes 8-bit bytes and a power-of-two
-+ * sizeof(size_t).
-+ */
-+static unsigned int log2_size(size_t x)
-+{
-+    unsigned int n = 0;
-+    unsigned int shift;
-+
-+    for (shift = (unsigned int)(sizeof(x) * 8 / 2); shift != 0; shift >>= 1) {
-+        /* Anything set at or above bit 'shift'?  Then log2 is >= shift */
-+        if (x >> shift) {
-+            n += shift;
-+            x >>= shift;
-+        }
-+    }
-+    return n;
-+}
-+
-+/*
-+ * Round a size up to the next value of the form 3 * 2^n or 4 * 2^n that is
-+ * strictly greater than x.  Any x below 256 yields 768.
-+ *
-+ * The shifts are done in size_t: as plain int, 3 << n and 4 << n overflow
-+ * for sizes from about 1GiB upwards.
-+ * NOTE: for x within the top quarter of the size_t range the result cannot
-+ * be represented and wraps.
-+ */
-+static size_t round_up_size(const size_t x)
-+{
-+    /* Admit no size < 256 */
-+    const unsigned int n = x < 256 ? 8 : log2_size(x) - 1;
-+    const size_t three = (size_t)3 << n;
-+
-+    return x >= three ? (size_t)4 << n : three;
-+}
-+
-+struct media_request;
-+
-+/* A fixed-size pool of media requests allocated on one media device */
-+struct media_pool {
-+    int fd;                             /* media device, opened in media_pool_new() */
-+    sem_t sem;                          /* counts requests on free_reqs; waited on in media_request_get() */
-+    pthread_mutex_t lock;               /* protects free_reqs */
-+    struct media_request * free_reqs;   /* singly linked (via ->next) list of idle requests */
-+    struct pollqueue * pq;              /* poll queue passed to media_pool_new() */
-+};
-+
-+/* One media request belonging to a media_pool */
-+struct media_request {
-+    struct media_request * next;        /* link on the pool's free list */
-+    struct media_pool * mp;             /* owning pool */
-+    int fd;                             /* request fd from MEDIA_IOC_REQUEST_ALLOC, -1 if unset */
-+    struct polltask * pt;               /* poll task that runs media_request_done() for this request */
-+};
-+
-+
-+/*
-+ * sem_trywait() that retries when interrupted by a signal.
-+ * Returns 0 once the semaphore has been taken, otherwise -errno
-+ * (e.g. -EAGAIN when it is not available).
-+ */
-+static inline int do_trywait(sem_t *const sem)
-+{
-+    for (;;) {
-+        if (!sem_trywait(sem))
-+            return 0;
-+        if (errno != EINTR)
-+            return -errno;
-+    }
-+}
-+
-+/*
-+ * sem_wait() that retries when interrupted by a signal.
-+ * Returns 0 once the semaphore has been taken, otherwise -errno.
-+ */
-+static inline int do_wait(sem_t *const sem)
-+{
-+    for (;;) {
-+        if (!sem_wait(sem))
-+            return 0;
-+        if (errno != EINTR)
-+            return -errno;
-+    }
-+}
-+
-+/*
-+ * Issue VIDIOC_REQBUFS on video_fd for buffers_count buffers of the given
-+ * buffer type and memory type.
-+ * Returns 0 on success or -errno (after logging) on failure.
-+ */
-+static int request_buffers(int video_fd, unsigned int type,
-+                           enum v4l2_memory memory, unsigned int buffers_count)
-+{
-+    struct v4l2_requestbuffers buffers;
-+
-+    memset(&buffers, 0, sizeof(buffers));
-+    buffers.count = buffers_count;
-+    buffers.type = type;
-+    buffers.memory = memory;
-+
-+    if (ioctl(video_fd, VIDIOC_REQBUFS, &buffers) < 0) {
-+        /* Capture errno before anything else can disturb it */
-+        const int err = errno;
-+
-+        request_log("Unable to request %d type %d buffers: %s\n", buffers_count, type, strerror(err));
-+        return -err;
-+    }
-+
-+    return 0;
-+}
-+
-+
-+/*
-+ * Start (enable == true) or stop streaming on video_fd for the given buffer
-+ * type using VIDIOC_STREAMON / VIDIOC_STREAMOFF.
-+ * Returns 0 on success or -errno (after logging) on failure.
-+ */
-+static int set_stream(int video_fd, unsigned int type, bool enable)
-+{
-+    enum v4l2_buf_type buf_type = type;
-+    const unsigned long request = enable ? VIDIOC_STREAMON : VIDIOC_STREAMOFF;
-+
-+    if (ioctl(video_fd, request, &buf_type) < 0) {
-+        /* Capture errno before anything else can disturb it */
-+        const int err = errno;
-+
-+        request_log("Unable to %sable stream: %s\n",
-+                enable ? "en" : "dis", strerror(err));
-+        return -err;
-+    }
-+
-+    return 0;
-+}
-+
-+
-+
-+/*
-+ * Take an idle request from the pool, blocking on the pool semaphore until
-+ * one has been returned.  Returns NULL if the wait fails (or, unexpectedly,
-+ * if the free list turns out to be empty).
-+ */
-+struct media_request * media_request_get(struct media_pool * const mp)
-+{
-+    struct media_request *req;
-+
-+    /* Timeout handled by poll code */
-+    if (do_wait(&mp->sem) != 0)
-+        return NULL;
-+
-+    /* Pop the head of the free list */
-+    pthread_mutex_lock(&mp->lock);
-+    if ((req = mp->free_reqs) != NULL) {
-+        mp->free_reqs = req->next;
-+        req->next = NULL;
-+    }
-+    pthread_mutex_unlock(&mp->lock);
-+
-+    return req;
-+}
-+
-+/* Accessor: the file descriptor of this media request */
-+int media_request_fd(const struct media_request * const req)
-+{
-+    return req->fd;
-+}
-+
-+/*
-+ * Queue the request to the driver (retrying on EINTR) and then add its poll
-+ * task to the poll queue with a timeout argument of 2000.
-+ * Returns 0 on success or -errno (after logging) if queuing fails.
-+ */
-+int media_request_start(struct media_request * const req)
-+{
-+    for (;;) {
-+        int err;
-+
-+        if (ioctl(req->fd, MEDIA_REQUEST_IOC_QUEUE, NULL) != -1)
-+            break;
-+
-+        err = errno;
-+        if (err == EINTR)
-+            continue;
-+
-+        request_log("%s: Failed to Q media: (%d) %s\n", __func__, err, strerror(err));
-+        return -err;
-+    }
-+
-+    pollqueue_add_task(req->pt, 2000);
-+    return 0;
-+}
-+
-+/*
-+ * Poll-task callback (also called directly by media_request_abort()).
-+ * Re-initialises the request so it can be reused, pushes it back on the
-+ * pool's free list and posts the pool semaphore.
-+ * v is the struct media_request; revents is not examined.
-+ */
-+static void media_request_done(void *v, short revents)
-+{
-+    struct media_request *const req = v;
-+    struct media_pool *const mp = req->mp;
-+
-+    /* ** Not sure what to do about timeout */
-+
-+    /* A failed reinit is only logged; the request is still returned to the pool */
-+    if (ioctl(req->fd, MEDIA_REQUEST_IOC_REINIT, NULL) < 0)
-+        request_log("Unable to reinit media request: %s\n",
-+                strerror(errno));
-+
-+    pthread_mutex_lock(&mp->lock);
-+    req->next = mp->free_reqs;
-+    mp->free_reqs = req;
-+    pthread_mutex_unlock(&mp->lock);
-+    /* Post only after the request is on the list so a waiter finds it */
-+    sem_post(&mp->sem);
-+}
-+
-+/*
-+ * Give an unused request back to its pool and clear the caller's pointer.
-+ * A NULL *preq is a no-op.  Always returns 0.
-+ */
-+int media_request_abort(struct media_request ** const preq)
-+{
-+    struct media_request * const req = *preq;
-+
-+    if (req != NULL) {
-+        *preq = NULL;
-+        /* Same path as normal completion: reinit and return to the free list */
-+        media_request_done(req, 0);
-+    }
-+    return 0;
-+}
-+
-+/*
-+ * Destroy every request on a ->next linked chain: delete its poll task (if
-+ * any), close its fd (if valid) and free it.
-+ */
-+static void delete_req_chain(struct media_request * const chain)
-+{
-+    struct media_request * req = chain;
-+
-+    while (req != NULL) {
-+        /* Grab the link first: req is gone by the end of the iteration */
-+        struct media_request * const following = req->next;
-+
-+        if (req->pt)
-+            polltask_delete(&req->pt);
-+        if (req->fd != -1)
-+            close(req->fd);
-+        free(req);
-+
-+        req = following;
-+    }
-+}
-+
-+/*
-+ * Create a pool of n media requests on the media device at media_path.
-+ * Each request gets a poll task on pq that returns it to the pool when the
-+ * request fd signals POLLPRI.
-+ * Returns the new pool, or NULL on any failure with everything acquired so
-+ * far released again.
-+ */
-+struct media_pool * media_pool_new(const char * const media_path,
-+                   struct pollqueue * const pq,
-+                   const unsigned int n)
-+{
-+    struct media_pool * const mp = calloc(1, sizeof(*mp));
-+    unsigned int i;
-+
-+    if (!mp)
-+        goto fail0;
-+
-+    mp->pq = pq;
-+    pthread_mutex_init(&mp->lock, NULL);
-+    mp->fd = open(media_path, O_RDWR | O_NONBLOCK);
-+    if (mp->fd == -1) {
-+        request_log("Failed to open '%s': %s\n", media_path, strerror(errno));
-+        goto fail1;
-+    }
-+
-+    for (i = 0; i != n; ++i) {
-+        struct media_request * req = malloc(sizeof(*req));
-+        if (!req)
-+            goto fail4;
-+
-+        /* Link the request in before it is fully set up so that the
-+         * failure path can find and destroy it */
-+        *req = (struct media_request){
-+            .next = mp->free_reqs,
-+            .mp = mp,
-+            .fd = -1
-+        };
-+        mp->free_reqs = req;
-+
-+        if (ioctl(mp->fd, MEDIA_IOC_REQUEST_ALLOC, &req->fd) == -1) {
-+            request_log("Failed to alloc request %d: %s\n", i, strerror(errno));
-+            goto fail4;
-+        }
-+
-+        req->pt = polltask_new(pq, req->fd, POLLPRI, media_request_done, req);
-+        if (!req->pt)
-+            goto fail4;
-+    }
-+
-+    /* One semaphore count per free request; a pool without a working
-+     * semaphore would make media_request_get() unusable so fail here */
-+    if (sem_init(&mp->sem, 0, n) != 0) {
-+        request_log("Failed to init request semaphore: %s\n", strerror(errno));
-+        goto fail4;
-+    }
-+
-+    return mp;
-+
-+fail4:
-+    delete_req_chain(mp->free_reqs);
-+    close(mp->fd);
-+fail1:
-+    /* The mutex was initialised before open() so destroy it on this path too */
-+    pthread_mutex_destroy(&mp->lock);
-+    free(mp);
-+fail0:
-+    return NULL;
-+}
-+
-+/*
-+ * Destroy a pool created by media_pool_new() and NULL the caller's pointer.
-+ * A NULL *pMp is a no-op.
-+ * NOTE(review): only requests currently on the free list are destroyed -
-+ * presumably callers guarantee that no request is still outstanding; confirm.
-+ */
-+void media_pool_delete(struct media_pool ** pMp)
-+{
-+    struct media_pool * const mp = *pMp;
-+
-+    if (!mp)
-+        return;
-+    *pMp = NULL;
-+
-+    delete_req_chain(mp->free_reqs);
-+    close(mp->fd);
-+    sem_destroy(&mp->sem);
-+    pthread_mutex_destroy(&mp->lock);
-+    free(mp);
-+}
-+
-+
-+#define INDEX_UNSET (~(uint32_t)0)
-+
-+enum qent_status {
-+    QENT_NEW = 0,       // Initial state - shouldn't last
-+    QENT_FREE,          // On free chain
-+    QENT_PENDING,       // User has ent
-+    QENT_WAITING,       // On inuse
-+    QENT_DONE,          // Frame rx
-+    QENT_ERROR,         // Error
-+    QENT_IMPORT
-+};
-+
-+struct qent_base {
-+    atomic_int ref_count;
-+    struct qent_base *next;
-+    struct qent_base *prev;
-+    enum qent_status status;
-+    uint32_t index;
-+    struct dmabuf_h *dh[VIDEO_MAX_PLANES];
-+    struct timeval timestamp;
-+};
-+
-+struct qent_src {
-+    struct qent_base base;
-+    int fixed_size;
-+};
-+
-+/* Queue entry for the destination pool; base must stay first (base_to_dst casts) */
-+struct qent_dst {
-+    struct qent_base base;
-+    bool waiting;               // Set by qe_dst_waiting, cleared by qe_dst_done
-+    pthread_mutex_t lock;       // Guards waiting
-+    pthread_cond_t cond;        // Broadcast by qe_dst_done when waiting is cleared
-+    struct ff_weak_link_client * mbc_wl;    // Weak link to the owning mediabufs_ctl (locked in qent_dst_unref)
-+};
-+
-+/* Head and tail of a doubly-linked list of qent_base; both NULL when empty */
-+struct qe_list_head {
-+    struct qent_base *head;
-+    struct qent_base *tail;
-+};
-+
-+/* A pool of queue entries split into a free list and an in-use list */
-+struct buf_pool {
-+    pthread_mutex_t lock;       // Taken by the queue_* helpers around list manipulation
-+    sem_t free_sem;             // Posted by queue_put_free, waited on by queue_get_free/queue_tryget_free
-+    enum v4l2_buf_type buf_type;    // NOTE(review): not written by any code visible here
-+    struct qe_list_head free;
-+    struct qe_list_head inuse;
-+};
-+
-+
-+/* Reinterpret a base entry pointer as its enclosing qent_dst (base is the first member) */
-+static inline struct qent_dst *base_to_dst(struct qent_base *be)
-+{
-+    struct qent_dst *const dst = (struct qent_dst *)be;
-+    return dst;
-+}
-+
-+/* Reinterpret a base entry pointer as its enclosing qent_src (base is the first member) */
-+static inline struct qent_src *base_to_src(struct qent_base *be)
-+{
-+    struct qent_src *const src = (struct qent_src *)be;
-+    return src;
-+}
-+
-+
-+/*
-+ * Initializer for struct qent_base: zero ref count, QENT_NEW status and an
-+ * unset index. Members not named here are zero-initialised by C rules.
-+ */
-+#define QENT_BASE_INITIALIZER {\
-+    .ref_count = ATOMIC_VAR_INIT(0),\
-+    .status = QENT_NEW,\
-+    .index  = INDEX_UNSET\
-+}
-+
-+/* Free every plane's dmabuf handle and reset the slots to NULL */
-+static void qe_base_uninit(struct qent_base *const be)
-+{
-+    struct dmabuf_h **slot = be->dh;
-+    struct dmabuf_h **const end = be->dh + VIDEO_MAX_PLANES;
-+
-+    while (slot != end) {
-+        dmabuf_free(*slot);
-+        *slot = NULL;
-+        ++slot;
-+    }
-+}
-+
-+/* Release a source entry and its dmabufs; NULL is accepted and ignored */
-+static void qe_src_free(struct qent_src *const be_src)
-+{
-+    if (be_src != NULL) {
-+        qe_base_uninit(&be_src->base);
-+        free(be_src);
-+    }
-+}
-+
-+/* Allocate a source entry with a freshly initialised base; returns NULL on malloc failure */
-+static struct qent_src * qe_src_new(void)
-+{
-+    struct qent_src *const ent = malloc(sizeof(*ent));
-+
-+    if (ent != NULL) {
-+        *ent = (struct qent_src){
-+            .base = QENT_BASE_INITIALIZER
-+        };
-+    }
-+    return ent;
-+}
-+
-+/*
-+ * Release a destination entry: drop its weak-link reference, destroy its
-+ * cond/mutex, free its dmabufs and finally the entry itself.
-+ * NULL is accepted and ignored.
-+ */
-+static void qe_dst_free(struct qent_dst *const be_dst)
-+{
-+    if (be_dst != NULL) {
-+        ff_weak_link_unref(&be_dst->mbc_wl);
-+        pthread_cond_destroy(&be_dst->cond);
-+        pthread_mutex_destroy(&be_dst->lock);
-+        qe_base_uninit(&be_dst->base);
-+        free(be_dst);
-+    }
-+}
-+
-+/*
-+ * Allocate a destination entry with a fresh base, statically initialised
-+ * mutex/cond and a new reference on the weak link wl.
-+ * Returns NULL on malloc failure.
-+ */
-+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
-+{
-+    struct qent_dst *const ent = malloc(sizeof(*ent));
-+
-+    if (ent != NULL) {
-+        *ent = (struct qent_dst){
-+            .base = QENT_BASE_INITIALIZER,
-+            .lock = PTHREAD_MUTEX_INITIALIZER,
-+            .cond = PTHREAD_COND_INITIALIZER,
-+            .mbc_wl = ff_weak_link_ref(wl)
-+        };
-+    }
-+    return ent;
-+}
-+
-+/* Append be to the end of list ql. No locking is done here. */
-+static void ql_add_tail(struct qe_list_head * const ql, struct qent_base * be)
-+{
-+    struct qent_base *const old_tail = ql->tail;
-+
-+    if (old_tail == NULL)
-+        ql->head = be;          /* List was empty */
-+    else
-+        old_tail->next = be;
-+
-+    be->prev = old_tail;
-+    be->next = NULL;
-+    ql->tail = be;
-+}
-+
-+/*
-+ * Unlink be from list ql and clear its links.
-+ * Returns be, or NULL if be is NULL. No locking is done here.
-+ */
-+static struct qent_base * ql_extract(struct qe_list_head * const ql, struct qent_base * be)
-+{
-+    struct qent_base *after;
-+    struct qent_base *before;
-+
-+    if (be == NULL)
-+        return NULL;
-+
-+    after = be->next;
-+    before = be->prev;
-+
-+    /* Fix up the backward link (or the list tail) */
-+    if (after != NULL)
-+        after->prev = before;
-+    else
-+        ql->tail = before;
-+
-+    /* Fix up the forward link (or the list head) */
-+    if (before != NULL)
-+        before->next = after;
-+    else
-+        ql->head = after;
-+
-+    be->next = NULL;
-+    be->prev = NULL;
-+    return be;
-+}
-+
-+
-+/* Append be to the pool's free list (no locking, no semaphore post) */
-+static void bq_put_free(struct buf_pool *const bp, struct qent_base * be)
-+{
-+    struct qe_list_head *const free_list = &bp->free;
-+    ql_add_tail(free_list, be);
-+}
-+
-+/* Remove and return the first entry of the free list, or NULL if it is empty (no locking) */
-+static struct qent_base * bq_get_free(struct buf_pool *const bp)
-+{
-+    struct qe_list_head *const free_list = &bp->free;
-+    return ql_extract(free_list, free_list->head);
-+}
-+
-+/* Remove be from the in-use list and return it; NULL in gives NULL out (no locking) */
-+static struct qent_base * bq_extract_inuse(struct buf_pool *const bp, struct qent_base *const be)
-+{
-+    struct qe_list_head *const inuse_list = &bp->inuse;
-+    return ql_extract(inuse_list, be);
-+}
-+
-+/* Remove and return the first entry of the in-use list, or NULL if it is empty (no locking) */
-+static struct qent_base * bq_get_inuse(struct buf_pool *const bp)
-+{
-+    struct qe_list_head *const inuse_list = &bp->inuse;
-+    return ql_extract(inuse_list, inuse_list->head);
-+}
-+
-+/* Drain the free list, freeing each entry as a source entry */
-+static void bq_free_all_free_src(struct buf_pool *const bp)
-+{
-+    for (;;) {
-+        struct qent_base *const ent = bq_get_free(bp);
-+        if (ent == NULL)
-+            break;
-+        qe_src_free(base_to_src(ent));
-+    }
-+}
-+
-+/* Drain the in-use list, freeing each entry as a source entry */
-+static void bq_free_all_inuse_src(struct buf_pool *const bp)
-+{
-+    for (;;) {
-+        struct qent_base *const ent = bq_get_inuse(bp);
-+        if (ent == NULL)
-+            break;
-+        qe_src_free(base_to_src(ent));
-+    }
-+}
-+
-+/* Drain the free list, freeing each entry as a destination entry */
-+static void bq_free_all_free_dst(struct buf_pool *const bp)
-+{
-+    for (;;) {
-+        struct qent_base *const ent = bq_get_free(bp);
-+        if (ent == NULL)
-+            break;
-+        qe_dst_free(base_to_dst(ent));
-+    }
-+}
-+
-+/*
-+ * Return be to the pool's free list: under the pool lock its timestamp is
-+ * zeroed, status set to QENT_FREE and the length of each attached dmabuf
-+ * reset to 0; the free semaphore is posted after the lock is released.
-+ */
-+static void queue_put_free(struct buf_pool *const bp, struct qent_base *be)
-+{
-+    unsigned int plane = 0;
-+
-+    pthread_mutex_lock(&bp->lock);
-+
-+    /* Reset per-use state */
-+    be->timestamp.tv_sec = 0;
-+    be->timestamp.tv_usec = 0;
-+    be->status = QENT_FREE;
-+
-+    /* Stop at the first empty plane slot */
-+    while (plane < VIDEO_MAX_PLANES && be->dh[plane]) {
-+        dmabuf_len_set(be->dh[plane], 0);
-+        ++plane;
-+    }
-+
-+    bq_put_free(bp, be);
-+    pthread_mutex_unlock(&bp->lock);
-+
-+    sem_post(&bp->free_sem);
-+}
-+
-+/* True when the pool's in-use list is non-empty (reads the tail without taking the pool lock) */
-+static bool queue_is_inuse(const struct buf_pool *const bp)
-+{
-+    const struct qent_base *const last = bp->inuse.tail;
-+    return last != NULL;
-+}
-+
-+/* Append be to the in-use list under the pool lock and mark it QENT_WAITING; NULL is ignored */
-+static void queue_put_inuse(struct buf_pool *const bp, struct qent_base *be)
-+{
-+    if (be != NULL) {
-+        pthread_mutex_lock(&bp->lock);
-+        ql_add_tail(&bp->inuse, be);
-+        be->status = QENT_WAITING;
-+        pthread_mutex_unlock(&bp->lock);
-+    }
-+}
-+
-+/*
-+ * Take an entry from the free list after do_wait() on the free semaphore.
-+ * Returns NULL if do_wait() reports failure (non-zero) or the list is empty.
-+ */
-+static struct qent_base *queue_get_free(struct buf_pool *const bp)
-+{
-+    struct qent_base *ent = NULL;
-+
-+    if (do_wait(&bp->free_sem) == 0) {
-+        pthread_mutex_lock(&bp->lock);
-+        ent = bq_get_free(bp);
-+        pthread_mutex_unlock(&bp->lock);
-+    }
-+    return ent;
-+}
-+
-+/*
-+ * Take an entry from the free list after do_trywait() on the free semaphore.
-+ * Returns NULL if do_trywait() reports failure (non-zero) or the list is empty.
-+ */
-+static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
-+{
-+    struct qent_base *ent = NULL;
-+
-+    if (do_trywait(&bp->free_sem) == 0) {
-+        pthread_mutex_lock(&bp->lock);
-+        ent = bq_get_free(bp);
-+        pthread_mutex_unlock(&bp->lock);
-+    }
-+    return ent;
-+}
-+
-+/*
-+ * Search the in-use list for the entry whose plane-0 dmabuf fd equals fd,
-+ * unlink it and return it. Returns NULL when no entry matches.
-+ */
-+static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd)
-+{
-+    struct qent_base *found = NULL;
-+    struct qent_base *cur;
-+
-+    pthread_mutex_lock(&bp->lock);
-+    /* Normally the match is the list head, but scan the whole list anyway */
-+    cur = bp->inuse.head;
-+    while (cur != NULL) {
-+        if (dmabuf_fd(cur->dh[0]) == fd) {
-+            found = bq_extract_inuse(bp, cur);
-+            break;
-+        }
-+        cur = cur->next;
-+    }
-+    pthread_mutex_unlock(&bp->lock);
-+
-+    return found;
-+}
-+
-+/* Destroy the pool's semaphore and mutex, then free the pool. List entries are not freed here. */
-+static void queue_delete(struct buf_pool *const bp)
-+{
-+    sem_t *const sem = &bp->free_sem;
-+    pthread_mutex_t *const mtx = &bp->lock;
-+
-+    sem_destroy(sem);
-+    pthread_mutex_destroy(mtx);
-+    free(bp);
-+}
-+
-+/*
-+ * Allocate a zeroed pool with an initialised mutex and a free semaphore
-+ * starting at 0. vfd is currently unused. Returns NULL on calloc failure.
-+ */
-+static struct buf_pool* queue_new(const int vfd)
-+{
-+    struct buf_pool *const pool = calloc(1, sizeof(*pool));
-+
-+    if (pool != NULL) {
-+        pthread_mutex_init(&pool->lock, NULL);
-+        sem_init(&pool->free_sem, 0, 0);
-+    }
-+    return pool;
-+}
-+
-+
-+/* State for one V4L2 device fd with its source and destination buffer pools */
-+struct mediabufs_ctl {
-+    atomic_int ref_count;  /* 0 is single ref for easier atomics */
-+    void * dc;                  // Opaque context passed to request_err/request_info/... logging
-+    int vfd;                    // V4L2 device fd used for all ioctls here
-+    bool stream_on;             // Set by mediabufs_stream_on, cleared by mediabufs_stream_off
-+    bool polling;               // True while pt has been added to the poll queue
-+    bool dst_fixed;             // Dst Q is fixed size
-+    pthread_mutex_t lock;
-+    struct buf_pool * src;
-+    struct buf_pool * dst;
-+    struct polltask * pt;
-+    struct pollqueue * pq;
-+    struct ff_weak_link_master * this_wlm;  // Master end of the weak link held by dst entries
-+
-+    struct v4l2_format src_fmt;
-+    struct v4l2_format dst_fmt;
-+};
-+
-+/*
-+ * Queue entry be on V4L2 device vfd with VIDIOC_QBUF as a DMABUF buffer.
-+ *
-+ * be         entry to queue; its index and dmabuf handles fill the v4l2_buffer
-+ * vfd        V4L2 device fd
-+ * mreq       media request to attach (only used when !is_dst), may be NULL
-+ * fmt        format whose type selects single- or multi-planar layout
-+ * is_dst     true for a destination buffer: lengths and timestamp are zeroed
-+ * hold_flag  with a request on a src buffer, also sets
-+ *            V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF
-+ *
-+ * Returns 0 on success or -errno from the ioctl (EINTR is retried).
-+ */
-+static int qe_v4l2_queue(struct qent_base *const be,
-+               const int vfd, struct media_request *const mreq,
-+               const struct v4l2_format *const fmt,
-+               const bool is_dst, const bool hold_flag)
-+{
-+    struct v4l2_buffer buffer = {
-+        .type = fmt->type,
-+        .memory = V4L2_MEMORY_DMABUF,
-+        .index = be->index
-+    };
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        unsigned int i;
-+        // One v4l2_plane per attached dmabuf; stops at the first NULL handle
-+        for (i = 0; i < VIDEO_MAX_PLANES && be->dh[i]; ++i) {
-+            if (is_dst)
-+                dmabuf_len_set(be->dh[i], 0);
-+
-+            /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
-+            planes[i].length = dmabuf_size(be->dh[i]);
-+            planes[i].bytesused = dmabuf_len(be->dh[i]);
-+            planes[i].m.fd = dmabuf_fd(be->dh[i]);
-+        }
-+        buffer.m.planes = planes;
-+        // For multiplanar buffers length carries the number of planes
-+        buffer.length = i;
-+    }
-+    else {
-+        if (is_dst)
-+            dmabuf_len_set(be->dh[0], 0);
-+
-+        buffer.bytesused = dmabuf_len(be->dh[0]);
-+        buffer.length = dmabuf_size(be->dh[0]);
-+        buffer.m.fd = dmabuf_fd(be->dh[0]);
-+    }
-+
-+    // Only source buffers are tied to the media request
-+    if (!is_dst && mreq) {
-+        buffer.flags |= V4L2_BUF_FLAG_REQUEST_FD;
-+        buffer.request_fd = media_request_fd(mreq);
-+        if (hold_flag)
-+            buffer.flags |= V4L2_BUF_FLAG_M2M_HOLD_CAPTURE_BUF;
-+    }
-+
-+    if (is_dst)
-+        be->timestamp = (struct timeval){0,0};
-+
-+    buffer.timestamp = be->timestamp;
-+
-+    // Retry on EINTR; any other error is logged and returned negated
-+    while (ioctl(vfd, VIDIOC_QBUF, &buffer)) {
-+        const int err = errno;
-+        if (err != EINTR) {
-+            request_log("%s: Failed to Q buffer: err=%d (%s)\n", __func__, err, strerror(err));
-+            return -err;
-+        }
-+    }
-+    return 0;
-+}
-+
-+/*
-+ * Dequeue one buffer of format f from vfd with VIDIOC_DQBUF and match it
-+ * to an entry on bp's in-use list by its plane-0 dmabuf fd.
-+ *
-+ * On success the entry is removed from the in-use list, its timestamp is
-+ * taken from the dequeued buffer and its status becomes QENT_ERROR when
-+ * V4L2_BUF_FLAG_ERROR is set, else QENT_DONE.
-+ * Returns NULL if the ioctl fails or no matching entry is found.
-+ */
-+static struct qent_base * qe_dequeue(struct buf_pool *const bp,
-+                     const int vfd,
-+                     const struct v4l2_format * const f)
-+{
-+    int fd;
-+    struct qent_base *be;
-+    int rc;
-+    const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-+    struct v4l2_buffer buffer = {
-+        .type =  f->type,
-+        .memory = V4L2_MEMORY_DMABUF
-+    };
-+    if (mp) {
-+        // Multiplanar: supply the plane array and its element count
-+        buffer.length = f->fmt.pix_mp.num_planes;
-+        buffer.m.planes = planes;
-+    }
-+
-+    // Retry while interrupted
-+    while ((rc = ioctl(vfd, VIDIOC_DQBUF, &buffer)) != 0 &&
-+           errno == EINTR)
-+        /* Loop */;
-+    if (rc) {
-+        request_log("Error DQing buffer type %d: %s\n", f->type, strerror(errno));
-+        return NULL;
-+    }
-+
-+    // The dmabuf fd of plane 0 identifies the entry
-+    fd = mp ? planes[0].m.fd : buffer.m.fd;
-+    be = queue_find_extract_fd(bp, fd);
-+    if (!be) {
-+        request_log("Failed to find fd %d in Q\n", fd);
-+        return NULL;
-+    }
-+
-+    be->timestamp = buffer.timestamp;
-+    be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
-+    return be;
-+}
-+
-+/*
-+ * Mark a destination entry as no longer waiting, wake every thread blocked
-+ * on its condition variable, then drop one reference to it.
-+ */
-+static void qe_dst_done(struct qent_dst * dst_be)
-+{
-+    pthread_mutex_lock(&dst_be->lock);
-+    dst_be->waiting = false;
-+    pthread_cond_broadcast(&dst_be->cond);
-+    pthread_mutex_unlock(&dst_be->lock);
-+
-+    // Unref last: it may put the entry on the free Q or free it
-+    qent_dst_unref(&dst_be);
-+}
-+
-+/* Set the entry's waiting flag under its lock and return the flag's previous value */
-+static bool qe_dst_waiting(struct qent_dst *const dst_be)
-+{
-+    bool was_waiting;
-+
-+    pthread_mutex_lock(&dst_be->lock);
-+    was_waiting = dst_be->waiting;
-+    dst_be->waiting = true;
-+    pthread_mutex_unlock(&dst_be->lock);
-+
-+    return was_waiting;
-+}
-+
-+
-+/* True when either the src or the dst pool has entries on its in-use list */
-+static bool mediabufs_wants_poll(const struct mediabufs_ctl *const mbc)
-+{
-+    if (queue_is_inuse(mbc->src))
-+        return true;
-+    return queue_is_inuse(mbc->dst);
-+}
-+
-+/*
-+ * Poll callback for the V4L2 fd. v is the struct mediabufs_ctl.
-+ * revents == 0 is reported as a timeout. POLLOUT dequeues a src buffer,
-+ * POLLIN dequeues a dst buffer; the poll task is re-armed while either
-+ * in-use list is non-empty.
-+ */
-+static void mediabufs_poll_cb(void * v, short revents)
-+{
-+    struct mediabufs_ctl *mbc = v;
-+    struct qent_src *src_be = NULL;
-+    struct qent_dst *dst_be = NULL;
-+
-+    if (!revents)
-+        request_err(mbc->dc, "%s: Timeout\n", __func__);
-+
-+    pthread_mutex_lock(&mbc->lock);
-+    mbc->polling = false;
-+
-+    if ((revents & POLLOUT) != 0)
-+        src_be = base_to_src(qe_dequeue(mbc->src, mbc->vfd, &mbc->src_fmt));
-+    if ((revents & POLLIN) != 0)
-+        dst_be = base_to_dst(qe_dequeue(mbc->dst, mbc->vfd, &mbc->dst_fmt));
-+
-+    /* Reschedule */
-+    if (mediabufs_wants_poll(mbc)) {
-+        mbc->polling = true;
-+        pollqueue_add_task(mbc->pt, 2000);
-+    }
-+    pthread_mutex_unlock(&mbc->lock);
-+
-+    // Hand the dequeued entries back after mbc->lock has been released
-+    if (src_be)
-+        queue_put_free(mbc->src, &src_be->base);
-+    if (dst_be)
-+        qe_dst_done(dst_be);
-+}
-+
-+/* Copy *timestamp into the source entry's base timestamp. Always returns 0. */
-+int qent_src_params_set(struct qent_src *const be_src, const struct timeval * timestamp)
-+{
-+    be_src->base.timestamp = *timestamp;
-+    return 0;
-+}
-+
-+/* Return a copy of the destination entry's base timestamp */
-+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst)
-+{
-+    const struct timeval ts = be_dst->base.timestamp;
-+    return ts;
-+}
-+
-+/*
-+ * Ensure plane 0 of be can hold len bytes. If there is no buffer or it is
-+ * too small, it is replaced via dmabuf_realloc() with round_up_size(len).
-+ * Returns 0 on success, -ENOMEM when dbsc is NULL or the realloc fails.
-+ */
-+static int qent_base_realloc(struct qent_base *const be, const size_t len, struct dmabufs_ctl * dbsc)
-+{
-+    size_t newsize;
-+
-+    /* Existing buffer is big enough - nothing to do */
-+    if (be->dh[0] && len <= dmabuf_size(be->dh[0]))
-+        return 0;
-+
-+    newsize = round_up_size(len);
-+    request_log("%s: Overrun %zd > %zd; trying %zd\n", __func__, len, dmabuf_size(be->dh[0]), newsize);
-+
-+    if (dbsc == NULL) {
-+        request_log("%s: No dmbabuf_ctrl for realloc\n", __func__);
-+        return -ENOMEM;
-+    }
-+
-+    be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], newsize);
-+    if (be->dh[0] == NULL) {
-+        request_log("%s: Realloc %zd failed\n", __func__, newsize);
-+        return -ENOMEM;
-+    }
-+    return 0;
-+}
-+
-+/* Ensure the source entry's plane-0 buffer can hold len bytes; see qent_base_realloc for return values */
-+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc)
-+{
-+    return qent_base_realloc(&be_src->base, len, dbsc);
-+}
-+
-+
-+/*
-+ * Copy len bytes from src into plane 0 of the source entry at byte offset
-+ * `offset`, then set the dmabuf's length to len.
-+ *
-+ * The buffer is grown first if offset + len does not fit, but only when the
-+ * entry is not fixed-size and offset is 0 (realloc does not preserve data).
-+ *
-+ * Returns 0 on success, the qent_base_realloc() error if the buffer cannot
-+ * be sized, or -1 if the buffer cannot be mapped.
-+ */
-+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc)
-+{
-+    void * dst;
-+    struct qent_base *const be = &be_src->base;
-+    int rv;
-+
-+    // Realloc doesn't copy so don't alloc if offset != 0
-+    if ((rv = qent_base_realloc(be, offset + len,
-+                                be_src->fixed_size || offset ? NULL : dbsc)) != 0)
-+        return rv;
-+
-+    dmabuf_write_start(be->dh[0]);
-+    dst = dmabuf_map(be->dh[0]);
-+    if (!dst) {
-+        // Balance the write_start above before bailing out
-+        dmabuf_write_end(be->dh[0]);
-+        return -1;
-+    }
-+    memcpy((char*)dst + offset, src, len);
-+    dmabuf_len_set(be->dh[0], len);
-+    dmabuf_write_end(be->dh[0]);
-+    return 0;
-+}
-+
-+/* Return the dmabuf handle for the given plane, or NULL when plane is out of range */
-+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be_dst, unsigned int plane)
-+{
-+    const struct qent_base *const be = &be_dst->base;
-+    const size_t n_planes = sizeof(be->dh)/sizeof(be->dh[0]);
-+
-+    if (plane >= n_planes)
-+        return NULL;
-+    return be->dh[plane];
-+}
-+
-+/* Return dup() of the fd of the given plane's dmabuf */
-+int qent_dst_dup_fd(const struct qent_dst *const be_dst, unsigned int plane)
-+{
-+    const struct dmabuf_h *const dh = qent_dst_dmabuf(be_dst, plane);
-+    const int fd = dmabuf_fd(dh);
-+
-+    return dup(fd);
-+}
-+
-+/*
-+ * Queue a destination buffer (optional) and a source buffer on the V4L2
-+ * device, arm polling if needed and start the media request.
-+ *
-+ * pmreq     in: request to start; *pmreq is always set to NULL
-+ * psrc_be   in: source entry to queue; *psrc_be is always set to NULL
-+ * dst_be    destination entry to queue, or NULL for none
-+ * is_final  when false the src buffer is queued with the hold-capture flag
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS, or MEDIABUFS_ERROR_OPERATION_FAILED if
-+ * queuing or starting the request fails.
-+ */
-+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
-+                struct media_request **const pmreq,
-+                struct qent_src **const psrc_be,
-+                struct qent_dst *const dst_be,
-+                const bool is_final)
-+{
-+    struct media_request * mreq = *pmreq;
-+    struct qent_src *const src_be = *psrc_be;
-+
-+    // Req & src are always both "consumed"
-+    *pmreq = NULL;
-+    *psrc_be = NULL;
-+
-+    pthread_mutex_lock(&mbc->lock);
-+
-+    if (!src_be)
-+        goto fail1;
-+
-+    if (dst_be) {
-+        // qe_dst_waiting sets the waiting flag and returns its old value
-+        if (qe_dst_waiting(dst_be)) {
-+            request_info(mbc->dc, "Request buffer already waiting on start\n");
-+            goto fail1;
-+        }
-+        dst_be->base.timestamp = (struct timeval){0,0};
-+        if (qe_v4l2_queue(&dst_be->base, mbc->vfd, NULL, &mbc->dst_fmt, true, false))
-+            goto fail1;
-+
-+        // Extra ref for the in-use Q; dropped by qe_dst_done
-+        qent_dst_ref(dst_be);
-+        queue_put_inuse(mbc->dst, &dst_be->base);
-+    }
-+
-+    if (qe_v4l2_queue(&src_be->base, mbc->vfd, mreq, &mbc->src_fmt, false, !is_final))
-+        goto fail1;
-+    queue_put_inuse(mbc->src, &src_be->base);
-+
-+    if (!mbc->polling && mediabufs_wants_poll(mbc)) {
-+        mbc->polling = true;
-+        pollqueue_add_task(mbc->pt, 2000);
-+    }
-+    pthread_mutex_unlock(&mbc->lock);
-+
-+    if (media_request_start(mreq))
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+
-+fail1:
-+    // Reached with mbc->lock held
-+    media_request_abort(&mreq);
-+    if (src_be)
-+        queue_put_free(mbc->src, &src_be->base);
-+
-+// *** TODO: If src Q fails this doesnt unwind properly - separate dst Q from src Q
-+    if (dst_be) {
-+        dst_be->base.status = QENT_ERROR;
-+        qe_dst_done(dst_be);
-+    }
-+    pthread_mutex_unlock(&mbc->lock);
-+    return MEDIABUFS_ERROR_OPERATION_FAILED;
-+}
-+
-+
-+/*
-+ * (Re)allocate the dmabufs of be to the sizes given by fmt: one buffer per
-+ * plane for multiplanar formats, otherwise a single buffer of
-+ * fmt.pix.sizeimage. On a multiplanar failure the planes allocated so far
-+ * are freed. Returns 0 on success, -1 on allocation failure.
-+ */
-+static int qe_alloc_from_fmt(struct qent_base *const be,
-+                   struct dmabufs_ctl *const dbsc,
-+                   const struct v4l2_format *const fmt)
-+{
-+    unsigned int plane;
-+
-+    if (!V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        const size_t size = fmt->fmt.pix.sizeimage;
-+
-+        be->dh[0] = dmabuf_realloc(dbsc, be->dh[0], size);
-+        return be->dh[0] ? 0 : -1;
-+    }
-+
-+    for (plane = 0; plane != fmt->fmt.pix_mp.num_planes; ++plane) {
-+        be->dh[plane] = dmabuf_realloc(dbsc, be->dh[plane],
-+            fmt->fmt.pix_mp.plane_fmt[plane].sizeimage);
-+        if (be->dh[plane])
-+            continue;
-+
-+        /* Failed: release the planes already done, highest first */
-+        while (plane != 0) {
-+            --plane;
-+            dmabuf_free(be->dh[plane]);
-+            be->dh[plane] = NULL;
-+        }
-+        return -1;
-+    }
-+    return 0;
-+}
-+
-+/*
-+ * Fill *fmt for buftype/pixfmt/width/height (and bufsize when non-zero for
-+ * multiplanar, always for single-planar) and apply it with VIDIOC_S_FMT.
-+ *
-+ * Returns MEDIABUFS_ERROR_OPERATION_FAILED if the ioctl fails,
-+ * MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE if the resulting format is smaller
-+ * than requested or has a different pixel format, else
-+ * MEDIABUFS_STATUS_SUCCESS.
-+ */
-+static MediaBufsStatus fmt_set(struct v4l2_format *const fmt, const int fd,
-+            const enum v4l2_buf_type buftype,
-+            uint32_t pixfmt,
-+            const unsigned int width, const unsigned int height,
-+                               const size_t bufsize)
-+{
-+    const bool multiplanar = V4L2_TYPE_IS_MULTIPLANAR(buftype);
-+    bool acceptable;
-+
-+    *fmt = (struct v4l2_format){.type = buftype};
-+
-+    if (!multiplanar) {
-+        fmt->fmt.pix.width = width;
-+        fmt->fmt.pix.height = height;
-+        fmt->fmt.pix.pixelformat = pixfmt;
-+        fmt->fmt.pix.sizeimage = bufsize;
-+    }
-+    else {
-+        fmt->fmt.pix_mp.width = width;
-+        fmt->fmt.pix_mp.height = height;
-+        fmt->fmt.pix_mp.pixelformat = pixfmt;
-+        if (bufsize) {
-+            fmt->fmt.pix_mp.num_planes = 1;
-+            fmt->fmt.pix_mp.plane_fmt[0].sizeimage = bufsize;
-+        }
-+    }
-+
-+    /* Retry only when interrupted */
-+    while (ioctl(fd, VIDIOC_S_FMT, fmt)) {
-+        if (errno != EINTR)
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+    }
-+
-+    /* Anything less than what was asked for counts as a failure */
-+    if (multiplanar)
-+        acceptable = fmt->fmt.pix_mp.width >= width &&
-+                     fmt->fmt.pix_mp.height >= height &&
-+                     fmt->fmt.pix_mp.pixelformat == pixfmt;
-+    else
-+        acceptable = fmt->fmt.pix.width >= width &&
-+                     fmt->fmt.pix.height >= height &&
-+                     fmt->fmt.pix.pixelformat == pixfmt;
-+
-+    return acceptable ? MEDIABUFS_STATUS_SUCCESS : MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
-+}
-+
-+/*
-+ * Enumerate the formats of type_v4l2 on fd and set the first one that has
-+ * all flags_must bits, none of the flags_not bits, is accepted by accept_fn
-+ * and can be applied at width x height via fmt_set().
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS once a format has been set, or
-+ * MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE when VIDIOC_ENUM_FMT fails
-+ * (which includes running off the end of the format list).
-+ */
-+static MediaBufsStatus find_fmt_flags(struct v4l2_format *const fmt,
-+                   const int fd,
-+                   const unsigned int type_v4l2,
-+                   const uint32_t flags_must,
-+                   const uint32_t flags_not,
-+                   const unsigned int width,
-+                   const unsigned int height,
-+                   mediabufs_dst_fmt_accept_fn *const accept_fn,
-+                   void *const accept_v)
-+{
-+    unsigned int idx = 0;
-+
-+    for (;; ++idx) {
-+        struct v4l2_fmtdesc desc = {
-+            .index = idx,
-+            .type = type_v4l2
-+        };
-+
-+        while (ioctl(fd, VIDIOC_ENUM_FMT, &desc)) {
-+            if (errno != EINTR)
-+                return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
-+        }
-+
-+        /* Flag filter, then caller's filter, then try to set it */
-+        if ((desc.flags & flags_must) == flags_must &&
-+            !(desc.flags & flags_not) &&
-+            accept_fn(accept_v, &desc) &&
-+            fmt_set(fmt, fd, desc.type, desc.pixelformat,
-+                    width, height, 0) == MEDIABUFS_STATUS_SUCCESS)
-+            return MEDIABUFS_STATUS_SUCCESS;
-+    }
-+    return 0;
-+}
-+
-+
-+/*
-+ * Block until the destination entry's waiting flag is cleared (or
-+ * pthread_cond_wait fails), then translate its status:
-+ * QENT_DONE -> success, QENT_ERROR -> decoding error, anything else ->
-+ * operation failed.
-+ */
-+
-+MediaBufsStatus qent_dst_wait(struct qent_dst *const be_dst)
-+{
-+    enum qent_status final_status;
-+
-+    pthread_mutex_lock(&be_dst->lock);
-+    while (be_dst->waiting) {
-+        if (pthread_cond_wait(&be_dst->cond, &be_dst->lock) != 0)
-+            break;
-+    }
-+    final_status = be_dst->base.status;
-+    pthread_mutex_unlock(&be_dst->lock);
-+
-+    switch (final_status) {
-+        case QENT_DONE:
-+            return MEDIABUFS_STATUS_SUCCESS;
-+        case QENT_ERROR:
-+            return MEDIABUFS_ERROR_DECODING_ERROR;
-+        default:
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+    }
-+}
-+
-+/*
-+ * Return the dmabuf_map() result for plane buf_no of the destination entry.
-+ * Returns NULL when buf_no is outside the entry's plane array, matching the
-+ * range check done by qent_dst_dmabuf().
-+ */
-+const uint8_t * qent_dst_data(struct qent_dst *const be_dst, unsigned int buf_no)
-+{
-+    struct qent_base *const be = &be_dst->base;
-+
-+    // Don't index past dh[]
-+    if (buf_no >= sizeof(be->dh)/sizeof(be->dh[0]))
-+        return NULL;
-+    return dmabuf_map(be->dh[buf_no]);
-+}
-+
-+/*
-+ * Call dmabuf_read_start() on each attached plane in order. If one fails,
-+ * dmabuf_read_end() is called on the planes already started and
-+ * MEDIABUFS_ERROR_ALLOCATION_FAILED is returned.
-+ */
-+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be_dst)
-+{
-+    struct qent_base *const be = &be_dst->base;
-+    unsigned int plane = 0;
-+
-+    while (plane != VIDEO_MAX_PLANES && be->dh[plane]) {
-+        if (dmabuf_read_start(be->dh[plane])) {
-+            /* Undo the planes started so far, highest first */
-+            while (plane != 0) {
-+                --plane;
-+                dmabuf_read_end(be->dh[plane]);
-+            }
-+            return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+        }
-+        ++plane;
-+    }
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Call dmabuf_read_end() on every attached plane. All planes are processed
-+ * even after a failure; returns MEDIABUFS_ERROR_OPERATION_FAILED if any
-+ * call failed, otherwise MEDIABUFS_STATUS_SUCCESS.
-+ */
-+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be_dst)
-+{
-+    struct qent_base *const be = &be_dst->base;
-+    MediaBufsStatus result = MEDIABUFS_STATUS_SUCCESS;
-+    unsigned int plane = 0;
-+
-+    while (plane != VIDEO_MAX_PLANES && be->dh[plane]) {
-+        if (dmabuf_read_end(be->dh[plane]) != 0)
-+            result = MEDIABUFS_ERROR_OPERATION_FAILED;
-+        ++plane;
-+    }
-+    return result;
-+}
-+
-+/* Add one reference to be_dst (NULL is passed through untouched) and return it */
-+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst)
-+{
-+    if (be_dst == NULL)
-+        return NULL;
-+
-+    atomic_fetch_add(&be_dst->base.ref_count, 1);
-+    return be_dst;
-+}
-+
-+/*
-+ * Drop one reference to *pbe_dst and set *pbe_dst to NULL.
-+ * When the last reference goes, the entry is put back on the owning
-+ * mediabufs_ctl's dst free Q if the weak link can still be locked,
-+ * otherwise it is freed. A NULL *pbe_dst is a no-op.
-+ */
-+void qent_dst_unref(struct qent_dst ** const pbe_dst)
-+{
-+    struct qent_dst * const be_dst = *pbe_dst;
-+    struct mediabufs_ctl * mbc;
-+    if (!be_dst)
-+        return;
-+    *pbe_dst = NULL;
-+
-+    // ref_count 0 means one ref, so a previous value of 0 was the last ref
-+    if (atomic_fetch_sub(&be_dst->base.ref_count, 1) != 0)
-+        return;
-+
-+    if ((mbc = ff_weak_link_lock(&be_dst->mbc_wl)) != NULL) {
-+        queue_put_free(mbc->dst, &be_dst->base);
-+        ff_weak_link_unlock(be_dst->mbc_wl);
-+    }
-+    else {
-+        // No mediabufs_ctl behind the link any more - free the entry
-+        qe_dst_free(be_dst);
-+    }
-+}
-+
-+/*
-+ * Attach an externally supplied dmabuf fd of the given size to one plane of
-+ * a destination entry.
-+ *
-+ * Returns MEDIABUFS_ERROR_OPERATION_FAILED if plane is out of range, the
-+ * entry is not in QENT_IMPORT state or the plane already has a buffer;
-+ * MEDIABUFS_ERROR_ALLOCATION_FAILED if dmabuf_import() fails;
-+ * MEDIABUFS_STATUS_SUCCESS otherwise.
-+ */
-+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
-+                unsigned int plane,
-+                int fd, size_t size)
-+{
-+    struct qent_base *const be = &be_dst->base;
-+    struct dmabuf_h * dh;
-+
-+    // Range check first so dh[plane] is never indexed out of bounds
-+    if (plane >= sizeof(be->dh)/sizeof(be->dh[0]))
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    if (be->status != QENT_IMPORT || be->dh[plane])
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    dh = dmabuf_import(fd, size);
-+    if (!dh)
-+        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+
-+    be->dh[plane] = dh;
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+// Create up to n V4L2 DMABUF buffer slots for the dst format with
-+// VIDIOC_CREATE_BUFS and store the resulting buffer indices in qes[0..].
-+// qes must hold at least n valid entries.
-+// Returns noof buffers created, -ve (negated errno) for error
-+static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, struct qent_dst * const qes[])
-+{
-+    unsigned int i;
-+
-+    struct v4l2_create_buffers cbuf = {
-+        .count = n,
-+        .memory = V4L2_MEMORY_DMABUF,
-+        .format = mbc->dst_fmt,
-+    };
-+
-+    while (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf)) {
-+        // Keep errno positive here so the EINTR test works and the
-+        // value returned below really is negative
-+        const int err = errno;
-+        if (err != EINTR) {
-+            request_err(mbc->dc, "%s: Failed to create V4L2 buffer\n", __func__);
-+            return -err;
-+        }
-+    }
-+
-+    if (cbuf.count != n)
-+        request_warn(mbc->dc, "%s: Created %d of %d V4L2 buffers requested\n", __func__, cbuf.count, n);
-+
-+    // Never index past the n entries the caller supplied
-+    for (i = 0; i != cbuf.count && i != n; ++i)
-+        qes[i]->base.index = cbuf.index + i;
-+
-+    return cbuf.count;
-+}
-+
-+/*
-+ * Obtain a destination entry with dmabufs sized for the current dst format.
-+ *
-+ * With mbc == NULL a bare entry in QENT_IMPORT state is returned (no
-+ * buffers attached). Otherwise an entry is taken from the dst free Q -
-+ * blocking if the Q is fixed size, else creating a new entry plus V4L2
-+ * buffer slot when none is free.
-+ * Returns NULL on failure; on success status is QENT_PENDING and
-+ * ref_count is reset to 0 (a single reference).
-+ */
-+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
-+{
-+    struct qent_dst * be_dst;
-+
-+    if (mbc == NULL) {
-+        be_dst = qe_dst_new(NULL);
-+        if (be_dst)
-+            be_dst->base.status = QENT_IMPORT;
-+        return be_dst;
-+    }
-+
-+    if (mbc->dst_fixed) {
-+        be_dst = base_to_dst(queue_get_free(mbc->dst));
-+        if (!be_dst)
-+            return NULL;
-+    }
-+    else {
-+        be_dst = base_to_dst(queue_tryget_free(mbc->dst));
-+        if (!be_dst) {
-+            // Nothing free - grow the Q by one entry
-+            be_dst = qe_dst_new(mbc->this_wlm);
-+            if (!be_dst)
-+                return NULL;
-+
-+            if (create_dst_bufs(mbc, 1, &be_dst) != 1) {
-+                qe_dst_free(be_dst);
-+                return NULL;
-+            }
-+        }
-+    }
-+
-+    if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
-+        /* Given  how create buf works we can't uncreate it on alloc failure
-+         * all we can do is put it on the free Q
-+        */
-+        queue_put_free(mbc->dst, &be_dst->base);
-+        return NULL;
-+    }
-+
-+    be_dst->base.status = QENT_PENDING;
-+    atomic_store(&be_dst->base.ref_count, 0);
-+    return be_dst;
-+}
-+
-+/* Return a pointer to the destination format stored inside mbc */
-+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc)
-+{
-+    const struct v4l2_format *const fmt = &mbc->dst_fmt;
-+    return fmt;
-+}
-+
-+/*
-+ * Negotiate the destination format for width x height. Formats without
-+ * V4L2_FMT_FLAG_EMULATED are tried first, then emulated ones; accept_fn
-+ * filters the candidates. The buffer type is taken from the current
-+ * dst_fmt before it is overwritten.
-+ *
-+ * Returns the first status from find_fmt_flags() that is not
-+ * MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE, or that error if every pass
-+ * reports it.
-+ */
-+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
-+               const unsigned int width,
-+               const unsigned int height,
-+               mediabufs_dst_fmt_accept_fn *const accept_fn,
-+               void *const accept_v)
-+{
-+    const enum v4l2_buf_type buf_type = mbc->dst_fmt.type;
-+    static const struct {
-+        unsigned int flags_must;
-+        unsigned int flags_not;
-+    } trys[] = {
-+        {0, V4L2_FMT_FLAG_EMULATED},
-+        {V4L2_FMT_FLAG_EMULATED, 0},
-+    };
-+    const unsigned int n_trys = sizeof(trys)/sizeof(trys[0]);
-+    unsigned int pass;
-+
-+    for (pass = 0; pass != n_trys; ++pass) {
-+        const MediaBufsStatus status =
-+            find_fmt_flags(&mbc->dst_fmt, mbc->vfd,
-+                           buf_type,
-+                           trys[pass].flags_must,
-+                           trys[pass].flags_not,
-+                           width, height, accept_fn, accept_v);
-+        if (status != MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE)
-+            return status;
-+    }
-+
-+    /* Every pass came back unsupported */
-+    return MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE;
-+}
-+
-+// Create n destination entries with matching V4L2 buffer slots and put
-+// them on the dst free Q; on success dst_fixed is set to `fixed`.
-+// n may be at most 32.
-+// Returns MEDIABUFS_STATUS_SUCCESS only if all n slots were created,
-+// otherwise MEDIABUFS_ERROR_ALLOCATION_FAILED. A count above n reported by
-+// create_dst_bufs is treated as a failure rather than used as an index.
-+// ** This is a mess if we get partial alloc but without any way to remove
-+//    individual V4L2 Q members we are somewhat stuffed
-+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
-+{
-+    unsigned int i;
-+    int a = 0;
-+    unsigned int created = 0;   // Entries handed over to the free Q
-+    unsigned int qc;
-+    struct qent_dst * qes[32];
-+
-+    if (n > 32)
-+        return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+
-+    // Create qents first as it is hard to get rid of the V4L2 buffers on error
-+    for (qc = 0; qc != n; ++qc)
-+    {
-+        if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
-+            goto fail;
-+    }
-+
-+    a = create_dst_bufs(mbc, n, qes);
-+    // Only qes[0..n-1] exist, so a count outside 0..n cannot be trusted
-+    if (a < 0 || (unsigned int)a > n)
-+        goto fail;
-+    created = (unsigned int)a;
-+
-+    for (i = 0; i != created; ++i)
-+        queue_put_free(mbc->dst, &qes[i]->base);
-+
-+    if (created != n)
-+        goto fail;
-+
-+    mbc->dst_fixed = fixed;
-+    return MEDIABUFS_STATUS_SUCCESS;
-+
-+fail:
-+    // Free the qents that were not put on the free Q
-+    for (i = created; i != qc; ++i)
-+        qe_dst_free(qes[i]);
-+
-+    return MEDIABUFS_ERROR_ALLOCATION_FAILED;
-+}
-+
-+/*
-+ * Take a source entry from the src free Q (may block in queue_get_free)
-+ * and mark it QENT_PENDING.
-+ * Returns NULL if no entry could be obtained.
-+ */
-+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc)
-+{
-+    struct qent_base * buf = queue_get_free(mbc->src);
-+
-+    // queue_get_free returns NULL if the wait fails or the Q is empty
-+    if (!buf)
-+        return NULL;
-+
-+    buf->status = QENT_PENDING;
-+    return base_to_src(buf);
-+}
-+
-+/* Return an unused source entry to the src free Q and set *pqe_src to NULL; NULL entry is a no-op */
-+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src)
-+{
-+    struct qent_src *const ent = *pqe_src;
-+
-+    if (ent != NULL) {
-+        *pqe_src = NULL;
-+        queue_put_free(mbc->src, &ent->base);
-+    }
-+}
-+
-+/*
-+ * Create the source buffer pool: request n DMABUF buffers from V4L2 with
-+ * VIDIOC_REQBUFS, then create one source entry per granted buffer with
-+ * dmabufs sized from src_fmt and put each on the src free Q.
-+ * Entries already on the src free list are freed first.
-+ *
-+ * Returns MEDIABUFS_STATUS_SUCCESS, or MEDIABUFS_ERROR_OPERATION_FAILED
-+ * after freeing the entries and requesting 0 buffers again.
-+ */
-+/* src format must have been set up before this */
-+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
-+                  struct dmabufs_ctl * const dbsc,
-+                  unsigned int n)
-+{
-+    unsigned int i;
-+    struct v4l2_requestbuffers req = {
-+        .count = n,
-+        .type = mbc->src_fmt.type,
-+        .memory = V4L2_MEMORY_DMABUF
-+    };
-+
-+    bq_free_all_free_src(mbc->src);
-+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
-+        if (errno != EINTR) {
-+            request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+    }
-+
-+    // The driver may grant fewer buffers than asked for
-+    if (n > req.count) {
-+        request_info(mbc->dc, "Only allocated %d of %d src buffers requested\n", req.count, n);
-+        n = req.count;
-+    }
-+
-+    for (i = 0; i != n; ++i) {
-+        struct qent_src *const be_src = qe_src_new();
-+        if (!be_src) {
-+            request_err(mbc->dc, "Failed to create src be %d\n", i);
-+            goto fail;
-+        }
-+        if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
-+            qe_src_free(be_src);
-+            goto fail;
-+        }
-+        be_src->base.index = i;
-+        be_src->fixed_size = !mediabufs_src_resizable(mbc);
-+
-+        queue_put_free(mbc->src, &be_src->base);
-+    }
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+
-+fail:
-+    // NOTE(review): entries are removed here without waiting on free_sem,
-+    // so the semaphore count may exceed the list length afterwards - confirm
-+    bq_free_all_free_src(mbc->src);
-+    req.count = 0;
-+    while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1 &&
-+           errno == EINTR)
-+        /* Loop */;
-+
-+    return MEDIABUFS_ERROR_OPERATION_FAILED;
-+}
-+
-+
-+
-+/*
-+ * Set stuff order:
-+ *  Set src fmt
-+ *  Set parameters (sps) on vfd
-+ *  Negotiate dst format (dst_fmt_set)
-+ *  Create src buffers
-+ *  Alloc a dst buffer or Create dst slots
-+*/
-+/*
-+ * Turn streaming on for the src queue, then the dst queue. If the dst
-+ * queue fails the src queue is turned off again. A no-op returning success
-+ * when streaming is already on.
-+ */
-+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc)
-+{
-+    if (!mbc->stream_on) {
-+        if (set_stream(mbc->vfd, mbc->src_fmt.type, true) < 0) {
-+            request_log("Failed to set stream on src type %d\n", mbc->src_fmt.type);
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        if (set_stream(mbc->vfd, mbc->dst_fmt.type, true) < 0) {
-+            request_log("Failed to set stream on dst type %d\n", mbc->dst_fmt.type);
-+            /* Undo the src side */
-+            set_stream(mbc->vfd, mbc->src_fmt.type, false);
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        mbc->stream_on = true;
-+    }
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Turn streaming off for the dst queue, then the src queue. Both are
-+ * attempted even if the first fails and stream_on is cleared regardless.
-+ * Returns MEDIABUFS_ERROR_OPERATION_FAILED if either call failed; a no-op
-+ * returning success when streaming is already off.
-+ */
-+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc)
-+{
-+    MediaBufsStatus result = MEDIABUFS_STATUS_SUCCESS;
-+
-+    if (mbc->stream_on) {
-+        if (set_stream(mbc->vfd, mbc->dst_fmt.type, false) < 0) {
-+            request_log("Failed to set stream off dst type %d\n", mbc->dst_fmt.type);
-+            result = MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        if (set_stream(mbc->vfd, mbc->src_fmt.type, false) < 0) {
-+            request_log("Failed to set stream off src type %d\n", mbc->src_fmt.type);
-+            result = MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+
-+        mbc->stream_on = false;
-+    }
-+    return result;
-+}
-+
-+/*
-+ * Apply n extended controls with VIDIOC_S_EXT_CTRLS. When mreq is non-NULL
-+ * the controls are set on that request (V4L2_CTRL_WHICH_REQUEST_VAL).
-+ * Returns 0 on success or -errno; EINTR is retried.
-+ */
-+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq, struct v4l2_ext_control control_array[], unsigned int n)
-+{
-+    struct v4l2_ext_controls ctrls = {
-+        .controls = control_array,
-+        .count = n
-+    };
-+
-+    if (mreq != NULL) {
-+        ctrls.which = V4L2_CTRL_WHICH_REQUEST_VAL;
-+        ctrls.request_fd = media_request_fd(mreq);
-+    }
-+
-+    for (;;) {
-+        int err;
-+
-+        if (!ioctl(mbc->vfd, VIDIOC_S_EXT_CTRLS, &ctrls))
-+            break;
-+
-+        err = errno;
-+        if (err != EINTR) {
-+            request_err(mbc->dc, "Unable to set controls: %s\n", strerror(err));
-+            return -err;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+/*
-+ * Set a single pointer-type extended control (id, data, size), optionally
-+ * on request mreq. Maps the result of mediabufs_ctl_set_ext_ctrls() to
-+ * MEDIABUFS_STATUS_SUCCESS or MEDIABUFS_ERROR_OPERATION_FAILED.
-+ */
-+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
-+                struct media_request * const mreq,
-+                unsigned int id, void *data,
-+                unsigned int size)
-+{
-+    struct v4l2_ext_control ctrl = {
-+        .id = id,
-+        .ptr = data,
-+        .size = size
-+    };
-+
-+    if (mediabufs_ctl_set_ext_ctrls(mbc, mreq, &ctrl, 1) != 0)
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+/*
-+ * Set the source format on the device via fmt_set(), storing it in
-+ * mbc->src_fmt. Logs an error on failure and returns fmt_set()'s status.
-+ */
-+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
-+                                      enum v4l2_buf_type buf_type,
-+                   const uint32_t pixfmt,
-+                   const uint32_t width, const uint32_t height,
-+                                      const size_t bufsize)
-+{
-+    const MediaBufsStatus status =
-+        fmt_set(&mbc->src_fmt, mbc->vfd, buf_type, pixfmt, width, height, bufsize);
-+
-+    if (status == MEDIABUFS_STATUS_SUCCESS)
-+        return status;
-+
-+    request_err(mbc->dc, "Failed to set src buftype %d, format %#x %dx%d\n", buf_type, pixfmt, width, height);
-+    return status;
-+}
-+
-+/*
-+ * Query n extended controls in place.  Each query is retried on EINTR.
-+ * An entry that fails has its type set to 0 (invalid) and the remaining
-+ * entries are still queried.
-+ * Returns 0 if every query succeeded, otherwise -errno of the last failure.
-+ */
-+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n)
-+{
-+    int last_err = 0;
-+    unsigned int i;
-+
-+    for (i = 0; i != n; ++i) {
-+        struct v4l2_query_ext_ctrl *const qc = ctrls + i;
-+
-+        for (;;) {
-+            int err;
-+
-+            if (ioctl(mbc->vfd, VIDIOC_QUERY_EXT_CTRL, qc) == 0)
-+                break;
-+
-+            err = errno;
-+            if (err == EINTR)
-+                continue;
-+
-+            // Often used for probing - errors are to be expected
-+            request_debug(mbc->dc, "Failed to query ext id=%#x, err=%d\n", qc->id, err);
-+            qc->type = 0; // 0 is invalid
-+            last_err = -err;
-+            break;
-+        }
-+    }
-+    return last_err;
-+}
-+
-+/*
-+ * Returns non-zero if source buffers may be larger than the negotiated
-+ * size, i.e. if the source queue type is multiplanar.
-+ */
-+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
-+{
-+    // Single planar OUTPUT can only take exact size buffers
-+    // Multiplanar will take larger than negotiated
-+    const int multiplanar = V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
-+
-+    return multiplanar;
-+}
-+
-+/*
-+ * Destroy a mediabufs_ctl once its last reference has gone
-+ * (called from mediabufs_ctl_unref).  Safe to call with NULL.
-+ *
-+ * The order matters: the weak link is broken and the poll task removed
-+ * before streaming is stopped and the buffers / queues are released.
-+ */
-+static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
-+{
-+    if (!mbc)
-+        return;
-+
-+    // Break the weak link first
-+    ff_weak_link_break(&mbc->this_wlm);
-+
-+    // Remove our poll task from the pollqueue before tearing anything down
-+    polltask_delete(&mbc->pt);
-+
-+    mediabufs_stream_off(mbc);
-+
-+    // Empty v4l2 buffer stash
-+    request_buffers(mbc->vfd, mbc->src_fmt.type, V4L2_MEMORY_MMAP, 0);
-+    request_buffers(mbc->vfd, mbc->dst_fmt.type, V4L2_MEMORY_MMAP, 0);
-+
-+    bq_free_all_free_src(mbc->src);
-+    bq_free_all_inuse_src(mbc->src);
-+    bq_free_all_free_dst(mbc->dst);
-+
-+    // Any dst entries still in use are flagged as errored, given a zero
-+    // timestamp and completed via qe_dst_done()
-+    {
-+        struct qent_dst *dst_be;
-+        while ((dst_be = base_to_dst(bq_get_inuse(mbc->dst))) != NULL) {
-+            dst_be->base.timestamp = (struct timeval){0};
-+            dst_be->base.status = QENT_ERROR;
-+            qe_dst_done(dst_be);
-+        }
-+    }
-+
-+    queue_delete(mbc->dst);
-+    queue_delete(mbc->src);
-+    close(mbc->vfd);
-+    pthread_mutex_destroy(&mbc->lock);
-+
-+    free(mbc);
-+}
-+
-+/* Take an additional reference on mbc; returns mbc for convenience. */
-+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc)
-+{
-+    (void)atomic_fetch_add(&mbc->ref_count, 1);
-+
-+    return mbc;
-+}
-+
-+/*
-+ * Drop a reference and NULL the caller's pointer.  The object is deleted
-+ * when the last reference goes.  A NULL *pmbc is ignored.
-+ */
-+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
-+{
-+    struct mediabufs_ctl *const mbc = *pmbc;
-+
-+    if (mbc == NULL)
-+        return;
-+
-+    *pmbc = NULL;
-+
-+    // ref_count is zero based: a previous value of 0 means this was the
-+    // last reference
-+    if (atomic_fetch_sub(&mbc->ref_count, 1) == 0)
-+        mediabufs_ctl_delete(mbc);
-+}
-+
-+/*
-+ * Query the device capabilities and pick the src/dst buffer types:
-+ * multiplanar M2M is preferred, then single planar M2M.
-+ * Returns 0 on success, -errno if the query fails, or -EINVAL if the
-+ * device has no M2M capability.
-+ */
-+static int set_capabilities(struct mediabufs_ctl *const mbc)
-+{
-+    struct v4l2_capability cap = { 0 };
-+    uint32_t flags;
-+
-+    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &cap) != 0) {
-+        const int err = errno;
-+        request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
-+        return -err;
-+    }
-+
-+    // Use the per-device caps if the driver reports them
-+    if ((cap.capabilities & V4L2_CAP_DEVICE_CAPS) != 0)
-+        flags = cap.device_caps;
-+    else
-+        flags = cap.capabilities;
-+
-+    if ((flags & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
-+        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
-+        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
-+        return 0;
-+    }
-+
-+    if ((flags & V4L2_CAP_VIDEO_M2M) != 0) {
-+        mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
-+        mbc->dst_fmt.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-+        return 0;
-+    }
-+
-+    request_err(mbc->dc, "No M2M capabilities (%#x)\n", flags);
-+    return -EINVAL;
-+}
-+
-+/*
-+ * One of these per context
-+ *
-+ * Create a mediabufs_ctl on the video device at vpath.
-+ *   dc     logging context passed to the request_* log macros
-+ *   vpath  path of the video device to open; NULL selects a default
-+ *   pq     pollqueue on which the device poll task is created
-+ *
-+ * Returns the new object (holding a single reference), or NULL on failure
-+ * with everything allocated here released again.
-+ */
-+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc, const char * vpath, struct pollqueue *const pq)
-+{
-+    struct mediabufs_ctl *const mbc = calloc(1, sizeof(*mbc));
-+
-+    if (!mbc)
-+        return NULL;
-+
-+    mbc->dc = dc;
-+    mbc->pq = pq;
-+    pthread_mutex_init(&mbc->lock, NULL);
-+
-+    /* Pick a default  - could we scan for this? */
-+    // NOTE(review): the default names a media node although the fd is
-+    // then used for V4L2 video ioctls - confirm this default is intended
-+    if (vpath == NULL)
-+        vpath = "/dev/media0";
-+
-+    // Retry the open if interrupted by a signal
-+    while ((mbc->vfd = open(vpath, O_RDWR)) == -1)
-+    {
-+        const int err = errno;
-+        if (err != EINTR) {
-+            request_err(dc, "Failed to open video dev '%s': %s\n", vpath, strerror(err));
-+            goto fail0;
-+        }
-+    }
-+
-+    if (set_capabilities(mbc)) {
-+        request_err(dc, "Bad capabilities for video dev '%s'\n", vpath);
-+        goto fail1;
-+    }
-+
-+    mbc->src = queue_new(mbc->vfd);
-+    if (!mbc->src)
-+        goto fail1;
-+    mbc->dst = queue_new(mbc->vfd);
-+    if (!mbc->dst)
-+        goto fail2;
-+    mbc->pt = polltask_new(pq, mbc->vfd, POLLIN | POLLOUT, mediabufs_poll_cb, mbc);
-+    if (!mbc->pt)
-+        goto fail3;
-+    mbc->this_wlm = ff_weak_link_new(mbc);
-+    if (!mbc->this_wlm)
-+        goto fail4;
-+
-+    /* Cannot add polltask now - polling with nothing pending
-+     * generates infinite error polls
-+    */
-+    return mbc;
-+
-+    /* Unwind in reverse order of construction */
-+fail4:
-+    polltask_delete(&mbc->pt);
-+fail3:
-+    queue_delete(mbc->dst);
-+fail2:
-+    queue_delete(mbc->src);
-+fail1:
-+    close(mbc->vfd);
-+fail0:
-+    // The mutex was initialised before any failure point so always
-+    // needs destroying here (matches mediabufs_ctl_delete)
-+    pthread_mutex_destroy(&mbc->lock);
-+    free(mbc);
-+    request_info(dc, "%s: FAILED\n", __func__);
-+    return NULL;
-+}
-+
-+
-+
-diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
-new file mode 100644
-index 0000000000..2f826cfb14
---- /dev/null
-+++ b/libavcodec/v4l2_req_media.h
-@@ -0,0 +1,151 @@
-+/*
-+e.h
-+*
-+ * Permission is hereby granted, free of charge, to any person obtaining a
-+ * copy of this software and associated documentation files (the
-+ * "Software"), to deal in the Software without restriction, including
-+ * without limitation the rights to use, copy, modify, merge, publish,
-+ * distribute, sub license, and/or sell copies of the Software, and to
-+ * permit persons to whom the Software is furnished to do so, subject to
-+ * the following conditions:
-+ *
-+ * The above copyright notice and this permission notice (including the
-+ * next paragraph) shall be included in all copies or substantial portions
-+ * of the Software.
-+ *
-+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
-+ * IN NO EVENT SHALL PRECISION INSIGHT AND/OR ITS SUPPLIERS BE LIABLE FOR
-+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-+ */
-+
-+#ifndef _MEDIA_H_
-+#define _MEDIA_H_
-+
-+#include <stdbool.h>
-+#include <stdint.h>
-+
-+struct v4l2_format;
-+struct v4l2_fmtdesc;
-+struct v4l2_query_ext_ctrl;
-+
-+struct pollqueue;
-+struct media_request;
-+struct media_pool;
-+
-+/* Status codes returned by the mediabufs / qent API; zero is success */
-+typedef enum media_buf_status {
-+    MEDIABUFS_STATUS_SUCCESS = 0,
-+    MEDIABUFS_ERROR_OPERATION_FAILED,
-+    MEDIABUFS_ERROR_DECODING_ERROR,
-+    MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
-+    MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
-+    MEDIABUFS_ERROR_ALLOCATION_FAILED,
-+} MediaBufsStatus;
-+
-+struct media_pool * media_pool_new(const char * const media_path,
-+                   struct pollqueue * const pq,
-+                   const unsigned int n);
-+void media_pool_delete(struct media_pool ** pmp);
-+
-+// Obtain a media request
-+// Will block if none available - has a 2sec timeout
-+struct media_request * media_request_get(struct media_pool * const mp);
-+int media_request_fd(const struct media_request * const req);
-+
-+// Start this request
-+// Request structure is returned to pool once done
-+int media_request_start(struct media_request * const req);
-+
-+// Return an *unstarted* media_request to the pool
-+// May later be upgraded to allow for aborting a started req
-+int media_request_abort(struct media_request ** const preq);
-+
-+
-+struct mediabufs_ctl;
-+struct qent_src;
-+struct qent_dst;
-+struct dmabuf_h;
-+struct dmabufs_ctl;
-+
-+int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
-+struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
-+
-+// prealloc
-+int qent_src_alloc(struct qent_src *const be_src, const size_t len, struct dmabufs_ctl * dbsc);
-+// dbsc may be NULL if realloc not required
-+int qent_src_data_copy(struct qent_src *const be_src, const size_t offset, const void *const src, const size_t len, struct dmabufs_ctl * dbsc);
-+const struct dmabuf_h * qent_dst_dmabuf(const struct qent_dst *const be, unsigned int plane);
-+int qent_dst_dup_fd(const struct qent_dst *const be, unsigned int plane);
-+MediaBufsStatus qent_dst_wait(struct qent_dst *const be);
-+void qent_dst_delete(struct qent_dst *const be);
-+// Returns a qent_dst to its mbc free Q or deletes it if the mbc is dead
-+void qent_dst_unref(struct qent_dst ** const pbe_dst);
-+struct qent_dst * qent_dst_ref(struct qent_dst * const be_dst);
-+
-+const uint8_t * qent_dst_data(struct qent_dst *const be, unsigned int buf_no);
-+MediaBufsStatus qent_dst_read_start(struct qent_dst *const be);
-+MediaBufsStatus qent_dst_read_stop(struct qent_dst *const be);
-+/* Import an fd unattached to any mediabuf */
-+MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
-+                unsigned int plane,
-+                int fd, size_t size);
-+
-+MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
-+                struct media_request **const pmreq,
-+                struct qent_src **const psrc_be,
-+                struct qent_dst *const dst_be,
-+                const bool is_final);
-+// Get / alloc a dst buffer & associate with a slot
-+// If the dst pool is empty then behaviour depends on the fixed flag passed to
-+// dst_slots_create.  Default is !fixed = unlimited alloc
-+struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
-+                           struct dmabufs_ctl *const dbsc);
-+// Create dst slots without alloc
-+// If fixed true then qent_alloc will only get slots from this pool and will
-+// block until a qent has been unrefed
-+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
-+
-+MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
-+MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
-+const struct v4l2_format *mediabufs_dst_fmt(struct mediabufs_ctl *const mbc);
-+
-+typedef int mediabufs_dst_fmt_accept_fn(void * v, const struct v4l2_fmtdesc *fmtdesc);
-+
-+MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
-+               const unsigned int width,
-+               const unsigned int height,
-+               mediabufs_dst_fmt_accept_fn *const accept_fn,
-+               void *const accept_v);
-+struct qent_src *mediabufs_src_qent_get(struct mediabufs_ctl *const mbc);
-+void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src **const pqe_src);
-+
-+int mediabufs_ctl_set_ext_ctrls(struct mediabufs_ctl * mbc, struct media_request * const mreq,
-+                                struct v4l2_ext_control control_array[], unsigned int n);
-+MediaBufsStatus mediabufs_set_ext_ctrl(struct mediabufs_ctl *const mbc,
-+                struct media_request * const mreq,
-+                unsigned int id, void *data,
-+                unsigned int size);
-+int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_ext_ctrl ctrls[], unsigned int n);
-+
-+int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc);
-+
-+MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
-+                                      enum v4l2_buf_type buf_type,
-+                                      const uint32_t pixfmt,
-+                                      const uint32_t width, const uint32_t height,
-+                                      const size_t bufsize);
-+
-+MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
-+                  struct dmabufs_ctl * const dbsc,
-+                  unsigned int n);
-+
-+struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
-+                     const char *vpath, struct pollqueue *const pq);
-+void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
-+struct mediabufs_ctl * mediabufs_ctl_ref(struct mediabufs_ctl *const mbc);
-+
-+
-+#endif
-diff --git a/libavcodec/v4l2_req_pollqueue.c b/libavcodec/v4l2_req_pollqueue.c
-new file mode 100644
-index 0000000000..cc8a5d4001
---- /dev/null
-+++ b/libavcodec/v4l2_req_pollqueue.c
-@@ -0,0 +1,361 @@
-+#include <errno.h>
-+#include <limits.h>
-+#include <poll.h>
-+#include <pthread.h>
-+#include <semaphore.h>
-+#include <stdatomic.h>
-+#include <stdbool.h>
-+#include <stdlib.h>
-+#include <stdint.h>
-+#include <stdio.h>
-+#include <string.h>
-+#include <unistd.h>
-+#include <sys/eventfd.h>
-+
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_utils.h"
-+
-+
-+struct pollqueue;
-+
-+/* Lifecycle of a polltask; changed under the owning pollqueue's lock */
-+enum polltask_state {
-+    POLLTASK_UNQUEUED = 0, /* Not on the queue (initial state, or callback finished) */
-+    POLLTASK_QUEUED,       /* On the queue waiting for an event or timeout */
-+    POLLTASK_RUNNING,      /* Taken off the queue, callback in progress */
-+    POLLTASK_Q_KILL,       /* Delete requested while not running */
-+    POLLTASK_RUN_KILL,     /* Delete requested while the callback is running */
-+};
-+
-+/* A single fd + callback that can be queued on a pollqueue */
-+struct polltask {
-+    /* Doubly linked list links; both NULL when not on the queue */
-+    struct polltask *next;
-+    struct polltask *prev;
-+    /* Owning queue; a reference is held for the life of the task */
-+    struct pollqueue *q;
-+    enum polltask_state state;
-+
-+    /* Passed to poll() as pollfd.fd / pollfd.events */
-+    int fd;
-+    short events;
-+
-+    /* Callback and its opaque argument; revents is as returned by poll() */
-+    void (*fn)(void *v, short revents);
-+    void * v;
-+
-+    uint64_t timeout; /* CLOCK_MONOTONIC time, 0 => never */
-+    /* Posted by the poll thread to release a waiting polltask_delete() */
-+    sem_t kill_sem;
-+};
-+
-+/* A list of polltasks serviced by one worker thread */
-+struct pollqueue {
-+    /* Zero based: the queue is freed when an unref finds the count at 0 */
-+    atomic_int ref_count;
-+    /* Held while the task list or task states are examined or changed */
-+    pthread_mutex_t lock;
-+
-+    /* Queued tasks; new tasks are appended at the tail */
-+    struct polltask *head;
-+    struct polltask *tail;
-+
-+    /* Set by pollqueue_free() to make the worker thread exit */
-+    bool kill;
-+    /* Set by the worker while dispatching callbacks to suppress prods */
-+    bool no_prod;
-+    /* eventfd written to in order to wake the worker from poll() */
-+    int prod_fd;
-+    /* Task that waits on prod_fd */
-+    struct polltask *prod_pt;
-+    pthread_t worker;
-+};
-+
-+/*
-+ * Allocate a new, unqueued polltask on pq that will call fn(v, revents)
-+ * once fd signals any of events (or its timeout expires).
-+ * Takes a reference on pq.  Returns NULL if events is 0 or on alloc failure.
-+ */
-+struct polltask *polltask_new(struct pollqueue *const pq,
-+                              const int fd, const short events,
-+                  void (*const fn)(void *v, short revents),
-+                  void *const v)
-+{
-+    struct polltask *task;
-+
-+    if (events == 0)
-+        return NULL;
-+
-+    task = malloc(sizeof(*task));
-+    if (task == NULL)
-+        return NULL;
-+
-+    // Members not named here (links, state, timeout) are zeroed
-+    *task = (struct polltask){
-+        .q = pollqueue_ref(pq),
-+        .fd = fd,
-+        .events = events,
-+        .fn = fn,
-+        .v = v
-+    };
-+
-+    sem_init(&task->kill_sem, 0, 0);
-+
-+    return task;
-+}
-+
-+/* Unlink pt from pq's task list and clear its links. */
-+static void pollqueue_rem_task(struct pollqueue *const pq, struct polltask *const pt)
-+{
-+    struct polltask *const before = pt->prev;
-+    struct polltask *const after = pt->next;
-+
-+    if (before != NULL)
-+        before->next = after;
-+    else
-+        pq->head = after;
-+
-+    if (after != NULL)
-+        after->prev = before;
-+    else
-+        pq->tail = before;
-+
-+    pt->prev = NULL;
-+    pt->next = NULL;
-+}
-+
-+/* Release a polltask's semaphore and memory; does not touch the queue. */
-+static void polltask_free(struct polltask * const pt)
-+{
-+    sem_t *const sem = &pt->kill_sem;
-+
-+    sem_destroy(sem);
-+    free(pt);
-+}
-+
-+/*
-+ * Wake the worker thread by writing to the queue's eventfd.
-+ * Returns the result of write().
-+ */
-+static int pollqueue_prod(const struct pollqueue *const pq)
-+{
-+    const uint64_t inc = 1;
-+
-+    return write(pq->prod_fd, &inc, sizeof(inc));
-+}
-+
-+/*
-+ * Delete a polltask, dequeuing it first if required, and NULL *ppt.
-+ * A NULL *ppt is ignored.
-+ *
-+ * If the task is queued or running this marks it for kill, wakes the
-+ * worker and blocks until the worker posts kill_sem.
-+ * Drops the reference on the pollqueue taken by polltask_new().
-+ */
-+void polltask_delete(struct polltask **const ppt)
-+{
-+    struct polltask *const pt = *ppt;
-+    struct pollqueue * pq;
-+    enum polltask_state state;
-+    bool prodme;
-+
-+    if (!pt)
-+        return;
-+
-+    pq = pt->q;
-+    pthread_mutex_lock(&pq->lock);
-+    // Remember the state we found so we know whether we have to wait
-+    state = pt->state;
-+    pt->state = (state == POLLTASK_RUNNING) ? POLLTASK_RUN_KILL : POLLTASK_Q_KILL;
-+    prodme = !pq->no_prod;
-+    pthread_mutex_unlock(&pq->lock);
-+
-+    // An unqueued task is not known to the worker so nothing to wait for
-+    if (state != POLLTASK_UNQUEUED) {
-+        if (prodme)
-+            pollqueue_prod(pq);
-+        // Wait for the worker to acknowledge; retry if interrupted
-+        while (sem_wait(&pt->kill_sem) && errno == EINTR)
-+            /* loop */;
-+    }
-+
-+    // Leave zapping the ref until we have DQed the PT as might well be
-+    // legitimately used in it
-+    *ppt = NULL;
-+    polltask_free(pt);
-+    pollqueue_unref(&pq);
-+}
-+
-+/*
-+ * Returns CLOCK_MONOTONIC time in ms plus timeout (ms).
-+ * Never returns 0 on success as 0 is used to mean "no timeout";
-+ * returns 0 only if the clock cannot be read.
-+ */
-+static uint64_t pollqueue_now(int timeout)
-+{
-+    struct timespec ts;
-+    uint64_t ms;
-+
-+    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0)
-+        return 0;
-+
-+    ms = (uint64_t)ts.tv_sec * 1000;
-+    ms += ts.tv_nsec / 1000000;
-+    ms += timeout;
-+
-+    return ms != 0 ? ms : (uint64_t)1;
-+}
-+
-+/*
-+ * Append pt to the tail of its queue and wake the worker.
-+ * timeout is in ms from now; a negative value means no timeout.
-+ * Does nothing if the task has already been marked for deletion.
-+ */
-+void pollqueue_add_task(struct polltask *const pt, const int timeout)
-+{
-+    struct pollqueue * const pq = pt->q;
-+    bool prodme = false;
-+
-+    pthread_mutex_lock(&pq->lock);
-+    switch (pt->state) {
-+    case POLLTASK_Q_KILL:
-+    case POLLTASK_RUN_KILL:
-+        // Being deleted - do not requeue
-+        break;
-+    default:
-+        pt->prev = pq->tail;
-+        pt->next = NULL;
-+        if (pq->tail != NULL)
-+            pq->tail->next = pt;
-+        else
-+            pq->head = pt;
-+        pq->tail = pt;
-+        pt->state = POLLTASK_QUEUED;
-+        pt->timeout = timeout < 0 ? 0 : pollqueue_now(timeout);
-+        // No need to prod if the worker is already dispatching
-+        prodme = !pq->no_prod;
-+        break;
-+    }
-+    pthread_mutex_unlock(&pq->lock);
-+
-+    if (prodme)
-+        pollqueue_prod(pq);
-+}
-+
-+/*
-+ * Worker thread for a pollqueue (v is the struct pollqueue *).
-+ *
-+ * Each pass:
-+ *  1. under the lock, builds a pollfd array from the queued tasks,
-+ *     dropping (and acknowledging) any task marked POLLTASK_Q_KILL and
-+ *     working out the nearest timeout,
-+ *  2. polls with the lock released,
-+ *  3. under the lock, walks the same tasks again and for each one that
-+ *     has an event or has timed out removes it from the queue and runs
-+ *     its callback with the lock released.
-+ * Exits when pq->kill is set or on an unrecoverable error.
-+ */
-+static void *poll_thread(void *v)
-+{
-+    struct pollqueue *const pq = v;
-+    struct pollfd *a = NULL;
-+    size_t asize = 0;
-+
-+    pthread_mutex_lock(&pq->lock);
-+    do {
-+        unsigned int i;
-+        unsigned int n = 0;
-+        struct polltask *pt;
-+        struct polltask *pt_next;
-+        uint64_t now = pollqueue_now(0);
-+        int timeout = -1;
-+        int rv;
-+
-+        for (pt = pq->head; pt; pt = pt_next) {
-+            int64_t t;
-+
-+            pt_next = pt->next;
-+
-+            if (pt->state == POLLTASK_Q_KILL) {
-+                pollqueue_rem_task(pq, pt);
-+                sem_post(&pt->kill_sem);
-+                continue;
-+            }
-+
-+            if (n >= asize) {
-+                // Grow via a temporary so the old array is still owned by
-+                // 'a' (and freed on exit) if realloc fails
-+                const size_t new_size = asize ? asize * 2 : 4;
-+                struct pollfd *const new_a = realloc(a, new_size * sizeof(*a));
-+                if (!new_a) {
-+                    request_log("Failed to realloc poll array to %zu\n", new_size);
-+                    goto fail_locked;
-+                }
-+                a = new_a;
-+                asize = new_size;
-+            }
-+
-+            a[n++] = (struct pollfd){
-+                .fd = pt->fd,
-+                .events = pt->events
-+            };
-+
-+            // Track the smallest remaining time of any task with a timeout
-+            t = (int64_t)(pt->timeout - now);
-+            if (pt->timeout && t < INT_MAX &&
-+                (timeout < 0 || (int)t < timeout))
-+                timeout = (t < 0) ? 0 : (int)t;
-+        }
-+        pthread_mutex_unlock(&pq->lock);
-+
-+        if ((rv = poll(a, n, timeout)) == -1) {
-+            if (errno != EINTR) {
-+                request_log("Poll error: %s\n", strerror(errno));
-+                goto fail_unlocked;
-+            }
-+        }
-+
-+        pthread_mutex_lock(&pq->lock);
-+        now = pollqueue_now(0);
-+
-+        /* Prodding in this loop is pointless and might lead to
-+         * infinite looping
-+        */
-+        pq->no_prod = true;
-+        // The first n tasks on the queue correspond to a[0..n-1] as
-+        // anything added since the array was built went on the tail
-+        for (i = 0, pt = pq->head; i < n; ++i, pt = pt_next) {
-+            pt_next = pt->next;
-+
-+            /* Pending? */
-+            if (a[i].revents ||
-+                (pt->timeout && (int64_t)(now - pt->timeout) >= 0)) {
-+                pollqueue_rem_task(pq, pt);
-+                if (pt->state == POLLTASK_QUEUED)
-+                    pt->state = POLLTASK_RUNNING;
-+                if (pt->state == POLLTASK_Q_KILL)
-+                    pt->state = POLLTASK_RUN_KILL;
-+                pthread_mutex_unlock(&pq->lock);
-+
-+                /* This can add new entries to the Q but as
-+                 * those are added to the tail our existing
-+                 * chain remains intact
-+                */
-+                pt->fn(pt->v, a[i].revents);
-+
-+                pthread_mutex_lock(&pq->lock);
-+                if (pt->state == POLLTASK_RUNNING)
-+                    pt->state = POLLTASK_UNQUEUED;
-+                // Release a polltask_delete() that is waiting on this task
-+                if (pt->state == POLLTASK_RUN_KILL)
-+                    sem_post(&pt->kill_sem);
-+            }
-+        }
-+        pq->no_prod = false;
-+
-+    } while (!pq->kill);
-+
-+fail_locked:
-+    pthread_mutex_unlock(&pq->lock);
-+fail_unlocked:
-+    free(a);
-+    return NULL;
-+}
-+
-+/*
-+ * Callback for the queue's own eventfd task: drains the eventfd and
-+ * requeues itself unless the queue is being killed.
-+ */
-+static void prod_fn(void *v, short revents)
-+{
-+    struct pollqueue *const pq = v;
-+
-+    if (revents != 0) {
-+        char drain[8];
-+        read(pq->prod_fd, drain, 8);
-+    }
-+
-+    if (pq->kill)
-+        return;
-+
-+    pollqueue_add_task(pq->prod_pt, -1);
-+}
-+
-+/*
-+ * Create a pollqueue together with its wake-up eventfd and worker thread.
-+ * Returns the queue holding a single reference (ref_count is zero based),
-+ * or NULL on failure.
-+ */
-+struct pollqueue * pollqueue_new(void)
-+{
-+    struct pollqueue *pq = malloc(sizeof(*pq));
-+    if (!pq)
-+        return NULL;
-+    *pq = (struct pollqueue){
-+        .ref_count = ATOMIC_VAR_INIT(0),
-+        .lock = PTHREAD_MUTEX_INITIALIZER,
-+        .head = NULL,
-+        .tail = NULL,
-+        .kill = false,
-+        .prod_fd = -1
-+    };
-+
-+    pq->prod_fd = eventfd(0, EFD_NONBLOCK);
-+    // eventfd() returns -1 on error (1 would be a valid descriptor)
-+    if (pq->prod_fd == -1)
-+        goto fail1;
-+    pq->prod_pt = polltask_new(pq, pq->prod_fd, POLLIN, prod_fn, pq);
-+    if (!pq->prod_pt)
-+        goto fail2;
-+    pollqueue_add_task(pq->prod_pt, -1);
-+    if (pthread_create(&pq->worker, NULL, poll_thread, pq))
-+        goto fail3;
-+    // Reset ref count which will have been inced by polltask_new taking a
-+    // ref for the prod task - that internal ref must not keep the Q alive
-+    atomic_store(&pq->ref_count, 0);
-+    return pq;
-+
-+fail3:
-+    polltask_free(pq->prod_pt);
-+fail2:
-+    close(pq->prod_fd);
-+fail1:
-+    free(pq);
-+    return NULL;
-+}
-+
-+/*
-+ * Stop the worker thread and release everything owned by the queue.
-+ * Called when the last reference is dropped.
-+ */
-+static void pollqueue_free(struct pollqueue *const pq)
-+{
-+    void *worker_rv;
-+
-+    // Flag the kill and wake the worker so that it notices
-+    pthread_mutex_lock(&pq->lock);
-+    pq->kill = true;
-+    pollqueue_prod(pq);
-+    pthread_mutex_unlock(&pq->lock);
-+
-+    pthread_join(pq->worker, &worker_rv);
-+
-+    // Worker has gone so nothing else can be using these
-+    polltask_free(pq->prod_pt);
-+    pthread_mutex_destroy(&pq->lock);
-+    close(pq->prod_fd);
-+    free(pq);
-+}
-+
-+/* Take an additional reference on pq; returns pq for convenience. */
-+struct pollqueue * pollqueue_ref(struct pollqueue *const pq)
-+{
-+    (void)atomic_fetch_add(&pq->ref_count, 1);
-+
-+    return pq;
-+}
-+
-+/*
-+ * Drop a reference and NULL the caller's pointer.  The queue is freed
-+ * when the last reference goes.  A NULL *ppq is ignored.
-+ */
-+void pollqueue_unref(struct pollqueue **const ppq)
-+{
-+    struct pollqueue * const pq = *ppq;
-+
-+    if (pq == NULL)
-+        return;
-+
-+    *ppq = NULL;
-+
-+    // ref_count is zero based: a previous value of 0 means last ref
-+    if (atomic_fetch_sub(&pq->ref_count, 1) == 0)
-+        pollqueue_free(pq);
-+}
-+
-+
-+
-diff --git a/libavcodec/v4l2_req_pollqueue.h b/libavcodec/v4l2_req_pollqueue.h
-new file mode 100644
-index 0000000000..e1182cb2fc
---- /dev/null
-+++ b/libavcodec/v4l2_req_pollqueue.h
-@@ -0,0 +1,18 @@
-+#ifndef POLLQUEUE_H_
-+#define POLLQUEUE_H_
-+
-+struct polltask;
-+struct pollqueue;
-+
-+// Create a task on pq that calls fn(v, revents) when fd signals events.
-+// The task is not queued until pollqueue_add_task is called.
-+// Returns NULL if events is 0 or on allocation failure.
-+struct polltask *polltask_new(struct pollqueue *const pq,
-+			      const int fd, const short events,
-+			      void (*const fn)(void *v, short revents),
-+			      void *const v);
-+// Dequeue (if needed) and free *ppt, setting *ppt to NULL.
-+// Waits for the poll thread to acknowledge if the task is queued or running.
-+void polltask_delete(struct polltask **const ppt);
-+
-+// Queue pt.  timeout is in ms; a negative value means no timeout.
-+void pollqueue_add_task(struct polltask *const pt, const int timeout);
-+// Create a new queue and its worker thread; NULL on failure.
-+struct pollqueue * pollqueue_new(void);
-+// Drop a reference, setting *ppq to NULL; frees the queue on the last unref.
-+void pollqueue_unref(struct pollqueue **const ppq);
-+// Take a reference; returns pq.
-+struct pollqueue * pollqueue_ref(struct pollqueue *const pq);
-+
-+#endif /* POLLQUEUE_H_ */
-diff --git a/libavcodec/v4l2_req_utils.h b/libavcodec/v4l2_req_utils.h
-new file mode 100644
-index 0000000000..a31cc1f4ec
---- /dev/null
-+++ b/libavcodec/v4l2_req_utils.h
-@@ -0,0 +1,27 @@
-+#ifndef AVCODEC_V4L2_REQ_UTILS_H
-+#define AVCODEC_V4L2_REQ_UTILS_H
-+
-+#include <stdint.h>
-+#include "libavutil/log.h"
-+
-+#define request_log(...) av_log(NULL, AV_LOG_INFO, __VA_ARGS__)
-+
-+#define request_err(_ctx, ...) av_log(_ctx, AV_LOG_ERROR, __VA_ARGS__)
-+#define request_warn(_ctx, ...) av_log(_ctx, AV_LOG_WARNING, __VA_ARGS__)
-+#define request_info(_ctx, ...) av_log(_ctx, AV_LOG_INFO, __VA_ARGS__)
-+#define request_debug(_ctx, ...) av_log(_ctx, AV_LOG_DEBUG, __VA_ARGS__)
-+
-+/* Returns c if it is a printable, non-space ASCII character, else '.' */
-+static inline char safechar(char c) {
-+    if (c <= 0x20 || c >= 0x7f)
-+        return '.';
-+    return c;
-+}
-+
-+/*
-+ * Render a fourcc as a printable NUL terminated string in tbuf, least
-+ * significant byte first; unprintable bytes become '.'.  Returns tbuf.
-+ */
-+static inline const char * strfourcc(char tbuf[5], uint32_t fcc) {
-+    unsigned int i;
-+
-+    for (i = 0; i != 4; ++i)
-+        tbuf[i] = safechar((fcc >> (8 * i)) & 0xff);
-+    tbuf[4] = '\0';
-+
-+    return tbuf;
-+}
-+
-+#endif
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-new file mode 100644
-index 0000000000..b0a5930844
---- /dev/null
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -0,0 +1,297 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+
-+#include "decode.h"
-+#include "hevcdec.h"
-+#include "hwconfig.h"
-+#include "internal.h"
-+
-+#include "v4l2_request_hevc.h"
-+
-+#include "libavutil/hwcontext_drm.h"
-+
-+#include "v4l2_req_devscan.h"
-+#include "v4l2_req_dmabufs.h"
-+#include "v4l2_req_pollqueue.h"
-+#include "v4l2_req_media.h"
-+#include "v4l2_req_utils.h"
-+
-+/*
-+ * Estimate the bitstream buffer size (bytes) needed for one picture.
-+ *   w, h         picture dimensions in pixels
-+ *   bits_minus8  bit depth above 8 (0 for 8-bit, 2 for 10-bit)
-+ * Returns the estimate including 16k of overhead.
-+ */
-+static size_t bit_buf_size(unsigned int w, unsigned int h, unsigned int bits_minus8)
-+{
-+    // Widen before multiplying so the product cannot wrap in unsigned int
-+    const size_t wxh = (size_t)w * h;
-+    size_t bits_alloc;
-+
-+    /* Annex A gives a min compression of 2 @ lvl 3.1
-+     * (wxh <= 983040) and min 4 thereafter but avoid
-+     * the odity of 983041 having a lower limit than
-+     * 983040.
-+     * Multiply by 3/2 for 4:2:0
-+     */
-+    bits_alloc = wxh < 983040 ? wxh * 3 / 4 :
-+        wxh < 983040 * 2 ? 983040 * 3 / 4 :
-+        wxh * 3 / 8;
-+    /* Allow for bit depth */
-+    bits_alloc += (bits_alloc * bits_minus8) / 8;
-+    /* Add a few bytes (16k) for overhead */
-+    bits_alloc += 0x4000;
-+    return bits_alloc;
-+}
-+
-+/* hwaccel start_frame: forwards to the probed API version's handler */
-+static int v4l2_req_hevc_start_frame(AVCodecContext *avctx,
-+                                     av_unused const uint8_t *buffer,
-+                                     av_unused uint32_t size)
-+{
-+    const V4L2RequestContextHEVC * const rq = avctx->internal->hwaccel_priv_data;
-+
-+    return rq->fns->start_frame(avctx, buffer, size);
-+}
-+
-+/* hwaccel decode_slice: forwards to the probed API version's handler */
-+static int v4l2_req_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
-+{
-+    V4L2RequestContextHEVC * const rq = avctx->internal->hwaccel_priv_data;
-+
-+    return rq->fns->decode_slice(avctx, buffer, size);
-+}
-+
-+/* hwaccel end_frame: forwards to the probed API version's handler */
-+static int v4l2_req_hevc_end_frame(AVCodecContext *avctx)
-+{
-+    V4L2RequestContextHEVC *rq = avctx->internal->hwaccel_priv_data;
-+
-+    return rq->fns->end_frame(avctx);
-+}
-+
-+/* hwaccel abort_frame: forwards to the probed API version's handler */
-+static void v4l2_req_hevc_abort_frame(AVCodecContext * const avctx)
-+{
-+    V4L2RequestContextHEVC * const rq = avctx->internal->hwaccel_priv_data;
-+
-+    rq->fns->abort_frame(avctx);
-+}
-+
-+/* hwaccel frame_params: forwards to the probed API version's handler */
-+static int v4l2_req_hevc_frame_params(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx)
-+{
-+    V4L2RequestContextHEVC * const rq = avctx->internal->hwaccel_priv_data;
-+
-+    return rq->fns->frame_params(avctx, hw_frames_ctx);
-+}
-+
-+/* hwaccel alloc_frame: forwards to the probed API version's handler */
-+static int v4l2_req_hevc_alloc_frame(AVCodecContext * avctx, AVFrame *frame)
-+{
-+    V4L2RequestContextHEVC * const rq = avctx->internal->hwaccel_priv_data;
-+
-+    return rq->fns->alloc_frame(avctx, frame);
-+}
-+
-+
-+/*
-+ * hwaccel uninit: waits for in-flight decodes then releases everything
-+ * created by v4l2_request_hevc_init.  Always returns 0.
-+ */
-+static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
-+{
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    decode_q_wait(&ctx->decode_q, NULL);  // Wait for all other threads to be out of decode
-+
-+    // Same release order as the failure unwind in v4l2_request_hevc_init
-+    mediabufs_ctl_unref(&ctx->mbufs);
-+    media_pool_delete(&ctx->mpool);
-+    pollqueue_unref(&ctx->pq);
-+    dmabufs_ctl_delete(&ctx->dbufs);
-+    devscan_delete(&ctx->devscan);
-+
-+    decode_q_uninit(&ctx->decode_q);
-+
-+//    if (avctx->hw_frames_ctx) {
-+//        AVHWFramesContext *hwfc = (AVHWFramesContext*)avctx->hw_frames_ctx->data;
-+//        av_buffer_pool_flush(hwfc->pool);
-+//    }
-+    return 0;
-+}
-+
-+/*
-+ * Destination format filter passed to mediabufs_dst_fmt_set (v is the
-+ * AVCodecContext).  Returns 1 if the offered pixel format suits the
-+ * stream's bit depth, 0 otherwise.
-+ */
-+static int dst_fmt_accept_cb(void * v, const struct v4l2_fmtdesc *fmtdesc)
-+{
-+    AVCodecContext *const avctx = v;
-+    const HEVCContext *const h = avctx->priv_data;
-+
-+    switch (h->ps.sps->bit_depth) {
-+    case 8:
-+        return fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_COL128 ||
-+               fmtdesc->pixelformat == V4L2_PIX_FMT_NV12;
-+    case 10:
-+        return fmtdesc->pixelformat == V4L2_PIX_FMT_NV12_10_COL128;
-+    default:
-+        return 0;
-+    }
-+}
-+
-+/*
-+ * hwaccel init: finds a V4L2 request decoder for HEVC, creates the
-+ * dmabuf / pollqueue / media pool / mediabufs objects, negotiates source
-+ * and destination formats, probes which HEVC API version the driver
-+ * supports, creates the buffer pools, starts streaming and creates the
-+ * DRM frames context.
-+ *
-+ * Returns 0 on success or a negative AVERROR code; on failure everything
-+ * created so far is released via the failN labels in reverse order.
-+ */
-+static int v4l2_request_hevc_init(AVCodecContext *avctx)
-+{
-+    const HEVCContext *h = avctx->priv_data;
-+    V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
-+    const HEVCSPS * const sps = h->ps.sps;
-+    int ret;
-+    const struct decdev * decdev;
-+    const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2;  // Assuming constant for all APIs but avoiding V4L2 includes
-+    size_t src_size;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
-+        av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
-+        return (AVERROR(-ret));
-+    }
-+    ret = AVERROR(ENOMEM);  // Assume mem fail by default for these
-+
-+    if ((decdev = devscan_find(ctx->devscan, src_pix_fmt)) == NULL)
-+    {
-+        av_log(avctx, AV_LOG_WARNING, "Failed to find a V4L2 device for H265\n");
-+        ret = AVERROR(ENODEV);
-+        goto fail0;
-+    }
-+    av_log(avctx, AV_LOG_DEBUG, "Trying V4L2 devices: %s,%s\n",
-+           decdev_media_path(decdev), decdev_video_path(decdev));
-+
-+    if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
-+        goto fail0;
-+    }
-+
-+    if ((ctx->pq = pollqueue_new()) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to create pollqueue\n");
-+        goto fail1;
-+    }
-+
-+    if ((ctx->mpool = media_pool_new(decdev_media_path(decdev), ctx->pq, 4)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to create media pool\n");
-+        goto fail2;
-+    }
-+
-+    if ((ctx->mbufs = mediabufs_ctl_new(avctx, decdev_video_path(decdev), ctx->pq)) == NULL) {
-+        av_log(avctx, AV_LOG_ERROR, "Unable to create media controls\n");
-+        goto fail3;
-+    }
-+
-+    // Ask for an initial bitbuf size of max size / 4
-+    // We will realloc if we need more
-+    // Must use sps->h/w as avctx contains cropped size
-+    src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
-+    if (mediabufs_src_resizable(ctx->mbufs))
-+        src_size /= 4;
-+    // Kludge for conformance tests which break Annex A limits
-+    else if (src_size < 0x40000)
-+        src_size = 0x40000;
-+
-+    if (mediabufs_src_fmt_set(ctx->mbufs, decdev_src_type(decdev), src_pix_fmt,
-+                              sps->width, sps->height, src_size)) {
-+        char tbuf1[5];
-+        av_log(avctx, AV_LOG_ERROR, "Failed to set source format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
-+        goto fail4;
-+    }
-+
-+    // Prefer the newer API version, falling back to version 1
-+    if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
-+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
-+        ctx->fns = &V2(ff_v4l2_req_hevc, 2);
-+    }
-+    else if (V2(ff_v4l2_req_hevc, 1).probe(avctx, ctx) == 0) {
-+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
-+        ctx->fns = &V2(ff_v4l2_req_hevc, 1);
-+    }
-+    else {
-+        av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
-+        ret = AVERROR(EINVAL);
-+        goto fail4;
-+    }
-+
-+    // NOTE(review): the message below prints src_pix_fmt although it is
-+    // the destination format that failed - confirm whether intended
-+    if (mediabufs_dst_fmt_set(ctx->mbufs, sps->width, sps->height, dst_fmt_accept_cb, avctx)) {
-+        char tbuf1[5];
-+        av_log(avctx, AV_LOG_ERROR, "Failed to set destination format: %s %dx%d\n", strfourcc(tbuf1, src_pix_fmt), sps->width, sps->height);
-+        goto fail4;
-+    }
-+
-+    if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
-+        goto fail4;
-+    }
-+
-+    // Destination slots: DPB size of the highest sub-layer + one per
-+    // thread + either the requested extra hw frames or 6 if none requested
-+    {
-+        unsigned int dst_slots = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering +
-+            avctx->thread_count + (avctx->extra_hw_frames > 0 ? avctx->extra_hw_frames : 6);
-+        av_log(avctx, AV_LOG_DEBUG, "Slots=%d: Reordering=%d, threads=%d, hw+=%d\n", dst_slots,
-+               sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
-+               avctx->thread_count, avctx->extra_hw_frames);
-+
-+        // extra_hw_frames is -1 if unset
-+        if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
-+            goto fail4;
-+        }
-+    }
-+
-+    if (mediabufs_stream_on(ctx->mbufs)) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed stream on\n");
-+        goto fail4;
-+    }
-+
-+    if ((ret = ff_decode_get_hw_frames_ctx(avctx, AV_HWDEVICE_TYPE_DRM)) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to create frame ctx\n");
-+        goto fail4;
-+    }
-+
-+    if ((ret = ctx->fns->set_controls(avctx, ctx)) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed set controls\n");
-+        goto fail5;
-+    }
-+
-+    decode_q_init(&ctx->decode_q);
-+
-+    // Set our s/w format
-+    avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
-+
-+    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
-+           ctx->fns->name,
-+           decdev_media_path(decdev), decdev_video_path(decdev));
-+
-+    return 0;
-+
-+    // Failure unwind: each label releases what was created before the
-+    // corresponding failure point, then falls through to the earlier ones
-+fail5:
-+    av_buffer_unref(&avctx->hw_frames_ctx);
-+fail4:
-+    mediabufs_ctl_unref(&ctx->mbufs);
-+fail3:
-+    media_pool_delete(&ctx->mpool);
-+fail2:
-+    pollqueue_unref(&ctx->pq);
-+fail1:
-+    dmabufs_ctl_delete(&ctx->dbufs);
-+fail0:
-+    devscan_delete(&ctx->devscan);
-+    return ret;
-+}
-+
-+const AVHWAccel ff_hevc_v4l2request_hwaccel = {
-+    .name           = "hevc_v4l2request",
-+    .type           = AVMEDIA_TYPE_VIDEO,
-+    .id             = AV_CODEC_ID_HEVC,
-+    .pix_fmt        = AV_PIX_FMT_DRM_PRIME,
-+    .alloc_frame    = v4l2_req_hevc_alloc_frame,
-+    .start_frame    = v4l2_req_hevc_start_frame,
-+    .decode_slice   = v4l2_req_hevc_decode_slice,
-+    .end_frame      = v4l2_req_hevc_end_frame,
-+    .abort_frame    = v4l2_req_hevc_abort_frame,
-+    .init           = v4l2_request_hevc_init,
-+    .uninit         = v4l2_request_hevc_uninit,
-+    .priv_data_size = sizeof(V4L2RequestContextHEVC),
-+    .frame_params   = v4l2_req_hevc_frame_params,
-+    .caps_internal  = HWACCEL_CAP_ASYNC_SAFE | HWACCEL_CAP_MT_SAFE,
-+};
-diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
-new file mode 100644
-index 0000000000..f14f594564
---- /dev/null
-+++ b/libavcodec/v4l2_request_hevc.h
-@@ -0,0 +1,102 @@
-+#ifndef AVCODEC_V4L2_REQUEST_HEVC_H
-+#define AVCODEC_V4L2_REQUEST_HEVC_H
-+
-+#include <stdint.h>
-+#include <drm_fourcc.h>
-+#include "v4l2_req_decode_q.h"
-+
-+#ifndef DRM_FORMAT_NV15
-+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV20
-+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
-+#endif
-+
-+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
-+// in the future but until then...
-+#ifndef DRM_FORMAT_P030
-+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV15
-+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV20
-+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
-+#endif
-+
-+#include <linux/videodev2.h>
-+#ifndef V4L2_CID_CODEC_BASE
-+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
-+#endif
-+
-+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
-+// in drm_fourcc.h hopefully will be sometime in the future but until then...
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128
-+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
-+#endif
-+
-+#ifndef V4L2_PIX_FMT_NV12_COL128
-+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
-+#endif
-+
-+#ifndef V4L2_CTRL_FLAG_DYNAMIC_ARRAY
-+#define V4L2_CTRL_FLAG_DYNAMIC_ARRAY	0x0800
-+#endif
-+
-+#define MAX_SLICES 128
-+
-+#define VCAT(name, version) name##_v##version
-+#define V2(n,v) VCAT(n, v)
-+#define V(n) V2(n, HEVC_CTRLS_VERSION)
-+
-+#define S2(x) #x
-+#define STR(x) S2(x)
-+
-+// 1 per decoder
-+struct v4l2_req_decode_fns;
-+
-+typedef struct V4L2RequestContextHEVC {
-+//    V4L2RequestContext base;
-+    const struct v4l2_req_decode_fns * fns;
-+
-+    unsigned int timestamp;  // ?? maybe uint64_t
-+
-+    int multi_slice;
-+    int decode_mode;
-+    int start_code;
-+    int max_slices;
-+
-+    req_decode_q decode_q;
-+
-+    struct devscan *devscan;
-+    struct dmabufs_ctl *dbufs;
-+    struct pollqueue *pq;
-+    struct media_pool * mpool;
-+    struct mediabufs_ctl *mbufs;
-+} V4L2RequestContextHEVC;
-+
-+typedef struct v4l2_req_decode_fns {
-+    int src_pix_fmt_v4l2;
-+    const char * name;
-+
-+    // Init setup
-+    int (*probe)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
-+    int (*set_controls)(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx);
-+
-+    // Passthrough of hwaccel fns
-+    int (*start_frame)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
-+    int (*decode_slice)(AVCodecContext *avctx, const uint8_t *buf, uint32_t buf_size);
-+    int (*end_frame)(AVCodecContext *avctx);
-+    void (*abort_frame)(AVCodecContext *avctx);
-+    int (*frame_params)(AVCodecContext *avctx, AVBufferRef *hw_frames_ctx);
-+    int (*alloc_frame)(AVCodecContext * avctx, AVFrame *frame);
-+} v4l2_req_decode_fns;
-+
-+
-+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
-+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
-+
-+#endif
-
-From c99a0fe4d59212079de9bed222114abf95f7c989 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 27 Apr 2021 19:30:36 +0100
-Subject: [PATCH 013/136] Add no_cvt_hw option to ffmpeg
-
----
- fftools/ffmpeg.c     | 6 ++++--
- fftools/ffmpeg.h     | 2 ++
- fftools/ffmpeg_opt.c | 3 +++
- 3 files changed, 9 insertions(+), 2 deletions(-)
-
-diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
-index 15e084f0b2..5dc2cd73c1 100644
---- a/fftools/ffmpeg.c
-+++ b/fftools/ffmpeg.c
-@@ -2005,6 +2005,9 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame, int keep_ref
-         (ifilter->hw_frames_ctx && ifilter->hw_frames_ctx->data != frame->hw_frames_ctx->data))
-         need_reinit = 1;
- 
-+    if (no_cvt_hw && fg->graph)
-+        need_reinit = 0;
-+
-     if (sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DISPLAYMATRIX)) {
-         if (!ifilter->displaymatrix || memcmp(sd->data, ifilter->displaymatrix, sizeof(int32_t) * 9))
-             need_reinit = 1;
-@@ -2274,8 +2277,7 @@ static int decode_video(InputStream *ist, AVPacket *pkt, int *got_output, int64_
-         decoded_frame->top_field_first = ist->top_field_first;
- 
-     ist->frames_decoded++;
--
--    if (ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
-+    if (!no_cvt_hw && ist->hwaccel_retrieve_data && decoded_frame->format == ist->hwaccel_pix_fmt) {
-         err = ist->hwaccel_retrieve_data(ist->dec_ctx, decoded_frame);
-         if (err < 0)
-             goto fail;
-diff --git a/fftools/ffmpeg.h b/fftools/ffmpeg.h
-index f1412f6446..8f478619b3 100644
---- a/fftools/ffmpeg.h
-+++ b/fftools/ffmpeg.h
-@@ -729,6 +729,8 @@ extern enum VideoSyncMethod video_sync_method;
- extern float frame_drop_threshold;
- extern int do_benchmark;
- extern int do_benchmark_all;
-+extern int no_cvt_hw;
-+extern int do_deinterlace;
- extern int do_hex_dump;
- extern int do_pkt_dump;
- extern int copy_ts;
-diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
-index 055275d813..761db36588 100644
---- a/fftools/ffmpeg_opt.c
-+++ b/fftools/ffmpeg_opt.c
-@@ -71,6 +71,7 @@ enum VideoSyncMethod video_sync_method = VSYNC_AUTO;
- float frame_drop_threshold = 0;
- int do_benchmark      = 0;
- int do_benchmark_all  = 0;
-+int no_cvt_hw         = 0;
- int do_hex_dump       = 0;
- int do_pkt_dump       = 0;
- int copy_ts           = 0;
-@@ -1427,6 +1428,8 @@ const OptionDef options[] = {
-         "add timings for benchmarking" },
-     { "benchmark_all",  OPT_BOOL | OPT_EXPERT,                       { &do_benchmark_all },
-       "add timings for each task" },
-+    { "no_cvt_hw",      OPT_BOOL | OPT_EXPERT,                       { &no_cvt_hw },
-+      "do not auto-convert hw frames to sw" },
-     { "progress",       HAS_ARG | OPT_EXPERT,                        { .func_arg = opt_progress },
-       "write program-readable progress information", "url" },
-     { "stdin",          OPT_BOOL | OPT_EXPERT,                       { &stdin_interaction },
-
-From 27e0c78a2df53fb2337bee4c383cdb58cbbc717e Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 28 Apr 2021 10:16:39 +0100
-Subject: [PATCH 014/136] Add vout_drm
-
----
- configure                |   4 +
- libavdevice/Makefile     |   1 +
- libavdevice/alldevices.c |   1 +
- libavdevice/drm_vout.c   | 638 +++++++++++++++++++++++++++++++++++++++
- 4 files changed, 644 insertions(+)
- create mode 100644 libavdevice/drm_vout.c
-
-diff --git a/configure b/configure
-index 199aa2b3d5..49744cab19 100755
---- a/configure
-+++ b/configure
-@@ -346,6 +346,7 @@ External library support:
-   --enable-libnpp          enable Nvidia Performance Primitives-based code [no]
-   --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
-   --enable-sand            enable sand video formats [rpi]
-+  --enable-vout-drm        enable the vout_drm module - for internal testing only [no]
-   --disable-nvdec          disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
-   --disable-nvenc          disable Nvidia video encoding code [autodetect]
-   --enable-omx             enable OpenMAX IL code [no]
-@@ -1940,6 +1941,7 @@ FEATURE_LIST="
-     small
-     static
-     swscale_alpha
-+    vout_drm
- "
- 
- # this list should be kept in linking order
-@@ -3559,8 +3561,10 @@ sndio_indev_deps="sndio"
- sndio_outdev_deps="sndio"
- v4l2_indev_deps_any="linux_videodev2_h sys_videoio_h"
- v4l2_indev_suggest="libv4l2"
-+v4l2_outdev_deps="libdrm"
- v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
- v4l2_outdev_suggest="libv4l2"
-+vout_drm_outdev_deps="libdrm vout_drm"
- vfwcap_indev_deps="vfw32 vfwcap_defines"
- xcbgrab_indev_deps="libxcb"
- xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
-diff --git a/libavdevice/Makefile b/libavdevice/Makefile
-index 8a62822b69..36aac30186 100644
---- a/libavdevice/Makefile
-+++ b/libavdevice/Makefile
-@@ -48,6 +48,7 @@ OBJS-$(CONFIG_SNDIO_OUTDEV)              += sndio_enc.o sndio.o
- OBJS-$(CONFIG_V4L2_INDEV)                += v4l2.o v4l2-common.o timefilter.o
- OBJS-$(CONFIG_V4L2_OUTDEV)               += v4l2enc.o v4l2-common.o
- OBJS-$(CONFIG_VFWCAP_INDEV)              += vfwcap.o
-+OBJS-$(CONFIG_VOUT_DRM_OUTDEV)           += drm_vout.o
- OBJS-$(CONFIG_XCBGRAB_INDEV)             += xcbgrab.o
- OBJS-$(CONFIG_XV_OUTDEV)                 += xv.o
- 
-diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
-index 8a90fcb5d7..e2a8669f27 100644
---- a/libavdevice/alldevices.c
-+++ b/libavdevice/alldevices.c
-@@ -52,6 +52,7 @@ extern const FFOutputFormat ff_sndio_muxer;
- extern const AVInputFormat  ff_v4l2_demuxer;
- extern const FFOutputFormat ff_v4l2_muxer;
- extern const AVInputFormat  ff_vfwcap_demuxer;
-+extern const FFOutputFormat ff_vout_drm_muxer;
- extern const AVInputFormat  ff_xcbgrab_demuxer;
- extern const FFOutputFormat ff_xv_muxer;
- 
-diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
-new file mode 100644
-index 0000000000..cfb33ce7c3
---- /dev/null
-+++ b/libavdevice/drm_vout.c
-@@ -0,0 +1,638 @@
-+/*
-+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+// *** This module is a work in progress and its utility is strictly
-+//     limited to testing.
-+
-+#include "libavutil/opt.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/hwcontext_drm.h"
-+#include "libavformat/mux.h"
-+#include "avdevice.h"
-+
-+#include "pthread.h"
-+#include <semaphore.h>
-+#include <unistd.h>
-+
-+#include <xf86drm.h>
-+#include <xf86drmMode.h>
-+
-+#define TRACE_ALL 0
-+
-+#define DRM_MODULE "vc4"
-+
-+#define ERRSTR strerror(errno)
-+
-+struct drm_setup {
-+   int conId;
-+   uint32_t crtcId;
-+   int crtcIdx;
-+   uint32_t planeId;
-+   unsigned int out_fourcc;
-+   struct {
-+       int x, y, width, height;
-+   } compose;
-+};
-+
-+typedef struct drm_aux_s {
-+    unsigned int fb_handle;
-+    uint32_t bo_handles[AV_DRM_MAX_PLANES];
-+    AVFrame * frame;
-+} drm_aux_t;
-+
-+// Aux size should only need to be 2, but on a few streams (Hobbit) under FKMS
-+// we get initial flicker probably due to dodgy drm timing
-+#define AUX_SIZE 3
-+typedef struct drm_display_env_s
-+{
-+    AVClass *class;
-+
-+    int drm_fd;
-+    uint32_t con_id;
-+    struct drm_setup setup;
-+    enum AVPixelFormat avfmt;
-+    int show_all;
-+
-+    unsigned int ano;
-+    drm_aux_t aux[AUX_SIZE];
-+
-+    pthread_t q_thread;
-+    sem_t q_sem_in;
-+    sem_t q_sem_out;
-+    int q_terminate;
-+    AVFrame * q_next;
-+
-+} drm_display_env_t;
-+
-+
-+static int drm_vout_write_trailer(AVFormatContext *s)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
-+#endif
-+
-+    return 0;
-+}
-+
-+static int drm_vout_write_header(AVFormatContext *s)
-+{
-+    const AVCodecParameters * const par = s->streams[0]->codecpar;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
-+#endif
-+    if (   s->nb_streams > 1
-+        || par->codec_type != AVMEDIA_TYPE_VIDEO
-+        || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
-+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
-+        return AVERROR(EINVAL);
-+    }
-+
-+    return 0;
-+}
-+
-+static int find_plane(struct AVFormatContext * const avctx,
-+                      const int drmfd, const int crtcidx, const uint32_t format,
-+                      uint32_t * const pplane_id)
-+{
-+   drmModePlaneResPtr planes;
-+   drmModePlanePtr plane;
-+   unsigned int i;
-+   unsigned int j;
-+   int ret = 0;
-+
-+   planes = drmModeGetPlaneResources(drmfd);
-+   if (!planes)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drmModeGetPlaneResources failed: %s\n", ERRSTR);
-+       return -1;
-+   }
-+
-+   for (i = 0; i < planes->count_planes; ++i) {
-+      plane = drmModeGetPlane(drmfd, planes->planes[i]);
-+      if (!planes)
-+      {
-+          av_log(avctx, AV_LOG_WARNING, "drmModeGetPlane failed: %s\n", ERRSTR);
-+          break;
-+      }
-+
-+      if (!(plane->possible_crtcs & (1 << crtcidx))) {
-+         drmModeFreePlane(plane);
-+         continue;
-+      }
-+
-+      for (j = 0; j < plane->count_formats; ++j) {
-+         if (plane->formats[j] == format)
-+            break;
-+      }
-+
-+      if (j == plane->count_formats) {
-+         drmModeFreePlane(plane);
-+         continue;
-+      }
-+
-+      *pplane_id = plane->plane_id;
-+      drmModeFreePlane(plane);
-+      break;
-+   }
-+
-+   if (i == planes->count_planes)
-+      ret = -1;
-+
-+   drmModeFreePlaneResources(planes);
-+   return ret;
-+}
-+
-+static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
-+{
-+    if (da->fb_handle != 0) {
-+        drmModeRmFB(de->drm_fd, da->fb_handle);
-+        da->fb_handle = 0;
-+    }
-+
-+    for (unsigned int i = 0; i != AV_DRM_MAX_PLANES; ++i) {
-+        if (da->bo_handles[i]) {
-+            struct drm_gem_close gem_close = {.handle = da->bo_handles[i]};
-+            drmIoctl(de->drm_fd, DRM_IOCTL_GEM_CLOSE, &gem_close);
-+            da->bo_handles[i] = 0;
-+        }
-+    }
-+    av_frame_free(&da->frame);
-+}
-+
-+static int do_display(AVFormatContext * const s, drm_display_env_t * const de, AVFrame * frame)
-+{
-+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
-+    drm_aux_t * da = de->aux + de->ano;
-+    const uint32_t format = desc->layers[0].format;
-+    int ret = 0;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "<<< %s: fd=%d\n", __func__, desc->objects[0].fd);
-+#endif
-+
-+    if (de->setup.out_fourcc != format) {
-+        if (find_plane(s, de->drm_fd, de->setup.crtcIdx, format, &de->setup.planeId)) {
-+            av_frame_free(&frame);
-+            av_log(s, AV_LOG_WARNING, "No plane for format: %#x\n", format);
-+            return -1;
-+        }
-+        de->setup.out_fourcc = format;
-+    }
-+
-+    {
-+        drmVBlank vbl = {
-+            .request = {
-+                .type = DRM_VBLANK_RELATIVE,
-+                .sequence = 0
-+            }
-+        };
-+
-+        while (drmWaitVBlank(de->drm_fd, &vbl)) {
-+            if (errno != EINTR) {
-+//                av_log(s, AV_LOG_WARNING, "drmWaitVBlank failed: %s\n", ERRSTR);
-+                break;
-+            }
-+        }
-+    }
-+
-+    da_uninit(de, da);
-+
-+    {
-+        uint32_t pitches[4] = {0};
-+        uint32_t offsets[4] = {0};
-+        uint64_t modifiers[4] = {0};
-+        uint32_t bo_handles[4] = {0};
-+        int i, j, n;
-+
-+        da->frame = frame;
-+
-+        for (i = 0; i < desc->nb_objects; ++i) {
-+            if (drmPrimeFDToHandle(de->drm_fd, desc->objects[i].fd, da->bo_handles + i) != 0) {
-+                av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
-+                return -1;
-+            }
-+        }
-+
-+        n = 0;
-+        for (i = 0; i < desc->nb_layers; ++i) {
-+            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
-+                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
-+                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
-+                pitches[n] = p->pitch;
-+                offsets[n] = p->offset;
-+                modifiers[n] = obj->format_modifier;
-+                bo_handles[n] = da->bo_handles[p->object_index];
-+                ++n;
-+            }
-+        }
-+
-+#if 1 && TRACE_ALL
-+        av_log(s, AV_LOG_DEBUG, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
-+               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
-+               av_frame_cropped_width(frame),
-+               av_frame_cropped_height(frame),
-+               desc->layers[0].format,
-+               bo_handles[0],
-+               bo_handles[1],
-+               bo_handles[2],
-+               bo_handles[3],
-+               pitches[0],
-+               pitches[1],
-+               pitches[2],
-+               pitches[3],
-+               offsets[0],
-+               offsets[1],
-+               offsets[2],
-+               offsets[3],
-+               (long long)modifiers[0],
-+               (long long)modifiers[1],
-+               (long long)modifiers[2],
-+               (long long)modifiers[3]
-+               );
-+#endif
-+
-+        if (drmModeAddFB2WithModifiers(de->drm_fd,
-+                                         av_frame_cropped_width(frame),
-+                                         av_frame_cropped_height(frame),
-+                                         desc->layers[0].format, bo_handles,
-+                                         pitches, offsets, modifiers,
-+                                         &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
-+            av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
-+            return -1;
-+        }
-+    }
-+
-+    ret = drmModeSetPlane(de->drm_fd, de->setup.planeId, de->setup.crtcId,
-+                              da->fb_handle, 0,
-+                de->setup.compose.x, de->setup.compose.y,
-+                de->setup.compose.width,
-+                de->setup.compose.height,
-+                0, 0,
-+                av_frame_cropped_width(frame) << 16,
-+                av_frame_cropped_height(frame) << 16);
-+
-+    if (ret != 0) {
-+        av_log(s, AV_LOG_WARNING, "drmModeSetPlane failed: %s\n", ERRSTR);
-+    }
-+
-+    de->ano = de->ano + 1 >= AUX_SIZE ? 0 : de->ano + 1;
-+
-+    return ret;
-+}
-+
-+static int do_sem_wait(sem_t * const sem, const int nowait)
-+{
-+    while (nowait ? sem_trywait(sem) : sem_wait(sem)) {
-+        if (errno != EINTR)
-+            return -errno;
-+    }
-+    return 0;
-+}
-+
-+static void * display_thread(void * v)
-+{
-+    AVFormatContext * const s = v;
-+    drm_display_env_t * const de = s->priv_data;
-+    int i;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+#endif
-+
-+    sem_post(&de->q_sem_out);
-+
-+    for (;;) {
-+        AVFrame * frame;
-+
-+        do_sem_wait(&de->q_sem_in, 0);
-+
-+        if (de->q_terminate)
-+            break;
-+
-+        frame = de->q_next;
-+        de->q_next = NULL;
-+        sem_post(&de->q_sem_out);
-+
-+        do_display(s, de, frame);
-+    }
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+#endif
-+
-+    for (i = 0; i != AUX_SIZE; ++i)
-+        da_uninit(de, de->aux + i);
-+
-+    av_frame_free(&de->q_next);
-+
-+    return NULL;
-+}
-+
-+static int drm_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
-+{
-+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
-+    AVFrame * frame;
-+    drm_display_env_t * const de = s->priv_data;
-+    int ret;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s\n", __func__);
-+#endif
-+
-+    if ((src_frame->flags & AV_FRAME_FLAG_CORRUPT) != 0) {
-+        av_log(s, AV_LOG_WARNING, "Discard corrupt frame: fmt=%d, ts=%" PRId64 "\n", src_frame->format, src_frame->pts);
-+        return 0;
-+    }
-+
-+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
-+        frame = av_frame_alloc();
-+        av_frame_ref(frame, src_frame);
-+    }
-+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
-+        frame = av_frame_alloc();
-+        frame->format = AV_PIX_FMT_DRM_PRIME;
-+        if (av_hwframe_map(frame, src_frame, 0) != 0)
-+        {
-+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
-+            av_frame_free(&frame);
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+    else {
-+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    ret = do_sem_wait(&de->q_sem_out, !de->show_all);
-+    if (ret) {
-+        av_frame_free(&frame);
-+    }
-+    else {
-+        de->q_next = frame;
-+        sem_post(&de->q_sem_in);
-+    }
-+
-+    return 0;
-+}
-+
-+static int drm_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
-+                          unsigned flags)
-+{
-+    av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
-+    return AVERROR_PATCHWELCOME;
-+}
-+
-+static int drm_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_DEBUG, "%s: %d\n", __func__, type);
-+#endif
-+    switch(type) {
-+    case AV_APP_TO_DEV_WINDOW_REPAINT:
-+        return 0;
-+    default:
-+        break;
-+    }
-+    return AVERROR(ENOSYS);
-+}
-+
-+static int find_crtc(struct AVFormatContext * const avctx, int drmfd, struct drm_setup *s, uint32_t * const pConId)
-+{
-+   int ret = -1;
-+   int i;
-+   drmModeRes *res = drmModeGetResources(drmfd);
-+   drmModeConnector *c;
-+
-+   if(!res)
-+   {
-+      printf( "drmModeGetResources failed: %s\n", ERRSTR);
-+      return -1;
-+   }
-+
-+   if (res->count_crtcs <= 0)
-+   {
-+      printf( "drm: no crts\n");
-+      goto fail_res;
-+   }
-+
-+   if (!s->conId) {
-+      fprintf(stderr,
-+         "No connector ID specified.  Choosing default from list:\n");
-+
-+      for (i = 0; i < res->count_connectors; i++) {
-+         drmModeConnector *con =
-+            drmModeGetConnector(drmfd, res->connectors[i]);
-+         drmModeEncoder *enc = NULL;
-+         drmModeCrtc *crtc = NULL;
-+
-+         if (con->encoder_id) {
-+            enc = drmModeGetEncoder(drmfd, con->encoder_id);
-+            if (enc->crtc_id) {
-+               crtc = drmModeGetCrtc(drmfd, enc->crtc_id);
-+            }
-+         }
-+
-+         if (!s->conId && crtc) {
-+            s->conId = con->connector_id;
-+            s->crtcId = crtc->crtc_id;
-+         }
-+
-+         av_log(avctx, AV_LOG_DEBUG, "Connector %d (crtc %d): type %d, %dx%d%s\n",
-+                con->connector_id,
-+                crtc ? crtc->crtc_id : 0,
-+                con->connector_type,
-+                crtc ? crtc->width : 0,
-+                crtc ? crtc->height : 0,
-+                (s->conId == (int)con->connector_id ?
-+            " (chosen)" : ""));
-+      }
-+
-+      if (!s->conId) {
-+         av_log(avctx, AV_LOG_ERROR,
-+            "No suitable enabled connector found.\n");
-+         return -1;;
-+      }
-+   }
-+
-+   s->crtcIdx = -1;
-+
-+   for (i = 0; i < res->count_crtcs; ++i) {
-+      if (s->crtcId == res->crtcs[i]) {
-+         s->crtcIdx = i;
-+         break;
-+      }
-+   }
-+
-+   if (s->crtcIdx == -1)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drm: CRTC %u not found\n", s->crtcId);
-+       goto fail_res;
-+   }
-+
-+   if (res->count_connectors <= 0)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drm: no connectors\n");
-+       goto fail_res;
-+   }
-+
-+   c = drmModeGetConnector(drmfd, s->conId);
-+   if (!c)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "drmModeGetConnector failed: %s\n", ERRSTR);
-+       goto fail_res;
-+   }
-+
-+   if (!c->count_modes)
-+   {
-+       av_log(avctx, AV_LOG_WARNING, "connector supports no mode\n");
-+       goto fail_conn;
-+   }
-+
-+   {
-+      drmModeCrtc *crtc = drmModeGetCrtc(drmfd, s->crtcId);
-+      s->compose.x = crtc->x;
-+      s->compose.y = crtc->y;
-+      s->compose.width = crtc->width;
-+      s->compose.height = crtc->height;
-+      drmModeFreeCrtc(crtc);
-+   }
-+
-+   if (pConId)
-+      *pConId = c->connector_id;
-+   ret = 0;
-+
-+fail_conn:
-+   drmModeFreeConnector(c);
-+
-+fail_res:
-+   drmModeFreeResources(res);
-+
-+   return ret;
-+}
-+
-+// deinit is called if init fails so no need to clean up explicity here
-+static int drm_vout_init(struct AVFormatContext * s)
-+{
-+    drm_display_env_t * const de = s->priv_data;
-+    int rv;
-+    const char * drm_module = DRM_MODULE;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    de->drm_fd = -1;
-+    de->con_id = 0;
-+    de->setup = (struct drm_setup){0};
-+    de->q_terminate = 0;
-+
-+    if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
-+    {
-+        rv = AVERROR(errno);
-+        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
-+        return rv;
-+    }
-+
-+    if (find_crtc(s, de->drm_fd, &de->setup, &de->con_id) != 0)
-+    {
-+        av_log(s, AV_LOG_ERROR, "failed to find valid mode\n");
-+        rv = AVERROR(EINVAL);
-+        goto fail_close;
-+    }
-+
-+    sem_init(&de->q_sem_in, 0, 0);
-+    sem_init(&de->q_sem_out, 0, 0);
-+    if (pthread_create(&de->q_thread, NULL, display_thread, s)) {
-+        rv = AVERROR(errno);
-+        av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv));
-+        goto fail_close;
-+    }
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+
-+    return 0;
-+
-+fail_close:
-+    close(de->drm_fd);
-+    de->drm_fd = -1;
-+    av_log(s, AV_LOG_DEBUG, ">>> %s: FAIL\n", __func__);
-+
-+    return rv;
-+}
-+
-+static void drm_vout_deinit(struct AVFormatContext * s)
-+{
-+    drm_display_env_t * const de = s->priv_data;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    de->q_terminate = 1;
-+    sem_post(&de->q_sem_in);
-+    pthread_join(de->q_thread, NULL);
-+    sem_destroy(&de->q_sem_in);
-+    sem_destroy(&de->q_sem_out);
-+
-+    for (unsigned int i = 0; i != AUX_SIZE; ++i)
-+        da_uninit(de, de->aux + i);
-+
-+    av_frame_free(&de->q_next);
-+
-+    if (de->drm_fd >= 0) {
-+        close(de->drm_fd);
-+        de->drm_fd = -1;
-+    }
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+}
-+
-+
-+#define OFFSET(x) offsetof(drm_display_env_t, x)
-+static const AVOption options[] = {
-+    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { NULL }
-+};
-+
-+static const AVClass drm_vout_class = {
-+    .class_name = "drm vid outdev",
-+    .item_name  = av_default_item_name,
-+    .option     = options,
-+    .version    = LIBAVUTIL_VERSION_INT,
-+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
-+};
-+
-+FFOutputFormat ff_vout_drm_muxer = {
-+    .p = {
-+        .name           = "vout_drm",
-+        .long_name      = NULL_IF_CONFIG_SMALL("Drm video output device"),
-+        .audio_codec    = AV_CODEC_ID_NONE,
-+        .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
-+        .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
-+        .priv_class     = &drm_vout_class,
-+    },
-+    .priv_data_size = sizeof(drm_display_env_t),
-+    .write_header   = drm_vout_write_header,
-+    .write_packet   = drm_vout_write_packet,
-+    .write_uncoded_frame = drm_vout_write_frame,
-+    .write_trailer  = drm_vout_write_trailer,
-+    .control_message = drm_vout_control_message,
-+    .init           = drm_vout_init,
-+    .deinit         = drm_vout_deinit,
-+};
-+
-
-From cc536672adf4eefeaec16e9808f583c693ad7819 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 28 Apr 2021 11:34:18 +0100
-Subject: [PATCH 015/136] Add vout_egl
-
----
- configure                |   6 +
- libavdevice/Makefile     |   1 +
- libavdevice/alldevices.c |   1 +
- libavdevice/egl_vout.c   | 811 +++++++++++++++++++++++++++++++++++++++
- 4 files changed, 819 insertions(+)
- create mode 100644 libavdevice/egl_vout.c
-
-diff --git a/configure b/configure
-index 49744cab19..b41663c794 100755
---- a/configure
-+++ b/configure
-@@ -347,6 +347,7 @@ External library support:
-   --enable-mmal            enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
-   --enable-sand            enable sand video formats [rpi]
-   --enable-vout-drm        enable the vout_drm module - for internal testing only [no]
-+  --enable-vout-egl        enable the vout_egl module - for internal testing only [no]
-   --disable-nvdec          disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
-   --disable-nvenc          disable Nvidia video encoding code [autodetect]
-   --enable-omx             enable OpenMAX IL code [no]
-@@ -1818,6 +1819,7 @@ EXTERNAL_LIBRARY_LIST="
-     libdav1d
-     libdc1394
-     libdrm
-+    epoxy
-     libflite
-     libfontconfig
-     libfreetype
-@@ -1942,6 +1944,7 @@ FEATURE_LIST="
-     static
-     swscale_alpha
-     vout_drm
-+    vout_egl
- "
- 
- # this list should be kept in linking order
-@@ -3565,6 +3568,8 @@ v4l2_outdev_deps="libdrm"
- v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
- v4l2_outdev_suggest="libv4l2"
- vout_drm_outdev_deps="libdrm vout_drm"
-+vout_egl_outdev_deps="xlib"
-+vout_egl_outdev_select="epoxy"
- vfwcap_indev_deps="vfw32 vfwcap_defines"
- xcbgrab_indev_deps="libxcb"
- xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
-@@ -6596,6 +6601,7 @@ enabled libdav1d          && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d
- enabled libdavs2          && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open
- enabled libdc1394         && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new
- enabled libdrm            && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion
-+enabled epoxy             && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
- enabled libfdk_aac        && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen ||
-                                { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac &&
-                                  warn "using libfdk without pkg-config"; } }
-diff --git a/libavdevice/Makefile b/libavdevice/Makefile
-index 36aac30186..0989cb895f 100644
---- a/libavdevice/Makefile
-+++ b/libavdevice/Makefile
-@@ -49,6 +49,7 @@ OBJS-$(CONFIG_V4L2_INDEV)                += v4l2.o v4l2-common.o timefilter.o
- OBJS-$(CONFIG_V4L2_OUTDEV)               += v4l2enc.o v4l2-common.o
- OBJS-$(CONFIG_VFWCAP_INDEV)              += vfwcap.o
- OBJS-$(CONFIG_VOUT_DRM_OUTDEV)           += drm_vout.o
-+OBJS-$(CONFIG_VOUT_EGL_OUTDEV)           += egl_vout.o
- OBJS-$(CONFIG_XCBGRAB_INDEV)             += xcbgrab.o
- OBJS-$(CONFIG_XV_OUTDEV)                 += xv.o
- 
-diff --git a/libavdevice/alldevices.c b/libavdevice/alldevices.c
-index e2a8669f27..ffb410b92d 100644
---- a/libavdevice/alldevices.c
-+++ b/libavdevice/alldevices.c
-@@ -53,6 +53,7 @@ extern const AVInputFormat  ff_v4l2_demuxer;
- extern const FFOutputFormat ff_v4l2_muxer;
- extern const AVInputFormat  ff_vfwcap_demuxer;
- extern const FFOutputFormat ff_vout_drm_muxer;
-+extern const FFOutputFormat ff_vout_egl_muxer;
- extern const AVInputFormat  ff_xcbgrab_demuxer;
- extern const FFOutputFormat ff_xv_muxer;
- 
-diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
-new file mode 100644
-index 0000000000..7b9c610ace
---- /dev/null
-+++ b/libavdevice/egl_vout.c
-@@ -0,0 +1,811 @@
-+/*
-+ * Copyright (c) 2020 John Cox for Raspberry Pi Trading
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+// *** This module is a work in progress and its utility is strictly
-+//     limited to testing.
-+//     Amongst other issues it doesn't wait for the pic to be displayed before
-+//     returning the buffer so flikering does occur.
-+
-+#include <epoxy/gl.h>
-+#include <epoxy/egl.h>
-+
-+#include "libavutil/opt.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/imgutils.h"
-+#include "libavutil/hwcontext_drm.h"
-+#include "libavformat/mux.h"
-+#include "avdevice.h"
-+
-+#include "pthread.h"
-+#include <semaphore.h>
-+#include <stdatomic.h>
-+#include <unistd.h>
-+
-+#include <X11/Xlib.h>
-+#include <X11/Xutil.h>
-+
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#define TRACE_ALL 0
-+
-+struct egl_setup {
-+   int conId;
-+
-+   Display *dpy;
-+   EGLDisplay egl_dpy;
-+   EGLContext ctx;
-+   EGLSurface surf;
-+   Window win;
-+
-+   uint32_t crtcId;
-+   int crtcIdx;
-+   uint32_t planeId;
-+   struct {
-+       int x, y, width, height;
-+   } compose;
-+};
-+
-+typedef struct egl_aux_s {
-+    int fd;
-+    GLuint texture;
-+
-+} egl_aux_t;
-+
-+typedef struct egl_display_env_s
-+{
-+    AVClass *class;
-+
-+    struct egl_setup setup;
-+    enum AVPixelFormat avfmt;
-+
-+    int show_all;
-+    int window_width, window_height;
-+    int window_x, window_y;
-+    int fullscreen;
-+
-+    egl_aux_t aux[32];
-+
-+    pthread_t q_thread;
-+    pthread_mutex_t q_lock;
-+    sem_t display_start_sem;
-+    sem_t q_sem;
-+    int q_terminate;
-+    AVFrame * q_this;
-+    AVFrame * q_next;
-+
-+} egl_display_env_t;
-+
-+
-+/**
-+ * Remove window border/decorations.
-+ */
-+static void
-+no_border( Display *dpy, Window w)
-+{
-+   static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
-+   static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
-+
-+   typedef struct
-+   {
-+      unsigned long       flags;
-+      unsigned long       functions;
-+      unsigned long       decorations;
-+      long                inputMode;
-+      unsigned long       status;
-+   } PropMotifWmHints;
-+
-+   PropMotifWmHints motif_hints;
-+   Atom prop, proptype;
-+   unsigned long flags = 0;
-+
-+   /* setup the property */
-+   motif_hints.flags = MWM_HINTS_DECORATIONS;
-+   motif_hints.decorations = flags;
-+
-+   /* get the atom for the property */
-+   prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
-+   if (!prop) {
-+      /* something went wrong! */
-+      return;
-+   }
-+
-+   /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
-+   proptype = prop;
-+
-+   XChangeProperty( dpy, w,                         /* display, window */
-+                    prop, proptype,                 /* property, type */
-+                    32,                             /* format: 32-bit datums */
-+                    PropModeReplace,                /* mode */
-+                    (unsigned char *) &motif_hints, /* data */
-+                    PROP_MOTIF_WM_HINTS_ELEMENTS    /* nelements */
-+                  );
-+}
-+
-+
-+/*
-+ * Create an RGB, double-buffered window.
-+ * Return the window and context handles.
-+ */
-+static int
-+make_window(struct AVFormatContext * const s,
-+            egl_display_env_t * const de,
-+            Display *dpy, EGLDisplay egl_dpy, const char *name,
-+            Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
-+{
-+   int scrnum = DefaultScreen( dpy );
-+   XSetWindowAttributes attr;
-+   unsigned long mask;
-+   Window root = RootWindow( dpy, scrnum );
-+   Window win;
-+   EGLContext ctx;
-+   const int fullscreen = de->fullscreen;
-+   EGLConfig config;
-+   int x = de->window_x;
-+   int y = de->window_y;
-+   int width = de->window_width ? de->window_width : 1280;
-+   int height = de->window_height ? de->window_height : 720;
-+
-+
-+   if (fullscreen) {
-+      int scrnum = DefaultScreen(dpy);
-+
-+      x = 0; y = 0;
-+      width = DisplayWidth(dpy, scrnum);
-+      height = DisplayHeight(dpy, scrnum);
-+   }
-+
-+   {
-+      EGLint num_configs;
-+      static const EGLint attribs[] = {
-+         EGL_RED_SIZE, 1,
-+         EGL_GREEN_SIZE, 1,
-+         EGL_BLUE_SIZE, 1,
-+         EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
-+         EGL_NONE
-+      };
-+
-+      if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
-+         av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
-+         return -1;
-+      }
-+   }
-+
-+   {
-+      EGLint vid;
-+      if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
-+         av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
-+         return -1;
-+      }
-+
-+      {
-+         XVisualInfo visTemplate = {
-+            .visualid = vid,
-+         };
-+         int num_visuals;
-+         XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
-+                                               &visTemplate, &num_visuals);
-+
-+         /* window attributes */
-+         attr.background_pixel = 0;
-+         attr.border_pixel = 0;
-+         attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
-+         attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
-+         /* XXX this is a bad way to get a borderless window! */
-+         mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
-+
-+         win = XCreateWindow( dpy, root, x, y, width, height,
-+                              0, visinfo->depth, InputOutput,
-+                              visinfo->visual, mask, &attr );
-+         XFree(visinfo);
-+      }
-+   }
-+
-+   if (fullscreen)
-+      no_border(dpy, win);
-+
-+   /* set hints and properties */
-+   {
-+      XSizeHints sizehints;
-+      sizehints.x = x;
-+      sizehints.y = y;
-+      sizehints.width  = width;
-+      sizehints.height = height;
-+      sizehints.flags = USSize | USPosition;
-+      XSetNormalHints(dpy, win, &sizehints);
-+      XSetStandardProperties(dpy, win, name, name,
-+                              None, (char **)NULL, 0, &sizehints);
-+   }
-+
-+   eglBindAPI(EGL_OPENGL_ES_API);
-+
-+   {
-+      static const EGLint ctx_attribs[] = {
-+         EGL_CONTEXT_CLIENT_VERSION, 2,
-+         EGL_NONE
-+      };
-+      ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
-+      if (!ctx) {
-+         av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
-+         return -1;
-+      }
-+   }
-+
-+
-+   XMapWindow(dpy, win);
-+
-+   {
-+      EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
-+      if (!surf) {
-+         av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
-+         return -1;
-+      }
-+
-+      if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
-+         av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
-+         return -1;
-+      }
-+
-+      *winRet = win;
-+      *ctxRet = ctx;
-+      *surfRet = surf;
-+   }
-+
-+   return 0;
-+}
-+
-+static GLint
-+compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
-+{
-+   GLuint s = glCreateShader(target);
-+
-+   if (s == 0) {
-+      av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
-+      return 0;
-+   }
-+
-+   glShaderSource(s, 1, (const GLchar **) &source, NULL);
-+   glCompileShader(s);
-+
-+   {
-+      GLint ok;
-+      glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
-+
-+      if (!ok) {
-+         GLchar *info;
-+         GLint size;
-+
-+         glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
-+         info = malloc(size);
-+
-+         glGetShaderInfoLog(s, size, NULL, info);
-+         av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
-+
-+         return 0;
-+      }
-+   }
-+
-+   return s;
-+}
-+
-+static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
-+{
-+   GLuint prog = glCreateProgram();
-+
-+   if (prog == 0) {
-+      av_log(s, AV_LOG_ERROR, "Failed to create program\n");
-+      return 0;
-+   }
-+
-+   glAttachShader(prog, vs);
-+   glAttachShader(prog, fs);
-+   glLinkProgram(prog);
-+
-+   {
-+      GLint ok;
-+      glGetProgramiv(prog, GL_LINK_STATUS, &ok);
-+      if (!ok) {
-+         /* Some drivers return a size of 1 for an empty log.  This is the size
-+          * of a log that contains only a terminating NUL character.
-+          */
-+         GLint size;
-+         GLchar *info = NULL;
-+         glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
-+         if (size > 1) {
-+            info = malloc(size);
-+            glGetProgramInfoLog(prog, size, NULL, info);
-+         }
-+
-+         av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
-+                 (info != NULL) ? info : "<empty log>");
-+         return 0;
-+      }
-+   }
-+
-+   return prog;
-+}
-+
-+static int
-+gl_setup(struct AVFormatContext * const s)
-+{
-+   const char *vs =
-+      "attribute vec4 pos;\n"
-+      "varying vec2 texcoord;\n"
-+      "\n"
-+      "void main() {\n"
-+      "  gl_Position = pos;\n"
-+      "  texcoord.x = (pos.x + 1.0) / 2.0;\n"
-+      "  texcoord.y = (-pos.y + 1.0) / 2.0;\n"
-+      "}\n";
-+   const char *fs =
-+      "#extension GL_OES_EGL_image_external : enable\n"
-+      "precision mediump float;\n"
-+      "uniform samplerExternalOES s;\n"
-+      "varying vec2 texcoord;\n"
-+      "void main() {\n"
-+      "  gl_FragColor = texture2D(s, texcoord);\n"
-+      "}\n";
-+
-+   GLuint vs_s;
-+   GLuint fs_s;
-+   GLuint prog;
-+
-+   if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
-+       !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
-+       !(prog = link_program(s, vs_s, fs_s)))
-+      return -1;
-+
-+   glUseProgram(prog);
-+
-+   {
-+      static const float verts[] = {
-+         -1, -1,
-+         1, -1,
-+         1, 1,
-+         -1, 1,
-+      };
-+      glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
-+   }
-+
-+   glEnableVertexAttribArray(0);
-+   return 0;
-+}
-+
-+static int egl_vout_write_trailer(AVFormatContext *s)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+
-+    return 0;
-+}
-+
-+static int egl_vout_write_header(AVFormatContext *s)
-+{
-+    const AVCodecParameters * const par = s->streams[0]->codecpar;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+    if (   s->nb_streams > 1
-+        || par->codec_type != AVMEDIA_TYPE_VIDEO
-+        || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
-+        av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
-+        return AVERROR(EINVAL);
-+    }
-+
-+    return 0;
-+}
-+
-+
-+static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
-+{
-+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
-+    egl_aux_t * da = NULL;
-+    unsigned int i;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
-+#endif
-+
-+    for (i = 0; i != 32; ++i) {
-+        if (de->aux[i].fd == -1 || de->aux[i].fd == desc->objects[0].fd) {
-+            da = de->aux + i;
-+            break;
-+        }
-+    }
-+
-+    if (da == NULL) {
-+        av_log(s, AV_LOG_INFO, "%s: Out of handles\n", __func__);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    if (da->texture == 0) {
-+        EGLint attribs[50];
-+        EGLint * a = attribs;
-+        int i, j;
-+        static const EGLint anames[] = {
-+           EGL_DMA_BUF_PLANE0_FD_EXT,
-+           EGL_DMA_BUF_PLANE0_OFFSET_EXT,
-+           EGL_DMA_BUF_PLANE0_PITCH_EXT,
-+           EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
-+           EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
-+           EGL_DMA_BUF_PLANE1_FD_EXT,
-+           EGL_DMA_BUF_PLANE1_OFFSET_EXT,
-+           EGL_DMA_BUF_PLANE1_PITCH_EXT,
-+           EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
-+           EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
-+           EGL_DMA_BUF_PLANE2_FD_EXT,
-+           EGL_DMA_BUF_PLANE2_OFFSET_EXT,
-+           EGL_DMA_BUF_PLANE2_PITCH_EXT,
-+           EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
-+           EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
-+        };
-+        const EGLint * b = anames;
-+
-+        *a++ = EGL_WIDTH;
-+        *a++ = av_frame_cropped_width(frame);
-+        *a++ = EGL_HEIGHT;
-+        *a++ = av_frame_cropped_height(frame);
-+        *a++ = EGL_LINUX_DRM_FOURCC_EXT;
-+        *a++ = desc->layers[0].format;
-+
-+        for (i = 0; i < desc->nb_layers; ++i) {
-+            for (j = 0; j < desc->layers[i].nb_planes; ++j) {
-+                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
-+                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
-+                *a++ = *b++;
-+                *a++ = obj->fd;
-+                *a++ = *b++;
-+                *a++ = p->offset;
-+                *a++ = *b++;
-+                *a++ = p->pitch;
-+                if (obj->format_modifier == 0) {
-+                   b += 2;
-+                }
-+                else {
-+                   *a++ = *b++;
-+                   *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
-+                   *a++ = *b++;
-+                   *a++ = (EGLint)(obj->format_modifier >> 32);
-+                }
-+            }
-+        }
-+
-+        *a = EGL_NONE;
-+
-+#if TRACE_ALL
-+        for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
-+           av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
-+        }
-+#endif
-+        {
-+           const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
-+                                              EGL_NO_CONTEXT,
-+                                              EGL_LINUX_DMA_BUF_EXT,
-+                                              NULL, attribs);
-+           if (!image) {
-+              av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
-+              return -1;
-+           }
-+
-+           glGenTextures(1, &da->texture);
-+           glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
-+           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-+           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-+           glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
-+
-+           eglDestroyImageKHR(de->setup.egl_dpy, image);
-+        }
-+
-+        da->fd = desc->objects[0].fd;
-+
-+#if 0
-+        av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
-+               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
-+               av_frame_cropped_width(frame),
-+               av_frame_cropped_height(frame),
-+               desc->layers[0].format,
-+               bo_plane_handles[0],
-+               bo_plane_handles[1],
-+               bo_plane_handles[2],
-+               bo_plane_handles[3],
-+               pitches[0],
-+               pitches[1],
-+               pitches[2],
-+               pitches[3],
-+               offsets[0],
-+               offsets[1],
-+               offsets[2],
-+               offsets[3],
-+               (long long)modifiers[0],
-+               (long long)modifiers[1],
-+               (long long)modifiers[2],
-+               (long long)modifiers[3]
-+               );
-+#endif
-+    }
-+
-+    glClearColor(0.5, 0.5, 0.5, 0.5);
-+    glClear(GL_COLOR_BUFFER_BIT);
-+
-+    glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
-+    glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
-+    eglSwapBuffers(de->setup.egl_dpy, de->setup.surf);
-+
-+    glDeleteTextures(1, &da->texture);
-+    da->texture = 0;
-+    da->fd = -1;
-+
-+    return 0;
-+}
-+
-+static void * display_thread(void * v)
-+{
-+    AVFormatContext * const s = v;
-+    egl_display_env_t * const de = s->priv_data;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
-+#endif
-+    {
-+       EGLint egl_major, egl_minor;
-+
-+       de->setup.dpy = XOpenDisplay(NULL);
-+       if (!de->setup.dpy) {
-+          av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
-+          goto fail;
-+       }
-+
-+       de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
-+       if (!de->setup.egl_dpy) {
-+          av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
-+          goto fail;
-+       }
-+
-+       if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
-+           av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
-+           goto fail;
-+       }
-+
-+       av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
-+
-+       if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
-+          av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
-+          goto fail;
-+       }
-+    }
-+
-+    if (!de->window_width || !de->window_height) {
-+       de->window_width = 1280;
-+       de->window_height = 720;
-+    }
-+    if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
-+                    &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
-+       av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
-+       goto fail;
-+    }
-+
-+    if (gl_setup(s)) {
-+       av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
-+       goto fail;
-+    }
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "--- %s: Start done\n", __func__);
-+#endif
-+    sem_post(&de->display_start_sem);
-+
-+    for (;;) {
-+        AVFrame * frame;
-+
-+        while (sem_wait(&de->q_sem) != 0) {
-+            av_assert0(errno == EINTR);
-+        }
-+
-+        if (de->q_terminate)
-+            break;
-+
-+        pthread_mutex_lock(&de->q_lock);
-+        frame = de->q_next;
-+        de->q_next = NULL;
-+        pthread_mutex_unlock(&de->q_lock);
-+
-+        do_display(s, de, frame);
-+
-+        av_frame_free(&de->q_this);
-+        de->q_this = frame;
-+    }
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, ">>> %s\n", __func__);
-+#endif
-+
-+    return NULL;
-+
-+fail:
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, ">>> %s: FAIL\n", __func__);
-+#endif
-+    de->q_terminate = 1;
-+    sem_post(&de->display_start_sem);
-+
-+    return NULL;
-+}
-+
-+static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
-+{
-+    const AVFrame * const src_frame = (AVFrame *)pkt->data;
-+    AVFrame * frame;
-+    egl_display_env_t * const de = s->priv_data;
-+
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s\n", __func__);
-+#endif
-+
-+    if (src_frame->format == AV_PIX_FMT_DRM_PRIME) {
-+        frame = av_frame_alloc();
-+        av_frame_ref(frame, src_frame);
-+    }
-+    else if (src_frame->format == AV_PIX_FMT_VAAPI) {
-+        frame = av_frame_alloc();
-+        frame->format = AV_PIX_FMT_DRM_PRIME;
-+        if (av_hwframe_map(frame, src_frame, 0) != 0)
-+        {
-+            av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
-+            av_frame_free(&frame);
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+    else {
-+        av_log(s, AV_LOG_WARNING, "Frame (format=%d) not DRM_PRiME\n", src_frame->format);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // Really hacky sync
-+    while (de->show_all && de->q_next) {
-+       usleep(3000);
-+    }
-+
-+    pthread_mutex_lock(&de->q_lock);
-+    {
-+        AVFrame * const t = de->q_next;
-+        de->q_next = frame;
-+        frame = t;
-+    }
-+    pthread_mutex_unlock(&de->q_lock);
-+
-+    if (frame == NULL)
-+        sem_post(&de->q_sem);
-+    else
-+        av_frame_free(&frame);
-+
-+    return 0;
-+}
-+
-+static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
-+                          unsigned flags)
-+{
-+    av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
-+    return AVERROR_PATCHWELCOME;
-+}
-+
-+static int egl_vout_control_message(AVFormatContext *s, int type, void *data, size_t data_size)
-+{
-+#if TRACE_ALL
-+    av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
-+#endif
-+    switch(type) {
-+    case AV_APP_TO_DEV_WINDOW_REPAINT:
-+        return 0;
-+    default:
-+        break;
-+    }
-+    return AVERROR(ENOSYS);
-+}
-+
-+// deinit is called if init fails so no need to clean up explicity here
-+static int egl_vout_init(struct AVFormatContext * s)
-+{
-+    egl_display_env_t * const de = s->priv_data;
-+    unsigned int i;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    de->setup = (struct egl_setup){0};
-+
-+    for (i = 0; i != 32; ++i) {
-+        de->aux[i].fd = -1;
-+    }
-+
-+    de->q_terminate = 0;
-+    pthread_mutex_init(&de->q_lock, NULL);
-+    sem_init(&de->q_sem, 0, 0);
-+    sem_init(&de->display_start_sem, 0, 0);
-+    av_assert0(pthread_create(&de->q_thread, NULL, display_thread, s) == 0);
-+
-+    sem_wait(&de->display_start_sem);
-+    if (de->q_terminate) {
-+       av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
-+       return -1;
-+    }
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+
-+    return 0;
-+}
-+
-+static void egl_vout_deinit(struct AVFormatContext * s)
-+{
-+    egl_display_env_t * const de = s->priv_data;
-+
-+    av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
-+
-+    de->q_terminate = 1;
-+    sem_post(&de->q_sem);
-+    pthread_join(de->q_thread, NULL);
-+    sem_destroy(&de->q_sem);
-+    pthread_mutex_destroy(&de->q_lock);
-+
-+    av_frame_free(&de->q_next);
-+    av_frame_free(&de->q_this);
-+
-+    av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-+}
-+
-+#define OFFSET(x) offsetof(egl_display_env_t, x)
-+static const AVOption options[] = {
-+   { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+   { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { NULL }
-+
-+};
-+
-+static const AVClass egl_vout_class = {
-+    .class_name = "egl vid outdev",
-+    .item_name  = av_default_item_name,
-+    .option     = options,
-+    .version    = LIBAVUTIL_VERSION_INT,
-+    .category   = AV_CLASS_CATEGORY_DEVICE_VIDEO_OUTPUT,
-+};
-+
-+FFOutputFormat ff_vout_egl_muxer = {
-+    .p = {
-+        .name           = "vout_egl",
-+        .long_name      = NULL_IF_CONFIG_SMALL("Egl video output device"),
-+        .audio_codec    = AV_CODEC_ID_NONE,
-+        .video_codec    = AV_CODEC_ID_WRAPPED_AVFRAME,
-+        .flags          = AVFMT_NOFILE | AVFMT_VARIABLE_FPS | AVFMT_NOTIMESTAMPS,
-+        .priv_class     = &egl_vout_class,
-+    },
-+    .priv_data_size = sizeof(egl_display_env_t),
-+    .write_header   = egl_vout_write_header,
-+    .write_packet   = egl_vout_write_packet,
-+    .write_uncoded_frame = egl_vout_write_frame,
-+    .write_trailer  = egl_vout_write_trailer,
-+    .control_message = egl_vout_control_message,
-+    .init           = egl_vout_init,
-+    .deinit         = egl_vout_deinit,
-+};
-+
-
-From 867bd7c243e66a1c1756878e20df8f35db8025ec Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 28 Apr 2021 12:51:22 +0100
-Subject: [PATCH 016/136] V4L2 stateful rework
-
----
- libavcodec/Makefile       |   3 +-
- libavcodec/v4l2_buffers.c | 556 +++++++++++++++++++++++++++-----------
- libavcodec/v4l2_buffers.h |  28 +-
- libavcodec/v4l2_context.c | 536 +++++++++++++++++++++++++++---------
- libavcodec/v4l2_context.h |  20 +-
- libavcodec/v4l2_m2m.c     |  20 +-
- libavcodec/v4l2_m2m.h     |  31 +++
- libavcodec/v4l2_m2m_dec.c | 446 ++++++++++++++++++++++++++----
- 8 files changed, 1286 insertions(+), 354 deletions(-)
-
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 2d440b5648..e1aa0ba014 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -169,7 +169,8 @@ OBJS-$(CONFIG_VIDEODSP)                += videodsp.o
- OBJS-$(CONFIG_VP3DSP)                  += vp3dsp.o
- OBJS-$(CONFIG_VP56DSP)                 += vp56dsp.o
- OBJS-$(CONFIG_VP8DSP)                  += vp8dsp.o
--OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o
-+OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
-+                                          weak_link.o
- OBJS-$(CONFIG_V4L2_REQUEST)            += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
- 					  v4l2_req_devscan.o weak_link.o
- OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 3f5471067a..a003934ca1 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -21,6 +21,7 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#include <drm_fourcc.h>
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
- #include <sys/mman.h>
-@@ -29,12 +30,14 @@
- #include <poll.h>
- #include "libavcodec/avcodec.h"
- #include "libavutil/pixdesc.h"
-+#include "libavutil/hwcontext.h"
- #include "v4l2_context.h"
- #include "v4l2_buffers.h"
- #include "v4l2_m2m.h"
-+#include "weak_link.h"
- 
- #define USEC_PER_SEC 1000000
--static AVRational v4l2_timebase = { 1, USEC_PER_SEC };
-+static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
- 
- static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
- {
-@@ -51,34 +54,44 @@ static inline AVCodecContext *logger(V4L2Buffer *buf)
- static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
- {
-     V4L2m2mContext *s = buf_to_m2mctx(avbuf);
--
--    if (s->avctx->pkt_timebase.num)
--        return s->avctx->pkt_timebase;
--    return s->avctx->time_base;
-+    const AVRational tb = s->avctx->pkt_timebase.num ?
-+        s->avctx->pkt_timebase :
-+        s->avctx->time_base;
-+    return tb.num && tb.den ? tb : v4l2_timebase;
- }
- 
--static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts)
-+static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale)
- {
--    int64_t v4l2_pts;
--
--    if (pts == AV_NOPTS_VALUE)
--        pts = 0;
--
-     /* convert pts to v4l2 timebase */
--    v4l2_pts = av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
-+    const int64_t v4l2_pts =
-+        no_rescale ? pts :
-+        pts == AV_NOPTS_VALUE ? 0 :
-+            av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
-     out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
-     out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
- }
- 
--static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf)
-+static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale)
- {
--    int64_t v4l2_pts;
--
-     /* convert pts back to encoder timebase */
--    v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
-+    const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
-                         avbuf->buf.timestamp.tv_usec;
- 
--    return av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
-+    return
-+        no_rescale ? v4l2_pts :
-+        v4l2_pts == 0 ? AV_NOPTS_VALUE :
-+            av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
-+}
-+
-+static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
-+{
-+    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
-+        out->planes[plane].bytesused = bytesused;
-+        out->planes[plane].length = length;
-+    } else {
-+        out->buf.bytesused = bytesused;
-+        out->buf.length = length;
-+    }
- }
- 
- static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
-@@ -209,68 +222,143 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
-     return AVCOL_TRC_UNSPECIFIED;
- }
- 
--static void v4l2_free_buffer(void *opaque, uint8_t *unused)
-+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
- {
--    V4L2Buffer* avbuf = opaque;
--    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
-+    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
-+    AVDRMLayerDescriptor *layer;
- 
--    if (atomic_fetch_sub(&avbuf->context_refcount, 1) == 1) {
--        atomic_fetch_sub_explicit(&s->refcount, 1, memory_order_acq_rel);
-+    /* fill the DRM frame descriptor */
-+    drm_desc->nb_objects = avbuf->num_planes;
-+    drm_desc->nb_layers = 1;
- 
--        if (s->reinit) {
--            if (!atomic_load(&s->refcount))
--                sem_post(&s->refsync);
--        } else {
--            if (s->draining && V4L2_TYPE_IS_OUTPUT(avbuf->context->type)) {
--                /* no need to queue more buffers to the driver */
--                avbuf->status = V4L2BUF_AVAILABLE;
--            }
--            else if (avbuf->context->streamon)
--                ff_v4l2_buffer_enqueue(avbuf);
--        }
-+    layer = &drm_desc->layers[0];
-+    layer->nb_planes = avbuf->num_planes;
-+
-+    for (int i = 0; i < avbuf->num_planes; i++) {
-+        layer->planes[i].object_index = i;
-+        layer->planes[i].offset = 0;
-+        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
-+    }
-+
-+    switch (avbuf->context->av_pix_fmt) {
-+    case AV_PIX_FMT_YUYV422:
-+
-+        layer->format = DRM_FORMAT_YUYV;
-+        layer->nb_planes = 1;
-+
-+        break;
-+
-+    case AV_PIX_FMT_NV12:
-+    case AV_PIX_FMT_NV21:
-+
-+        layer->format = avbuf->context->av_pix_fmt == AV_PIX_FMT_NV12 ?
-+            DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 2;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            avbuf->context->format.fmt.pix.height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
-+        break;
-+
-+    case AV_PIX_FMT_YUV420P:
-+
-+        layer->format = DRM_FORMAT_YUV420;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 3;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            avbuf->context->format.fmt.pix.height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+
-+        layer->planes[2].object_index = 0;
-+        layer->planes[2].offset = layer->planes[1].offset +
-+            ((avbuf->plane_info[0].bytesperline *
-+              avbuf->context->format.fmt.pix.height) >> 2);
-+        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+        break;
- 
--        av_buffer_unref(&avbuf->context_ref);
-+    default:
-+        drm_desc->nb_layers = 0;
-+        break;
-     }
-+
-+    return (uint8_t *) drm_desc;
- }
- 
--static int v4l2_buf_increase_ref(V4L2Buffer *in)
-+static void v4l2_free_bufref(void *opaque, uint8_t *data)
- {
--    V4L2m2mContext *s = buf_to_m2mctx(in);
-+    AVBufferRef * bufref = (AVBufferRef *)data;
-+    V4L2Buffer *avbuf = (V4L2Buffer *)bufref->data;
-+    struct V4L2Context *ctx = ff_weak_link_lock(&avbuf->context_wl);
- 
--    if (in->context_ref)
--        atomic_fetch_add(&in->context_refcount, 1);
--    else {
--        in->context_ref = av_buffer_ref(s->self_ref);
--        if (!in->context_ref)
--            return AVERROR(ENOMEM);
-+    if (ctx != NULL) {
-+        // Buffer still attached to context
-+        V4L2m2mContext *s = buf_to_m2mctx(avbuf);
- 
--        in->context_refcount = 1;
--    }
-+        ff_mutex_lock(&ctx->lock);
- 
--    in->status = V4L2BUF_RET_USER;
--    atomic_fetch_add_explicit(&s->refcount, 1, memory_order_relaxed);
-+        avbuf->status = V4L2BUF_AVAILABLE;
- 
--    return 0;
-+        if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
-+            /* no need to queue more buffers to the driver */
-+        }
-+        else if (ctx->streamon) {
-+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer requeue\n", ctx->name);
-+            avbuf->buf.timestamp.tv_sec = 0;
-+            avbuf->buf.timestamp.tv_usec = 0;
-+            ff_v4l2_buffer_enqueue(avbuf);  // will set to IN_DRIVER
-+        }
-+        else {
-+            av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer freed but streamoff\n", ctx->name);
-+        }
-+
-+        ff_mutex_unlock(&ctx->lock);
-+    }
-+
-+    ff_weak_link_unlock(avbuf->context_wl);
-+    av_buffer_unref(&bufref);
- }
- 
--static int v4l2_buf_to_bufref(V4L2Buffer *in, int plane, AVBufferRef **buf)
-+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
- {
--    int ret;
-+    struct v4l2_exportbuffer expbuf;
-+    int i, ret;
- 
--    if (plane >= in->num_planes)
--        return AVERROR(EINVAL);
-+    for (i = 0; i < avbuf->num_planes; i++) {
-+        memset(&expbuf, 0, sizeof(expbuf));
- 
--    /* even though most encoders return 0 in data_offset encoding vp8 does require this value */
--    *buf = av_buffer_create((char *)in->plane_info[plane].mm_addr + in->planes[plane].data_offset,
--                            in->plane_info[plane].length, v4l2_free_buffer, in, 0);
--    if (!*buf)
--        return AVERROR(ENOMEM);
-+        expbuf.index = avbuf->buf.index;
-+        expbuf.type = avbuf->buf.type;
-+        expbuf.plane = i;
- 
--    ret = v4l2_buf_increase_ref(in);
--    if (ret)
--        av_buffer_unref(buf);
-+        ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf);
-+        if (ret < 0)
-+            return AVERROR(errno);
- 
--    return ret;
-+        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length;
-+            avbuf->drm_frame.objects[i].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        } else {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[0].size = avbuf->buf.length;
-+            avbuf->drm_frame.objects[0].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        }
-+    }
-+
-+    return 0;
- }
- 
- static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
-@@ -285,30 +373,50 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i
- 
-     memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
- 
--    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
--        out->planes[plane].bytesused = bytesused;
--        out->planes[plane].length = length;
--    } else {
--        out->buf.bytesused = bytesused;
--        out->buf.length = length;
--    }
-+    set_buf_length(out, plane, bytesused, length);
- 
-     return 0;
- }
- 
-+static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
-+{
-+    AVBufferRef * bufref = av_buffer_ref(avbuf->context->bufrefs[avbuf->buf.index]);
-+    AVBufferRef * newbuf;
-+
-+    if (!bufref)
-+        return NULL;
-+
-+    newbuf = av_buffer_create((uint8_t *)bufref, sizeof(*bufref), v4l2_free_bufref, NULL, 0);
-+    if (newbuf == NULL)
-+        av_buffer_unref(&bufref);
-+
-+    avbuf->status = V4L2BUF_RET_USER;
-+    return newbuf;
-+}
-+
- static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
- {
--    int i, ret;
-+    int i;
- 
-     frame->format = avbuf->context->av_pix_fmt;
- 
--    for (i = 0; i < avbuf->num_planes; i++) {
--        ret = v4l2_buf_to_bufref(avbuf, i, &frame->buf[i]);
--        if (ret)
--            return ret;
-+    frame->buf[0] = wrap_avbuf(avbuf);
-+    if (frame->buf[0] == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    if (buf_to_m2mctx(avbuf)->output_drm) {
-+        /* 1. get references to the actual data */
-+        frame->data[0] = (uint8_t *) v4l2_get_drm_frame(avbuf);
-+        frame->format = AV_PIX_FMT_DRM_PRIME;
-+        frame->hw_frames_ctx = av_buffer_ref(avbuf->context->frames_ref);
-+        return 0;
-+    }
-+
- 
-+    /* 1. get references to the actual data */
-+    for (i = 0; i < avbuf->num_planes; i++) {
-+        frame->data[i] = (uint8_t *)avbuf->plane_info[i].mm_addr + avbuf->planes[i].data_offset;
-         frame->linesize[i] = avbuf->plane_info[i].bytesperline;
--        frame->data[i] = frame->buf[i]->data;
-     }
- 
-     /* fixup special cases */
-@@ -337,68 +445,95 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
-     return 0;
- }
- 
-+static void cpy_2d(uint8_t * dst, int dst_stride, const uint8_t * src, int src_stride, int w, int h)
-+{
-+    if (dst_stride == src_stride && w + 32 >= dst_stride) {
-+        memcpy(dst, src, dst_stride * h);
-+    }
-+    else {
-+        while (--h >= 0) {
-+            memcpy(dst, src, w);
-+            dst += dst_stride;
-+            src += src_stride;
-+        }
-+    }
-+}
-+
-+static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
-+{
-+    return i != 0  && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
-+}
-+
- static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- {
--    int i, ret;
--    struct v4l2_format fmt = out->context->format;
--    int pixel_format = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
--                       fmt.fmt.pix_mp.pixelformat : fmt.fmt.pix.pixelformat;
--    int height       = V4L2_TYPE_IS_MULTIPLANAR(fmt.type) ?
--                       fmt.fmt.pix_mp.height : fmt.fmt.pix.height;
--    int is_planar_format = 0;
--
--    switch (pixel_format) {
--    case V4L2_PIX_FMT_YUV420M:
--    case V4L2_PIX_FMT_YVU420M:
--#ifdef V4L2_PIX_FMT_YUV422M
--    case V4L2_PIX_FMT_YUV422M:
--#endif
--#ifdef V4L2_PIX_FMT_YVU422M
--    case V4L2_PIX_FMT_YVU422M:
--#endif
--#ifdef V4L2_PIX_FMT_YUV444M
--    case V4L2_PIX_FMT_YUV444M:
--#endif
--#ifdef V4L2_PIX_FMT_YVU444M
--    case V4L2_PIX_FMT_YVU444M:
--#endif
--    case V4L2_PIX_FMT_NV12M:
--    case V4L2_PIX_FMT_NV21M:
--    case V4L2_PIX_FMT_NV12MT_16X16:
--    case V4L2_PIX_FMT_NV12MT:
--    case V4L2_PIX_FMT_NV16M:
--    case V4L2_PIX_FMT_NV61M:
--        is_planar_format = 1;
--    }
--
--    if (!is_planar_format) {
--        const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
--        int planes_nb = 0;
--        int offset = 0;
--
--        for (i = 0; i < desc->nb_components; i++)
--            planes_nb = FFMAX(planes_nb, desc->comp[i].plane + 1);
--
--        for (i = 0; i < planes_nb; i++) {
--            int size, h = height;
--            if (i == 1 || i == 2) {
-+    int i;
-+    int num_planes = 0;
-+    int pel_strides[4] = {0};
-+
-+    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
-+
-+    if ((desc->flags & AV_PIX_FMT_FLAG_HWACCEL) != 0) {
-+        av_log(NULL, AV_LOG_ERROR, "%s: HWACCEL cannot be copied\n", __func__);
-+        return -1;
-+    }
-+
-+    for (i = 0; i != desc->nb_components; ++i) {
-+        if (desc->comp[i].plane >= num_planes)
-+            num_planes = desc->comp[i].plane + 1;
-+        pel_strides[desc->comp[i].plane] = desc->comp[i].step;
-+    }
-+
-+    if (out->num_planes > 1) {
-+        if (num_planes != out->num_planes) {
-+            av_log(NULL, AV_LOG_ERROR, "%s: Num planes mismatch: %d != %d\n", __func__, num_planes, out->num_planes);
-+            return -1;
-+        }
-+        for (i = 0; i != num_planes; ++i) {
-+            int w = frame->width;
-+            int h = frame->height;
-+            if (is_chroma(desc, i, num_planes)) {
-+                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
-                 h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
-             }
--            size = frame->linesize[i] * h;
--            ret = v4l2_bufref_to_buf(out, 0, frame->data[i], size, offset);
--            if (ret)
--                return ret;
--            offset += size;
-+
-+            cpy_2d(out->plane_info[i].mm_addr, out->plane_info[i].bytesperline,
-+                   frame->data[i], frame->linesize[i],
-+                   w * pel_strides[i], h);
-+            set_buf_length(out, i, out->plane_info[i].bytesperline * h, out->plane_info[i].length);
-         }
--        return 0;
-     }
-+    else
-+    {
-+        unsigned int offset = 0;
-+
-+        for (i = 0; i != num_planes; ++i) {
-+            int w = frame->width;
-+            int h = frame->height;
-+            int dst_stride = out->plane_info[0].bytesperline;
-+            uint8_t * const dst = (uint8_t *)out->plane_info[0].mm_addr + offset;
-+
-+            if (is_chroma(desc, i, num_planes)) {
-+                // Is chroma
-+                dst_stride >>= desc->log2_chroma_w;
-+                offset += dst_stride * (out->context->height >> desc->log2_chroma_h);
-+                w = AV_CEIL_RSHIFT(w, desc->log2_chroma_w);
-+                h = AV_CEIL_RSHIFT(h, desc->log2_chroma_h);
-+            }
-+            else {
-+                // Is luma or alpha
-+                offset += dst_stride * out->context->height;
-+            }
-+            if (offset > out->plane_info[0].length) {
-+                av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length);
-+                return -1;
-+            }
- 
--    for (i = 0; i < out->num_planes; i++) {
--        ret = v4l2_bufref_to_buf(out, i, frame->buf[i]->data, frame->buf[i]->size, 0);
--        if (ret)
--            return ret;
-+            cpy_2d(dst, dst_stride,
-+                   frame->data[i], frame->linesize[i],
-+                   w * pel_strides[i], h);
-+        }
-+        set_buf_length(out, 0, offset, out->plane_info[0].length);
-     }
--
-     return 0;
- }
- 
-@@ -410,14 +545,15 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- 
- int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- {
--    v4l2_set_pts(out, frame->pts);
-+    v4l2_set_pts(out, frame->pts, 0);
- 
-     return v4l2_buffer_swframe_to_buf(frame, out);
- }
- 
--int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
-+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts)
- {
-     int ret;
-+    V4L2Context * const ctx = avbuf->context;
- 
-     av_frame_unref(frame);
- 
-@@ -432,13 +568,22 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
-     frame->colorspace = v4l2_get_color_space(avbuf);
-     frame->color_range = v4l2_get_color_range(avbuf);
-     frame->color_trc = v4l2_get_color_trc(avbuf);
--    frame->pts = v4l2_get_pts(avbuf);
-+    frame->pts = v4l2_get_pts(avbuf, no_rescale_pts);
-     frame->pkt_dts = AV_NOPTS_VALUE;
- 
-     /* these values are updated also during re-init in v4l2_process_driver_event */
--    frame->height = avbuf->context->height;
--    frame->width = avbuf->context->width;
--    frame->sample_aspect_ratio = avbuf->context->sample_aspect_ratio;
-+    frame->height = ctx->height;
-+    frame->width = ctx->width;
-+    frame->sample_aspect_ratio = ctx->sample_aspect_ratio;
-+
-+    if (ctx->selection.height && ctx->selection.width) {
-+        frame->crop_left = ctx->selection.left < frame->width ? ctx->selection.left : 0;
-+        frame->crop_top  = ctx->selection.top < frame->height ? ctx->selection.top  : 0;
-+        frame->crop_right = ctx->selection.left + ctx->selection.width < frame->width ?
-+            frame->width - (ctx->selection.left + ctx->selection.width) : 0;
-+        frame->crop_bottom = ctx->selection.top + ctx->selection.height < frame->height ?
-+            frame->height - (ctx->selection.top + ctx->selection.height) : 0;
-+    }
- 
-     /* 3. report errors upstream */
-     if (avbuf->buf.flags & V4L2_BUF_FLAG_ERROR) {
-@@ -451,15 +596,14 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
- 
- int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
- {
--    int ret;
--
-     av_packet_unref(pkt);
--    ret = v4l2_buf_to_bufref(avbuf, 0, &pkt->buf);
--    if (ret)
--        return ret;
-+
-+    pkt->buf = wrap_avbuf(avbuf);
-+    if (pkt->buf == NULL)
-+        return AVERROR(ENOMEM);
- 
-     pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
--    pkt->data = pkt->buf->data;
-+    pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
- 
-     if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
-         pkt->flags |= AV_PKT_FLAG_KEY;
-@@ -469,20 +613,27 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
-         pkt->flags |= AV_PKT_FLAG_CORRUPT;
-     }
- 
--    pkt->dts = pkt->pts = v4l2_get_pts(avbuf);
-+    pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0);
- 
-     return 0;
- }
- 
--int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-+                                    const void *extdata, size_t extlen, int no_rescale_pts)
- {
-     int ret;
- 
--    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, 0);
-+    if (extlen) {
-+        ret = v4l2_bufref_to_buf(out, 0, extdata, extlen, 0);
-+        if (ret)
-+            return ret;
-+    }
-+
-+    ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
-     if (ret)
-         return ret;
- 
--    v4l2_set_pts(out, pkt->pts);
-+    v4l2_set_pts(out, pkt->pts, no_rescale_pts);
- 
-     if (pkt->flags & AV_PKT_FLAG_KEY)
-         out->flags = V4L2_BUF_FLAG_KEYFRAME;
-@@ -490,15 +641,61 @@ int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
-     return 0;
- }
- 
--int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
-+int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
-+{
-+    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
-+}
-+
-+
-+static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
-+{
-+    V4L2Buffer * const avbuf = (V4L2Buffer *)data;
-+    int i;
-+
-+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->plane_info); ++i) {
-+        struct V4L2Plane_info *p = avbuf->plane_info + i;
-+        if (p->mm_addr != NULL)
-+            munmap(p->mm_addr, p->length);
-+    }
-+
-+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
-+        if (avbuf->drm_frame.objects[i].fd != -1)
-+            close(avbuf->drm_frame.objects[i].fd);
-+    }
-+
-+    ff_weak_link_unref(&avbuf->context_wl);
-+
-+    av_free(avbuf);
-+}
-+
-+
-+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx)
- {
--    V4L2Context *ctx = avbuf->context;
-     int ret, i;
-+    V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
-+    AVBufferRef * bufref;
-+
-+    *pbufref = NULL;
-+    if (avbuf == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    bufref = av_buffer_create((uint8_t*)avbuf, sizeof(*avbuf), v4l2_buffer_buffer_free, NULL, 0);
-+    if (bufref == NULL) {
-+        av_free(avbuf);
-+        return AVERROR(ENOMEM);
-+    }
- 
-+    avbuf->context = ctx;
-     avbuf->buf.memory = V4L2_MEMORY_MMAP;
-     avbuf->buf.type = ctx->type;
-     avbuf->buf.index = index;
- 
-+    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
-+        avbuf->drm_frame.objects[i].fd = -1;
-+    }
-+
-+    avbuf->context_wl = ff_weak_link_ref(ctx->wl_master);
-+
-     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-         avbuf->buf.length = VIDEO_MAX_PLANES;
-         avbuf->buf.m.planes = avbuf->planes;
-@@ -506,7 +703,7 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
- 
-     ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
-     if (ret < 0)
--        return AVERROR(errno);
-+        goto fail;
- 
-     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-         avbuf->num_planes = 0;
-@@ -526,25 +723,33 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
- 
-         if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-             avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
--            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
--                                           PROT_READ | PROT_WRITE, MAP_SHARED,
--                                           buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
-+
-+            if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
-+                !buf_to_m2mctx(avbuf)->output_drm) {
-+                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
-+                                               PROT_READ | PROT_WRITE, MAP_SHARED,
-+                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
-+            }
-         } else {
-             avbuf->plane_info[i].length = avbuf->buf.length;
--            avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
--                                          PROT_READ | PROT_WRITE, MAP_SHARED,
--                                          buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
-+
-+            if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
-+                !buf_to_m2mctx(avbuf)->output_drm) {
-+                avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
-+                                               PROT_READ | PROT_WRITE, MAP_SHARED,
-+                                               buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
-+            }
-         }
- 
--        if (avbuf->plane_info[i].mm_addr == MAP_FAILED)
--            return AVERROR(ENOMEM);
-+        if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
-+            avbuf->plane_info[i].mm_addr = NULL;
-+            ret = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-     }
- 
-     avbuf->status = V4L2BUF_AVAILABLE;
- 
--    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
--        return 0;
--
-     if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-         avbuf->buf.m.planes = avbuf->planes;
-         avbuf->buf.length   = avbuf->num_planes;
-@@ -554,7 +759,20 @@ int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index)
-         avbuf->buf.length    = avbuf->planes[0].length;
-     }
- 
--    return ff_v4l2_buffer_enqueue(avbuf);
-+    if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+        if (buf_to_m2mctx(avbuf)->output_drm) {
-+            ret = v4l2_buffer_export_drm(avbuf);
-+            if (ret)
-+                    goto fail;
-+        }
-+    }
-+
-+    *pbufref = bufref;
-+    return 0;
-+
-+fail:
-+    av_buffer_unref(&bufref);
-+    return ret;
- }
- 
- int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
-@@ -563,9 +781,27 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
- 
-     avbuf->buf.flags = avbuf->flags;
- 
-+    if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
-+        av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
-+               avbuf->context->name, avbuf->buf.index,
-+               avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
-+               avbuf->context->q_count);
-+    }
-+
-     ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QBUF, &avbuf->buf);
--    if (ret < 0)
--        return AVERROR(errno);
-+    if (ret < 0) {
-+        int err = errno;
-+        av_log(logger(avbuf), AV_LOG_ERROR, "--- %s VIDIOC_QBUF: index %d FAIL err %d (%s)\n",
-+               avbuf->context->name, avbuf->buf.index,
-+               err, strerror(err));
-+        return AVERROR(err);
-+    }
-+
-+    ++avbuf->context->q_count;
-+    av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
-+           avbuf->context->name, avbuf->buf.index,
-+           avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
-+           avbuf->context->q_count);
- 
-     avbuf->status = V4L2BUF_IN_DRIVER;
- 
-diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
-index 3d2ff1b9a5..111526aee3 100644
---- a/libavcodec/v4l2_buffers.h
-+++ b/libavcodec/v4l2_buffers.h
-@@ -28,27 +28,37 @@
- #include <stddef.h>
- #include <linux/videodev2.h>
- 
-+#include "avcodec.h"
- #include "libavutil/buffer.h"
- #include "libavutil/frame.h"
-+#include "libavutil/hwcontext_drm.h"
- #include "packet.h"
- 
- enum V4L2Buffer_status {
-     V4L2BUF_AVAILABLE,
-     V4L2BUF_IN_DRIVER,
-+    V4L2BUF_IN_USE,
-     V4L2BUF_RET_USER,
- };
- 
- /**
-  * V4L2Buffer (wrapper for v4l2_buffer management)
-  */
-+struct V4L2Context;
-+struct ff_weak_link_client;
-+
- typedef struct V4L2Buffer {
--    /* each buffer needs to have a reference to its context */
-+    /* each buffer needs to have a reference to its context
-+     * The pointer is good enough for most operation but once the buffer has
-+     * been passed to the user the buffer may become orphaned so for free ops
-+     * the weak link must be used to ensure that the context is actually
-+     * there
-+     */
-     struct V4L2Context *context;
-+    struct ff_weak_link_client *context_wl;
- 
--    /* This object is refcounted per-plane, so we need to keep track
--     * of how many context-refs we are holding. */
--    AVBufferRef *context_ref;
--    atomic_uint context_refcount;
-+    /* DRM descriptor */
-+    AVDRMFrameDescriptor drm_frame;
- 
-     /* keep track of the mmap address and mmap length */
-     struct V4L2Plane_info {
-@@ -73,11 +83,12 @@ typedef struct V4L2Buffer {
-  *
-  * @param[in] frame The AVFRame to push the information to
-  * @param[in] buf The V4L2Buffer to get the information from
-+ * @param[in] no_rescale_pts If non-zero do not rescale PTS
-  *
-  * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect,
-  * AVERROR(ENOMEM) if the AVBufferRef can't be created.
-  */
--int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf);
-+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts);
- 
- /**
-  * Extracts the data from a V4L2Buffer to an AVPacket
-@@ -101,6 +112,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
-  */
- int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
- 
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-+                                    const void *extdata, size_t extlen, int no_rescale_pts);
-+
- /**
-  * Extracts the data from an AVFrame to a V4L2Buffer
-  *
-@@ -119,7 +133,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
-  *
-  * @returns 0 in case of success, a negative AVERROR code otherwise
-  */
--int ff_v4l2_buffer_initialize(V4L2Buffer* avbuf, int index);
-+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx);
- 
- /**
-  * Enqueues a V4L2Buffer
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index a40be94690..be76068af3 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -27,11 +27,13 @@
- #include <unistd.h>
- #include <fcntl.h>
- #include <poll.h>
-+#include "libavutil/avassert.h"
- #include "libavcodec/avcodec.h"
- #include "decode.h"
- #include "v4l2_buffers.h"
- #include "v4l2_fmt.h"
- #include "v4l2_m2m.h"
-+#include "weak_link.h"
- 
- struct v4l2_format_update {
-     uint32_t v4l2_fmt;
-@@ -153,21 +155,99 @@ static inline void v4l2_save_to_context(V4L2Context* ctx, struct v4l2_format_upd
-     }
- }
- 
--static int v4l2_start_decode(V4L2Context *ctx)
-+static int get_default_selection(V4L2Context * const ctx, struct v4l2_rect *r)
- {
--    struct v4l2_decoder_cmd cmd = {
--        .cmd = V4L2_DEC_CMD_START,
--        .flags = 0,
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+    struct v4l2_selection selection = {
-+        .type = V4L2_BUF_TYPE_VIDEO_CAPTURE,
-+        .target = V4L2_SEL_TGT_COMPOSE
-     };
--    int ret;
- 
--    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DECODER_CMD, &cmd);
--    if (ret)
-+    memset(r, 0, sizeof(*r));
-+    if (ioctl(s->fd, VIDIOC_G_SELECTION, &selection))
-         return AVERROR(errno);
- 
-+    *r = selection.r;
-     return 0;
- }
- 
-+static int do_source_change(V4L2m2mContext * const s)
-+{
-+    AVCodecContext *const avctx = s->avctx;
-+
-+    int ret;
-+    int reinit;
-+    int full_reinit;
-+    struct v4l2_format cap_fmt = s->capture.format;
-+
-+    s->resize_pending = 0;
-+    s->capture.done = 0;
-+
-+    ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
-+    if (ret) {
-+        av_log(avctx, AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", s->capture.name);
-+        return 0;
-+    }
-+
-+    s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
-+
-+    get_default_selection(&s->capture, &s->capture.selection);
-+
-+    reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
-+    if (reinit) {
-+        s->capture.height = v4l2_get_height(&cap_fmt);
-+        s->capture.width = v4l2_get_width(&cap_fmt);
-+    }
-+    s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n",
-+           s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
-+           s->capture.selection.width, s->capture.selection.height,
-+           s->capture.selection.left, s->capture.selection.top);
-+
-+    s->reinit = 1;
-+
-+    if (reinit) {
-+        if (avctx)
-+            ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
-+        if (ret < 0)
-+            av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
-+
-+        ret = ff_v4l2_m2m_codec_reinit(s);
-+        if (ret) {
-+            av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
-+            return AVERROR(EINVAL);
-+        }
-+        goto reinit_run;
-+    }
-+
-+    /* Buffers are OK so just stream off to ack */
-+    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__);
-+
-+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
-+    if (ret)
-+        av_log(avctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF failed\n");
-+    s->draining = 0;
-+
-+    /* reinit executed */
-+reinit_run:
-+    ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMON);
-+    return 1;
-+}
-+
-+static int ctx_done(V4L2Context * const ctx)
-+{
-+    int rv = 0;
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+
-+    ctx->done = 1;
-+
-+    if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type))
-+        rv = do_source_change(s);
-+
-+    return rv;
-+}
-+
- /**
-  * handle resolution change event and end of stream event
-  * returns 1 if reinit was successful, negative if it failed
-@@ -175,8 +255,7 @@ static int v4l2_start_decode(V4L2Context *ctx)
-  */
- static int v4l2_handle_event(V4L2Context *ctx)
- {
--    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
--    struct v4l2_format cap_fmt = s->capture.format;
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-     struct v4l2_event evt = { 0 };
-     int ret;
- 
-@@ -186,44 +265,22 @@ static int v4l2_handle_event(V4L2Context *ctx)
-         return 0;
-     }
- 
-+    av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type);
-+
-     if (evt.type == V4L2_EVENT_EOS) {
--        ctx->done = 1;
-+//        ctx->done = 1;
-+        av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name);
-         return 0;
-     }
- 
-     if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
-         return 0;
- 
--    ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
--    if (ret) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT\n", s->capture.name);
--        return 0;
--    }
--
--    if (v4l2_resolution_changed(&s->capture, &cap_fmt)) {
--        s->capture.height = v4l2_get_height(&cap_fmt);
--        s->capture.width = v4l2_get_width(&cap_fmt);
--        s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
--    } else {
--        v4l2_start_decode(ctx);
-+    s->resize_pending = 1;
-+    if (!ctx->done)
-         return 0;
--    }
--
--    s->reinit = 1;
--
--    if (s->avctx)
--        ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
--    if (ret < 0)
--        av_log(logger(ctx), AV_LOG_WARNING, "update avcodec height and width\n");
--
--    ret = ff_v4l2_m2m_codec_reinit(s);
--    if (ret) {
--        av_log(logger(ctx), AV_LOG_ERROR, "v4l2_m2m_codec_reinit\n");
--        return AVERROR(EINVAL);
--    }
- 
--    /* reinit executed */
--    return 1;
-+    return do_source_change(s);
- }
- 
- static int v4l2_stop_decode(V4L2Context *ctx)
-@@ -266,8 +323,26 @@ static int v4l2_stop_encode(V4L2Context *ctx)
-     return 0;
- }
- 
-+static int count_in_driver(const V4L2Context * const ctx)
-+{
-+    int i;
-+    int n = 0;
-+
-+    if (!ctx->bufrefs)
-+        return -1;
-+
-+    for (i = 0; i < ctx->num_buffers; ++i) {
-+        V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (avbuf->status == V4L2BUF_IN_DRIVER)
-+            ++n;
-+    }
-+    return n;
-+}
-+
- static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
- {
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+    const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type);
-     struct v4l2_plane planes[VIDEO_MAX_PLANES];
-     struct v4l2_buffer buf = { 0 };
-     V4L2Buffer *avbuf;
-@@ -276,50 +351,84 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
-         .fd = ctx_to_m2mctx(ctx)->fd,
-     };
-     int i, ret;
-+    int no_rx_means_done = 0;
- 
--    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx->buffers) {
-+    if (is_capture && ctx->bufrefs) {
-         for (i = 0; i < ctx->num_buffers; i++) {
--            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
-+            avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+            if (avbuf->status == V4L2BUF_IN_DRIVER)
-                 break;
-         }
-         if (i == ctx->num_buffers)
--            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers returned to "
-+            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to "
-                                                 "userspace. Increase num_capture_buffers "
-                                                 "to prevent device deadlock or dropped "
--                                                "packets/frames.\n");
-+                                                "packets/frames.\n", i);
-     }
- 
-+#if 0
-+    // I think this is true but pointless
-+    // we will get some other form of EOF signal
-+
-     /* if we are draining and there are no more capture buffers queued in the driver we are done */
--    if (!V4L2_TYPE_IS_OUTPUT(ctx->type) && ctx_to_m2mctx(ctx)->draining) {
-+    if (is_capture && ctx_to_m2mctx(ctx)->draining) {
-         for (i = 0; i < ctx->num_buffers; i++) {
-             /* capture buffer initialization happens during decode hence
-              * detection happens at runtime
-              */
--            if (!ctx->buffers)
-+            if (!ctx->bufrefs)
-                 break;
- 
--            if (ctx->buffers[i].status == V4L2BUF_IN_DRIVER)
-+            avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+            if (avbuf->status == V4L2BUF_IN_DRIVER)
-                 goto start;
-         }
-         ctx->done = 1;
-         return NULL;
-     }
-+#endif
- 
- start:
--    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
--        pfd.events =  POLLOUT | POLLWRNORM;
--    else {
-+    if (is_capture) {
-         /* no need to listen to requests for more input while draining */
-         if (ctx_to_m2mctx(ctx)->draining)
-             pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
-+    } else {
-+        pfd.events =  POLLOUT | POLLWRNORM;
-     }
-+    no_rx_means_done = s->resize_pending && is_capture;
- 
-     for (;;) {
--        ret = poll(&pfd, 1, timeout);
-+        // If we have a resize pending then all buffers should be Qed
-+        // With a resize pending we should be in drain but evidence suggests
-+        // that not all decoders do this so poll to clear
-+        int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout;
-+        const int e = pfd.events;
-+
-+        ret = poll(&pfd, 1, t2);
-+
-         if (ret > 0)
-             break;
--        if (errno == EINTR)
--            continue;
-+
-+        if (ret < 0) {
-+            int err = errno;
-+            if (err == EINTR)
-+                continue;
-+            av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n",
-+                   err, strerror(err),
-+                   e, count_in_driver(ctx));
-+            return NULL;
-+        }
-+
-+        // ret == 0 (timeout)
-+        if (no_rx_means_done) {
-+            av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n");
-+            ret = ctx_done(ctx);
-+            if (ret > 0)
-+                goto start;
-+        }
-+        if (timeout == -1)
-+            av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));;
-         return NULL;
-     }
- 
-@@ -329,7 +438,8 @@ start:
-            no need to raise a warning */
-         if (timeout == 0) {
-             for (i = 0; i < ctx->num_buffers; i++) {
--                if (ctx->buffers[i].status != V4L2BUF_AVAILABLE)
-+                avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+                if (avbuf->status != V4L2BUF_AVAILABLE)
-                     av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
-             }
-         }
-@@ -347,22 +457,25 @@ start:
-             ctx->done = 1;
-             return NULL;
-         }
--        if (ret) {
--            /* if re-init was successful drop the buffer (if there was one)
--             * since we had to reconfigure capture (unmap all buffers)
--             */
--            return NULL;
--        }
-+        if (ret > 0)
-+            goto start;
-     }
- 
-     /* 2. dequeue the buffer */
-     if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
- 
--        if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+        if (is_capture) {
-             /* there is a capture buffer ready */
-             if (pfd.revents & (POLLIN | POLLRDNORM))
-                 goto dequeue;
- 
-+            // CAPTURE Q drained
-+            if (no_rx_means_done) {
-+                if (ctx_done(ctx) > 0)
-+                    goto start;
-+                return NULL;
-+            }
-+
-             /* the driver is ready to accept more input; instead of waiting for the capture
-              * buffer to complete we return NULL so input can proceed (we are single threaded)
-              */
-@@ -380,37 +493,58 @@ dequeue:
-             buf.m.planes = planes;
-         }
- 
--        ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf);
--        if (ret) {
--            if (errno != EAGAIN) {
--                ctx->done = 1;
--                if (errno != EPIPE)
-+        while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) {
-+            const int err = errno;
-+            if (err == EINTR)
-+                continue;
-+            if (err != EAGAIN) {
-+                // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST
-+                if (err != EPIPE || !is_capture)
-                     av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
--                        ctx->name, av_err2str(AVERROR(errno)));
-+                        ctx->name, av_err2str(AVERROR(err)));
-+                if (ctx_done(ctx) > 0)
-+                    goto start;
-             }
-             return NULL;
-         }
-+        --ctx->q_count;
-+        av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n",
-+               ctx->name, buf.index,
-+               buf.timestamp.tv_sec, buf.timestamp.tv_usec,
-+               ctx->q_count, ++ctx->dq_count);
- 
--        if (ctx_to_m2mctx(ctx)->draining && !V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-+        avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
-+        avbuf->status = V4L2BUF_AVAILABLE;
-+        avbuf->buf = buf;
-+        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-+            memcpy(avbuf->planes, planes, sizeof(planes));
-+            avbuf->buf.m.planes = avbuf->planes;
-+        }
-+
-+        if (ctx_to_m2mctx(ctx)->draining && is_capture) {
-             int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
-                             buf.m.planes[0].bytesused : buf.bytesused;
-             if (bytesused == 0) {
--                ctx->done = 1;
-+                av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n");
-+
-+                // Must reQ so we don't leak
-+                // May not matter if the next thing we do is release all the
-+                // buffers but better to be tidy.
-+                ff_v4l2_buffer_enqueue(avbuf);
-+
-+                if (ctx_done(ctx) > 0)
-+                    goto start;
-                 return NULL;
-             }
- #ifdef V4L2_BUF_FLAG_LAST
--            if (buf.flags & V4L2_BUF_FLAG_LAST)
--                ctx->done = 1;
-+            if (buf.flags & V4L2_BUF_FLAG_LAST) {
-+                av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n");
-+                avbuf->status = V4L2BUF_IN_USE;  // Avoid flushing this buffer
-+                ctx_done(ctx);
-+            }
- #endif
-         }
- 
--        avbuf = &ctx->buffers[buf.index];
--        avbuf->status = V4L2BUF_AVAILABLE;
--        avbuf->buf = buf;
--        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
--            memcpy(avbuf->planes, planes, sizeof(planes));
--            avbuf->buf.m.planes = avbuf->planes;
--        }
-         return avbuf;
-     }
- 
-@@ -429,8 +563,9 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
-     }
- 
-     for (i = 0; i < ctx->num_buffers; i++) {
--        if (ctx->buffers[i].status == V4L2BUF_AVAILABLE)
--            return &ctx->buffers[i];
-+        V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (avbuf->status == V4L2BUF_AVAILABLE)
-+            return avbuf;
-     }
- 
-     return NULL;
-@@ -438,25 +573,45 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
- 
- static int v4l2_release_buffers(V4L2Context* ctx)
- {
--    struct v4l2_requestbuffers req = {
--        .memory = V4L2_MEMORY_MMAP,
--        .type = ctx->type,
--        .count = 0, /* 0 -> unmaps buffers from the driver */
--    };
--    int i, j;
-+    int i;
-+    int ret = 0;
-+    const int fd = ctx_to_m2mctx(ctx)->fd;
- 
--    for (i = 0; i < ctx->num_buffers; i++) {
--        V4L2Buffer *buffer = &ctx->buffers[i];
-+    // Orphan any buffers in the wild
-+    ff_weak_link_break(&ctx->wl_master);
-+
-+    if (ctx->bufrefs) {
-+        for (i = 0; i < ctx->num_buffers; i++)
-+            av_buffer_unref(ctx->bufrefs + i);
-+    }
-+
-+    if (fd != -1) {
-+        struct v4l2_requestbuffers req = {
-+            .memory = V4L2_MEMORY_MMAP,
-+            .type = ctx->type,
-+            .count = 0, /* 0 -> unmap all buffers from the driver */
-+        };
-+
-+        while ((ret = ioctl(fd, VIDIOC_REQBUFS, &req)) == -1) {
-+            if (errno == EINTR)
-+                continue;
-+
-+            ret = AVERROR(errno);
- 
--        for (j = 0; j < buffer->num_planes; j++) {
--            struct V4L2Plane_info *p = &buffer->plane_info[j];
--            if (p->mm_addr && p->length)
--                if (munmap(p->mm_addr, p->length) < 0)
--                    av_log(logger(ctx), AV_LOG_ERROR, "%s unmap plane (%s))\n", ctx->name, av_err2str(AVERROR(errno)));
-+            av_log(logger(ctx), AV_LOG_ERROR, "release all %s buffers (%s)\n",
-+                ctx->name, av_err2str(AVERROR(errno)));
-+
-+            if (ctx_to_m2mctx(ctx)->output_drm)
-+                av_log(logger(ctx), AV_LOG_ERROR,
-+                    "Make sure the DRM client releases all FB/GEM objects before closing the codec (ie):\n"
-+                    "for all buffers: \n"
-+                    "  1. drmModeRmFB(..)\n"
-+                    "  2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
-         }
-     }
-+    ctx->q_count = 0;
- 
--    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_REQBUFS, &req);
-+    return ret;
- }
- 
- static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfmt)
-@@ -485,6 +640,8 @@ static inline int v4l2_try_raw_format(V4L2Context* ctx, enum AVPixelFormat pixfm
- 
- static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
- {
-+    V4L2m2mContext* s = ctx_to_m2mctx(ctx);
-+    V4L2m2mPriv *priv = s->avctx->priv_data;
-     enum AVPixelFormat pixfmt = ctx->av_pix_fmt;
-     struct v4l2_fmtdesc fdesc;
-     int ret;
-@@ -503,6 +660,13 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
-         if (ret)
-             return AVERROR(EINVAL);
- 
-+        if (priv->pix_fmt != AV_PIX_FMT_NONE) {
-+            if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
-+                fdesc.index++;
-+                continue;
-+            }
-+        }
-+
-         pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
-         ret = v4l2_try_raw_format(ctx, pixfmt);
-         if (ret){
-@@ -555,18 +719,73 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
-   *
-   *****************************************************************************/
- 
-+
-+static void flush_all_buffers_status(V4L2Context* const ctx)
-+{
-+    int i;
-+    for (i = 0; i < ctx->num_buffers; ++i) {
-+        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (buf->status == V4L2BUF_IN_DRIVER)
-+            buf->status = V4L2BUF_AVAILABLE;
-+    }
-+    ctx->q_count = 0;
-+}
-+
-+static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
-+{
-+    int i;
-+    int rv;
-+
-+    if (!ctx->bufrefs) {
-+        rv = ff_v4l2_context_init(ctx);
-+        if (rv) {
-+            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
-+            return rv;
-+        }
-+    }
-+
-+    for (i = 0; i < ctx->num_buffers; ++i) {
-+        struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
-+        if (buf->status == V4L2BUF_AVAILABLE) {
-+            rv = ff_v4l2_buffer_enqueue(buf);
-+            if (rv < 0)
-+                return rv;
-+        }
-+    }
-+    return 0;
-+}
-+
- int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
- {
-     int type = ctx->type;
-     int ret;
-+    AVCodecContext * const avctx = logger(ctx);
-+
-+    ff_mutex_lock(&ctx->lock);
-+
-+    if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
-+        stuff_all_buffers(avctx, ctx);
- 
-     ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
--    if (ret < 0)
--        return AVERROR(errno);
-+    if (ret < 0) {
-+        const int err = errno;
-+        av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
-+               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
-+        ret = AVERROR(err);
-+    }
-+    else
-+    {
-+        if (cmd == VIDIOC_STREAMOFF)
-+            flush_all_buffers_status(ctx);
- 
--    ctx->streamon = (cmd == VIDIOC_STREAMON);
-+        ctx->streamon = (cmd == VIDIOC_STREAMON);
-+        av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
-+               cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
-+    }
- 
--    return 0;
-+    ff_mutex_unlock(&ctx->lock);
-+
-+    return ret;
- }
- 
- int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
-@@ -594,7 +813,8 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
-     return ff_v4l2_buffer_enqueue(avbuf);
- }
- 
--int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
-+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-+                                   const void * extdata, size_t extlen, int no_rescale_pts)
- {
-     V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-     V4L2Buffer* avbuf;
-@@ -602,8 +822,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
- 
-     if (!pkt->size) {
-         ret = v4l2_stop_decode(ctx);
-+        // Log but otherwise ignore stop failure
-         if (ret)
--            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode\n", ctx->name);
-+            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
-         s->draining = 1;
-         return 0;
-     }
-@@ -612,14 +833,14 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt)
-     if (!avbuf)
-         return AVERROR(EAGAIN);
- 
--    ret = ff_v4l2_buffer_avpkt_to_buf(pkt, avbuf);
-+    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts);
-     if (ret)
-         return ret;
- 
-     return ff_v4l2_buffer_enqueue(avbuf);
- }
- 
--int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
-+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts)
- {
-     V4L2Buffer *avbuf;
- 
-@@ -636,7 +857,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
-         return AVERROR(EAGAIN);
-     }
- 
--    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
-+    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts);
- }
- 
- int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
-@@ -695,54 +916,57 @@ void ff_v4l2_context_release(V4L2Context* ctx)
- {
-     int ret;
- 
--    if (!ctx->buffers)
-+    if (!ctx->bufrefs)
-         return;
- 
-     ret = v4l2_release_buffers(ctx);
-     if (ret)
-         av_log(logger(ctx), AV_LOG_WARNING, "V4L2 failed to unmap the %s buffers\n", ctx->name);
- 
--    av_freep(&ctx->buffers);
-+    av_freep(&ctx->bufrefs);
-+    av_buffer_unref(&ctx->frames_ref);
-+
-+    ff_mutex_destroy(&ctx->lock);
- }
- 
--int ff_v4l2_context_init(V4L2Context* ctx)
-+
-+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers)
- {
--    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-     struct v4l2_requestbuffers req;
--    int ret, i;
--
--    if (!v4l2_type_supported(ctx)) {
--        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
--        return AVERROR_PATCHWELCOME;
--    }
--
--    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
--    if (ret)
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed\n", ctx->name);
-+    int ret;
-+    int i;
- 
-     memset(&req, 0, sizeof(req));
--    req.count = ctx->num_buffers;
-+    req.count = req_buffers;
-     req.memory = V4L2_MEMORY_MMAP;
-     req.type = ctx->type;
--    ret = ioctl(s->fd, VIDIOC_REQBUFS, &req);
--    if (ret < 0) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, strerror(errno));
--        return AVERROR(errno);
-+    while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
-+        if (errno != EINTR) {
-+            ret = AVERROR(errno);
-+            av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_REQBUFS failed: %s\n", ctx->name, av_err2str(ret));
-+            return ret;
-+        }
-     }
- 
-     ctx->num_buffers = req.count;
--    ctx->buffers = av_mallocz(ctx->num_buffers * sizeof(V4L2Buffer));
--    if (!ctx->buffers) {
-+    ctx->bufrefs = av_mallocz(ctx->num_buffers * sizeof(*ctx->bufrefs));
-+    if (!ctx->bufrefs) {
-         av_log(logger(ctx), AV_LOG_ERROR, "%s malloc enomem\n", ctx->name);
--        return AVERROR(ENOMEM);
-+        goto fail_release;
-     }
- 
--    for (i = 0; i < req.count; i++) {
--        ctx->buffers[i].context = ctx;
--        ret = ff_v4l2_buffer_initialize(&ctx->buffers[i], i);
--        if (ret < 0) {
-+    ctx->wl_master = ff_weak_link_new(ctx);
-+    if (!ctx->wl_master) {
-+        ret = AVERROR(ENOMEM);
-+        goto fail_release;
-+    }
-+
-+    for (i = 0; i < ctx->num_buffers; i++) {
-+        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx);
-+        if (ret) {
-             av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
--            goto error;
-+            goto fail_release;
-         }
-     }
- 
-@@ -756,10 +980,62 @@ int ff_v4l2_context_init(V4L2Context* ctx)
- 
-     return 0;
- 
--error:
-+fail_release:
-     v4l2_release_buffers(ctx);
-+    av_freep(&ctx->bufrefs);
-+    return ret;
-+}
-+
-+int ff_v4l2_context_init(V4L2Context* ctx)
-+{
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-+    int ret;
-+
-+    // It is not valid to reinit a context without a previous release
-+    av_assert0(ctx->bufrefs == NULL);
-+
-+    if (!v4l2_type_supported(ctx)) {
-+        av_log(logger(ctx), AV_LOG_ERROR, "type %i not supported\n", ctx->type);
-+        return AVERROR_PATCHWELCOME;
-+    }
-+
-+    ff_mutex_init(&ctx->lock, NULL);
- 
--    av_freep(&ctx->buffers);
-+    if (s->output_drm) {
-+        AVHWFramesContext *hwframes;
-+
-+        ctx->frames_ref = av_hwframe_ctx_alloc(s->device_ref);
-+        if (!ctx->frames_ref) {
-+            ret = AVERROR(ENOMEM);
-+            goto fail_unlock;
-+        }
-+
-+        hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
-+        hwframes->format = AV_PIX_FMT_DRM_PRIME;
-+        hwframes->sw_format = ctx->av_pix_fmt;
-+        hwframes->width = ctx->width;
-+        hwframes->height = ctx->height;
-+        ret = av_hwframe_ctx_init(ctx->frames_ref);
-+        if (ret < 0)
-+            goto fail_unref_hwframes;
-+    }
-+
-+    ret = ioctl(s->fd, VIDIOC_G_FMT, &ctx->format);
-+    if (ret) {
-+        ret = AVERROR(errno);
-+        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_G_FMT failed: %s\n", ctx->name, av_err2str(ret));
-+        goto fail_unref_hwframes;
-+    }
-+
-+    ret = create_buffers(ctx, ctx->num_buffers);
-+    if (ret < 0)
-+        goto fail_unref_hwframes;
-+
-+    return 0;
- 
-+fail_unref_hwframes:
-+    av_buffer_unref(&ctx->frames_ref);
-+fail_unlock:
-+    ff_mutex_destroy(&ctx->lock);
-     return ret;
- }
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 6f7460c89a..59009d11d1 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -32,6 +32,8 @@
- #include "libavutil/rational.h"
- #include "codec_id.h"
- #include "packet.h"
-+#include "libavutil/buffer.h"
-+#include "libavutil/thread.h"
- #include "v4l2_buffers.h"
- 
- typedef struct V4L2Context {
-@@ -71,11 +73,12 @@ typedef struct V4L2Context {
-      */
-     int width, height;
-     AVRational sample_aspect_ratio;
-+    struct v4l2_rect selection;
- 
-     /**
--     * Indexed array of V4L2Buffers
-+     * Indexed array of pointers to V4L2Buffers
-      */
--    V4L2Buffer *buffers;
-+    AVBufferRef **bufrefs;
- 
-     /**
-      * Readonly after init.
-@@ -93,6 +96,12 @@ typedef struct V4L2Context {
-      */
-     int done;
- 
-+    AVBufferRef *frames_ref;
-+    int q_count;
-+    int dq_count;
-+    struct ff_weak_link_master *wl_master;
-+
-+    AVMutex lock;
- } V4L2Context;
- 
- /**
-@@ -157,9 +166,12 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
-  * @param[in] ctx The V4L2Context to dequeue from.
-  * @param[inout] f The AVFrame to dequeue to.
-  * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
-+ * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as
-+ *       timestamp directly)
-+ *
-  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
-  */
--int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
-+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts);
- 
- /**
-  * Enqueues a buffer to a V4L2Context from an AVPacket
-@@ -171,7 +183,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
-  * @param[in] pkt A pointer to an AVPacket.
-  * @return 0 in case of success, a negative error otherwise.
-  */
--int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt);
-+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts);
- 
- /**
-  * Enqueues a buffer to a V4L2Context from an AVFrame
-diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index 602efb7a16..516e6d9858 100644
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -216,13 +216,7 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
-         av_log(log_ctx, AV_LOG_ERROR, "capture VIDIOC_STREAMOFF\n");
- 
-     /* 2. unmap the capture buffers (v4l2 and ffmpeg):
--     *    we must wait for all references to be released before being allowed
--     *    to queue new buffers.
-      */
--    av_log(log_ctx, AV_LOG_DEBUG, "waiting for user to release AVBufferRefs\n");
--    if (atomic_load(&s->refcount))
--        while(sem_wait(&s->refsync) == -1 && errno == EINTR);
--
-     ff_v4l2_context_release(&s->capture);
- 
-     /* 3. get the new capture format */
-@@ -259,6 +253,8 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
-     av_frame_free(&s->frame);
-     av_packet_unref(&s->buf_pkt);
- 
-+    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
-+
-     av_free(s);
- }
- 
-@@ -270,6 +266,11 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
-     if (!s)
-         return 0;
- 
-+    av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
-+
-+    if (av_codec_is_decoder(s->avctx->codec))
-+        av_packet_unref(&s->buf_pkt);
-+
-     if (s->fd >= 0) {
-         ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMOFF);
-         if (ret)
-@@ -282,7 +283,14 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
- 
-     ff_v4l2_context_release(&s->output);
- 
-+    close(s->fd);
-+    s->fd = -1;
-+
-     s->self_ref = NULL;
-+    // This is only called on avctx close so after this point we don't have that
-+    // Crash sooner if we find we are using it (can still log with avctx = NULL)
-+    s->avctx = NULL;
-+    priv->context = NULL;
-     av_buffer_unref(&priv->context_ref);
- 
-     return 0;
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 04d86d7b92..24a9c94864 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -30,6 +30,7 @@
- #include <linux/videodev2.h>
- 
- #include "libavcodec/avcodec.h"
-+#include "libavutil/pixfmt.h"
- #include "v4l2_context.h"
- 
- #define container_of(ptr, type, member) ({ \
-@@ -40,6 +41,17 @@
-     { "num_output_buffers", "Number of buffers in the output context",\
-         OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 16 }, 2, INT_MAX, FLAGS }
- 
-+#define FF_V4L2_M2M_TRACK_SIZE 128
-+typedef struct V4L2m2mTrackEl {
-+    int     discard;   // If we see this buffer its been flushed, so discard
-+    int     pkt_size;
-+    int64_t pts;
-+    int64_t reordered_opaque;
-+    int64_t pkt_pos;
-+    int64_t pkt_duration;
-+    int64_t track_pts;
-+} V4L2m2mTrackEl;
-+
- typedef struct V4L2m2mContext {
-     char devname[PATH_MAX];
-     int fd;
-@@ -53,6 +65,7 @@ typedef struct V4L2m2mContext {
-     sem_t refsync;
-     atomic_uint refcount;
-     int reinit;
-+    int resize_pending;
- 
-     /* null frame/packet received */
-     int draining;
-@@ -66,6 +79,23 @@ typedef struct V4L2m2mContext {
- 
-     /* reference back to V4L2m2mPriv */
-     void *priv;
-+
-+    AVBufferRef *device_ref;
-+
-+    /* generate DRM frames */
-+    int output_drm;
-+
-+    /* Frame tracking */
-+    int64_t last_pkt_dts;
-+    int64_t last_opaque;
-+    unsigned int track_no;
-+    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
-+
-+    /* req pkt */
-+    int req_pkt;
-+
-+    /* Ext data sent */
-+    int extdata_sent;
- } V4L2m2mContext;
- 
- typedef struct V4L2m2mPriv {
-@@ -76,6 +106,7 @@ typedef struct V4L2m2mPriv {
- 
-     int num_output_buffers;
-     int num_capture_buffers;
-+    enum AVPixelFormat pix_fmt;
- } V4L2m2mPriv;
- 
- /**
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 4944d08511..7f6033ac2c 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -23,6 +23,10 @@
- 
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/hwcontext.h"
-+#include "libavutil/hwcontext_drm.h"
- #include "libavutil/pixfmt.h"
- #include "libavutil/pixdesc.h"
- #include "libavutil/opt.h"
-@@ -30,26 +34,51 @@
- #include "codec_internal.h"
- #include "libavcodec/decode.h"
- 
-+#include "libavcodec/hwaccels.h"
-+#include "libavcodec/internal.h"
-+#include "libavcodec/hwconfig.h"
-+
- #include "v4l2_context.h"
- #include "v4l2_m2m.h"
- #include "v4l2_fmt.h"
- 
-+static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
-+{
-+    int ret;
-+    struct v4l2_decoder_cmd cmd = {
-+        .cmd = V4L2_DEC_CMD_START,
-+        .flags = 0,
-+    };
-+
-+    if (s->output.streamon)
-+        return 0;
-+
-+    ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
-+    if (ret < 0)
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n");
-+
-+    if (!s->capture.streamon || ret < 0)
-+        return ret;
-+
-+    ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd);
-+    if (ret < 0)
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno);
-+    else
-+        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n");
-+
-+    return ret;
-+}
-+
- static int v4l2_try_start(AVCodecContext *avctx)
- {
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     V4L2Context *const capture = &s->capture;
--    V4L2Context *const output = &s->output;
-     struct v4l2_selection selection = { 0 };
-     int ret;
- 
-     /* 1. start the output process */
--    if (!output->streamon) {
--        ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMON);
--        if (ret < 0) {
--            av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON on output context\n");
--            return ret;
--        }
--    }
-+    if ((ret = check_output_streamon(avctx, s)) != 0)
-+        return ret;
- 
-     if (capture->streamon)
-         return 0;
-@@ -63,15 +92,29 @@ static int v4l2_try_start(AVCodecContext *avctx)
-     }
- 
-     /* 2.1 update the AVCodecContext */
--    avctx->pix_fmt = ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
--    capture->av_pix_fmt = avctx->pix_fmt;
-+    capture->av_pix_fmt =
-+        ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
-+    if (s->output_drm) {
-+        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-+        avctx->sw_pix_fmt = capture->av_pix_fmt;
-+    }
-+    else
-+        avctx->pix_fmt = capture->av_pix_fmt;
- 
-     /* 3. set the crop parameters */
-+#if 1
-+    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-+    selection.target = V4L2_SEL_TGT_CROP_DEFAULT;
-+    ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
-+    av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
-+#else
-     selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-     selection.r.height = avctx->coded_height;
-     selection.r.width = avctx->coded_width;
-+    av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height);
-     ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
--    if (!ret) {
-+    av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
-+    if (1) {
-         ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
-         if (ret) {
-             av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
-@@ -82,15 +125,7 @@ static int v4l2_try_start(AVCodecContext *avctx)
-             capture->width  = selection.r.width;
-         }
-     }
--
--    /* 4. init the capture context now that we have the capture format */
--    if (!capture->buffers) {
--        ret = ff_v4l2_context_init(capture);
--        if (ret) {
--            av_log(avctx, AV_LOG_ERROR, "can't request capture buffers\n");
--            return AVERROR(ENOMEM);
--        }
--    }
-+#endif
- 
-     /* 5. start the capture process */
-     ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
-@@ -133,50 +168,287 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
-     return 0;
- }
- 
--static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
-+{
-+    return (int64_t)n;
-+}
-+
-+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
-+{
-+    return (unsigned int)pts;
-+}
-+
-+// FFmpeg requires us to propagate a number of vars from the coded pkt into
-+// the decoded frame. The only thing that tracks like that in V4L2 stateful
-+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
-+// guarantees about PTS being unique or specified for every frame so replace
-+// the supplied PTS with a simple incrementing number and keep a circular
-+// buffer of all the things we want preserved (including the original PTS)
-+// indexed by the tracking no.
-+static void
-+xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt)
-+{
-+    int64_t track_pts;
-+
-+    // Avoid 0
-+    if (++s->track_no == 0)
-+        s->track_no = 1;
-+
-+    track_pts = track_to_pts(avctx, s->track_no);
-+
-+    av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no);
-+    s->last_pkt_dts = avpkt->dts;
-+    s->track_els[s->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-+        .discard          = 0,
-+        .pkt_size         = avpkt->size,
-+        .pts              = avpkt->pts,
-+        .reordered_opaque = avctx->reordered_opaque,
-+        .pkt_pos          = avpkt->pos,
-+        .pkt_duration     = avpkt->duration,
-+        .track_pts        = track_pts
-+    };
-+    avpkt->pts = track_pts;
-+}
-+
-+// Returns -1 if we should discard the frame
-+static int
-+xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame)
-+{
-+    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
-+    const V4L2m2mTrackEl *const t = s->track_els + n;
-+    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
-+    {
-+        av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        frame->pts              = AV_NOPTS_VALUE;
-+        frame->pkt_dts          = s->last_pkt_dts;
-+        frame->reordered_opaque = s->last_opaque;
-+        frame->pkt_pos          = -1;
-+        frame->pkt_duration     = 0;
-+        frame->pkt_size         = -1;
-+    }
-+    else if (!t->discard)
-+    {
-+        frame->pts              = t->pts;
-+        frame->pkt_dts          = s->last_pkt_dts;
-+        frame->reordered_opaque = t->reordered_opaque;
-+        frame->pkt_pos          = t->pkt_pos;
-+        frame->pkt_duration     = t->pkt_duration;
-+        frame->pkt_size         = t->pkt_size;
-+
-+        s->last_opaque = s->track_els[n].reordered_opaque;
-+        s->track_els[n].pts = AV_NOPTS_VALUE;  // If we hit this again deny accurate knowledge of PTS
-+    }
-+    else
-+    {
-+        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        return -1;
-+    }
-+
-+    frame->best_effort_timestamp = frame->pts;
-+    frame->pkt_dts               = frame->pts;  // We can't emulate what s/w does in a useful manner?
-+    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts);
-+    return 0;
-+}
-+
-+static inline int stream_started(const V4L2m2mContext * const s) {
-+    return s->capture.streamon && s->output.streamon;
-+}
-+
-+#define NQ_OK        0
-+#define NQ_Q_FULL    1
-+#define NQ_SRC_EMPTY 2
-+#define NQ_DRAINING  3
-+#define NQ_DEAD      4
-+
-+#define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
-+
-+// AVERROR_EOF     Flushing an already flushed stream
-+// -ve             Error (all errors except EOF are unexpected)
-+// NQ_OK (0)       OK
-+// NQ_Q_FULL       Dst full (retry if we think V4L2 Q has space now)
-+// NQ_SRC_EMPTY    Src empty (do not retry)
-+// NQ_DRAINING     At EOS, dQ dest until EOS there too
-+// NQ_DEAD         Not running (do not retry, do not attempt capture dQ)
-+
-+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s)
- {
--    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    V4L2Context *const capture = &s->capture;
--    V4L2Context *const output = &s->output;
-     int ret;
- 
-+    // If we don't already have a coded packet - get a new one
-+    // We will already have a coded pkt if the output Q was full last time we
-+    // tried to Q it
-     if (!s->buf_pkt.size) {
-         ret = ff_decode_get_packet(avctx, &s->buf_pkt);
-+
-+        if (ret == AVERROR(EAGAIN)) {
-+            if (!stream_started(s)) {
-+                av_log(avctx, AV_LOG_TRACE, "%s: receive_frame before 1st coded packet\n", __func__);
-+                return NQ_DEAD;
-+            }
-+            return NQ_SRC_EMPTY;
-+        }
-+
-+        if (ret == AVERROR_EOF) {
-+            // EOF - enter drain mode
-+            av_log(avctx, AV_LOG_TRACE, "--- EOS req: ret=%d, size=%d, started=%d, drain=%d\n",
-+                   ret, s->buf_pkt.size, stream_started(s), s->draining);
-+            if (!stream_started(s)) {
-+                av_log(avctx, AV_LOG_DEBUG, "EOS on flushed stream\n");
-+                s->draining = 1;
-+                s->capture.done = 1;
-+                return AVERROR_EOF;
-+            }
-+
-+            if (!s->draining) {
-+                // Calling enqueue with an empty pkt starts drain
-+                av_assert0(s->buf_pkt.size == 0);
-+                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1);
-+                if (ret) {
-+                    av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
-+                    return ret;
-+                }
-+            }
-+            return NQ_DRAINING;
-+        }
-+
-         if (ret < 0) {
--            if (ret == AVERROR(EAGAIN))
--                return ff_v4l2_context_dequeue_frame(capture, frame, 0);
--            else if (ret != AVERROR_EOF)
--                return ret;
-+            av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
-+            return ret;
-         }
-+
-+        xlat_pts_in(avctx, s, &s->buf_pkt);
-     }
- 
--    if (s->draining)
--        goto dequeue;
-+    if ((ret = check_output_streamon(avctx, s)) != 0)
-+        return ret;
- 
--    ret = ff_v4l2_context_enqueue_packet(output, &s->buf_pkt);
--    if (ret < 0 && ret != AVERROR(EAGAIN))
--        goto fail;
-+    ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt,
-+                                         avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size,
-+                                         1);
- 
--    /* if EAGAIN don't unref packet and try to enqueue in the next iteration */
--    if (ret != AVERROR(EAGAIN))
-+    if (ret == AVERROR(EAGAIN)) {
-+        // Out of input buffers - keep packet
-+        ret = NQ_Q_FULL;
-+    }
-+    else {
-+        // In all other cases we are done with this packet
-         av_packet_unref(&s->buf_pkt);
-+        s->extdata_sent = 1;
- 
--    if (!s->draining) {
--        ret = v4l2_try_start(avctx);
-         if (ret) {
--            /* cant recover */
--            if (ret != AVERROR(ENOMEM))
--                ret = 0;
--            goto fail;
-+            av_log(avctx, AV_LOG_ERROR, "Packet enqueue failure: err=%d\n", ret);
-+            return ret;
-+        }
-+    }
-+
-+    // Start if we haven't
-+    {
-+        const int ret2 = v4l2_try_start(avctx);
-+        if (ret2) {
-+            av_log(avctx, AV_LOG_DEBUG, "Start failure: err=%d\n", ret2);
-+            ret = (ret2 == AVERROR(ENOMEM)) ? ret2 : NQ_DEAD;
-+        }
-+    }
-+
-+    return ret;
-+}
-+
-+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-+{
-+    V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-+    int src_rv;
-+    int dst_rv = 1;  // Non-zero (done), non-negative (error) number
-+
-+    do {
-+        src_rv = try_enqueue_src(avctx, s);
-+
-+        // If we got a frame last time and we have nothing to enqueue then
-+        // return now. rv will be AVERROR(EAGAIN) indicating that we want more input
-+        // This should mean that once decode starts we enter a stable state where
-+        // we alternately ask for input and produce output
-+        if (s->req_pkt && src_rv == NQ_SRC_EMPTY)
-+            break;
-+
-+        if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) {
-+            av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail");
-+            src_rv = NQ_SRC_EMPTY;  // If we can't enqueue pretend that there is nothing to enqueue
-+        }
-+
-+        // Try to get a new frame if
-+        // (a) we haven't already got one AND
-+        // (b) enqueue returned a status indicating that decode should be attempted
-+        if (dst_rv != 0 && TRY_DQ(src_rv)) {
-+            do {
-+                // Dequeue frame will unref any previous contents of frame
-+                // if it returns success so we don't need an explicit unref
-+                // when discarding
-+                // This returns AVERROR(EAGAIN) if there isn't a frame ready yet
-+                // but there is room in the input Q
-+                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1);
-+
-+                if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-+                    av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-+                           s->draining, s->capture.done);
-+                else if (dst_rv && dst_rv != AVERROR(EAGAIN))
-+                    av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
-+                           s->draining, s->capture.done, dst_rv);
-+
-+                // Go again if we got a frame that we need to discard
-+            } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
-+        }
-+
-+        // Continue trying to enqueue packets if either
-+        // (a) we succeeded last time OR
-+        // (b) enqueue failed due to input Q full AND there is now room
-+    } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) );
-+
-+    // Ensure that the frame contains nothing if we aren't returning a frame
-+    // (might happen when discarding)
-+    if (dst_rv)
-+        av_frame_unref(frame);
-+
-+    // If we got a frame this time ask for a pkt next time
-+    s->req_pkt = (dst_rv == 0);
-+
-+#if 0
-+    if (dst_rv == 0)
-+    {
-+        static int z = 0;
-+        if (++z > 50) {
-+            av_log(avctx, AV_LOG_ERROR, "Streamoff and die?\n");
-+            ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
-+            return -1;
-         }
-     }
-+#endif
-+
-+    return dst_rv == 0 ? 0 :
-+        src_rv < 0 ? src_rv :
-+        dst_rv < 0 ? dst_rv :
-+            AVERROR(EAGAIN);
-+}
-+
-+#if 0
-+#include <time.h>
-+static int64_t us_time(void)
-+{
-+    struct timespec ts;
-+    clock_gettime(CLOCK_MONOTONIC, &ts);
-+    return (int64_t)ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
-+}
- 
--dequeue:
--    return ff_v4l2_context_dequeue_frame(capture, frame, -1);
--fail:
--    av_packet_unref(&s->buf_pkt);
-+static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-+{
-+    int ret;
-+    const int64_t now = us_time();
-+    int64_t done;
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+    ret = v4l2_receive_frame2(avctx, frame);
-+    done = us_time();
-+    av_log(avctx, AV_LOG_TRACE, ">>> %s: rx time=%" PRId64 ", rv=%d\n", __func__, done - now, ret);
-     return ret;
- }
-+#endif
- 
- static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- {
-@@ -185,6 +457,9 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     V4L2m2mPriv *priv = avctx->priv_data;
-     int ret;
- 
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+    avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-+
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-         return ret;
-@@ -205,6 +480,28 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
-     capture->av_pix_fmt = avctx->pix_fmt;
- 
-+    /* the client requests the codec to generate DRM frames:
-+     *   - data[0] will therefore point to the returned AVDRMFrameDescriptor
-+     *       check the ff_v4l2_buffer_to_avframe conversion function.
-+     *   - the DRM frame format is passed in the DRM frame descriptor layer.
-+     *       check the v4l2_get_drm_frame function.
-+     */
-+    switch (ff_get_format(avctx, avctx->codec->pix_fmts)) {
-+    default:
-+        s->output_drm = 1;
-+        break;
-+    }
-+
-+    s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
-+    if (!s->device_ref) {
-+        ret = AVERROR(ENOMEM);
-+        return ret;
-+    }
-+
-+    ret = av_hwdevice_ctx_init(s->device_ref);
-+    if (ret < 0)
-+        return ret;
-+
-     s->avctx = avctx;
-     ret = ff_v4l2_m2m_codec_init(priv);
-     if (ret) {
-@@ -217,7 +514,53 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- 
- static av_cold int v4l2_decode_close(AVCodecContext *avctx)
- {
--    return ff_v4l2_m2m_codec_end(avctx->priv_data);
-+    int rv;
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+    rv = ff_v4l2_m2m_codec_end(avctx->priv_data);
-+    av_log(avctx, AV_LOG_TRACE, ">>> %s: rv=%d\n", __func__, rv);
-+    return rv;
-+}
-+
-+static void v4l2_decode_flush(AVCodecContext *avctx)
-+{
-+    // An alternative and more drastic form of flush is to simply do this:
-+    //    v4l2_decode_close(avctx);
-+    //    v4l2_decode_init(avctx);
-+    // The downside is that this keeps a decoder open until all the frames
-+    // associated with it have been returned.  This is a bit wasteful on
-+    // possibly limited h/w resources and fails on a Pi for this reason unless
-+    // more GPU mem is allocated than is the default.
-+
-+    V4L2m2mPriv * const priv = avctx->priv_data;
-+    V4L2m2mContext * const s = priv->context;
-+    V4L2Context * const output = &s->output;
-+    V4L2Context * const capture = &s->capture;
-+    int ret, i;
-+
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
-+
-+    // Reflushing everything is benign, quick and avoids having to worry about
-+    // states like EOS processing so don't try to optimize out (having got it
-+    // wrong once)
-+
-+    ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
-+    if (ret < 0)
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret);
-+
-+    // V4L2 makes no guarantees about whether decoded frames are flushed or not
-+    // so mark all frames we are tracking to be discarded if they appear
-+    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i)
-+        s->track_els[i].discard = 1;
-+
-+    // resend extradata
-+    s->extdata_sent = 0;
-+    // clear EOS status vars
-+    s->draining = 0;
-+    output->done = 0;
-+    capture->done = 0;
-+
-+    // Stream on will occur when we actually submit a new frame
-+    av_log(avctx, AV_LOG_TRACE, ">>> %s\n", __func__);
- }
- 
- #define OFFSET(x) offsetof(V4L2m2mPriv, x)
-@@ -227,9 +570,15 @@ static const AVOption options[] = {
-     V4L_M2M_DEFAULT_OPTS,
-     { "num_capture_buffers", "Number of buffers in the capture context",
-         OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
-+    { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
-     { NULL},
- };
- 
-+static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
-+    HW_CONFIG_INTERNAL(DRM_PRIME),
-+    NULL
-+};
-+
- #define M2MDEC_CLASS(NAME) \
-     static const AVClass v4l2_m2m_ ## NAME ## _dec_class = { \
-         .class_name = #NAME "_v4l2m2m_decoder", \
-@@ -250,11 +599,16 @@ static const AVOption options[] = {
-         .init           = v4l2_decode_init, \
-         FF_CODEC_RECEIVE_FRAME_CB(v4l2_receive_frame), \
-         .close          = v4l2_decode_close, \
-+        .flush          = v4l2_decode_flush, \
-         .bsfs           = bsf_name, \
-         .p.capabilities = AV_CODEC_CAP_HARDWARE | AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AVOID_PROBING, \
-         .caps_internal  = FF_CODEC_CAP_NOT_INIT_THREADSAFE | \
-                           FF_CODEC_CAP_SETS_PKT_DTS | FF_CODEC_CAP_INIT_CLEANUP, \
-         .p.wrapper_name = "v4l2m2m", \
-+        .p.pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
-+                                                         AV_PIX_FMT_NV12, \
-+                                                         AV_PIX_FMT_NONE}, \
-+        .hw_configs     = v4l2_m2m_hw_configs, \
-     }
- 
- M2MDEC(h264,  "H.264", AV_CODEC_ID_H264,       "h264_mp4toannexb");
-
-From 12f8f12326b83dd3c22084f8922705d79a13d195 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 10 Jun 2021 18:46:21 +0100
-Subject: [PATCH 017/136] Fix crash in hw_device_default_name if type not found
- (NONE)
-
----
- fftools/ffmpeg_hw.c | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/fftools/ffmpeg_hw.c b/fftools/ffmpeg_hw.c
-index 88fa782470..740a5e7153 100644
---- a/fftools/ffmpeg_hw.c
-+++ b/fftools/ffmpeg_hw.c
-@@ -75,6 +75,8 @@ static char *hw_device_default_name(enum AVHWDeviceType type)
-     char *name;
-     size_t index_pos;
-     int index, index_limit = 1000;
-+    if (!type_name)
-+        return NULL;
-     index_pos = strlen(type_name);
-     name = av_malloc(index_pos + 4);
-     if (!name)
-
-From 7f6bce459e683bff3a0b972922fbcc808e9177a6 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 10 Jun 2021 18:59:18 +0100
-Subject: [PATCH 018/136] Allow v4l2m2m to select non-drm_prime output formats
-
----
- libavcodec/v4l2_buffers.c |  2 +-
- libavcodec/v4l2_m2m_dec.c | 14 ++++++++++----
- 2 files changed, 11 insertions(+), 5 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index a003934ca1..1ca1128db6 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -524,7 +524,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-                 offset += dst_stride * out->context->height;
-             }
-             if (offset > out->plane_info[0].length) {
--                av_log(NULL, AV_LOG_ERROR, "%s: Plane total %d > buffer size %d\n", __func__, offset, out->plane_info[0].length);
-+                av_log(NULL, AV_LOG_ERROR, "%s: Plane total %u > buffer size %zu\n", __func__, offset, out->plane_info[0].length);
-                 return -1;
-             }
- 
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 7f6033ac2c..a4b5a4e7e9 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -455,10 +455,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     V4L2Context *capture, *output;
-     V4L2m2mContext *s;
-     V4L2m2mPriv *priv = avctx->priv_data;
-+    int gf_pix_fmt;
-     int ret;
- 
-     av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
--    avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
- 
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-@@ -486,10 +486,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-      *   - the DRM frame format is passed in the DRM frame descriptor layer.
-      *       check the v4l2_get_drm_frame function.
-      */
--    switch (ff_get_format(avctx, avctx->codec->pix_fmts)) {
--    default:
-+
-+    gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
-+    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n",
-+           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
-+
-+    s->output_drm = 0;
-+    if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
-+        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-         s->output_drm = 1;
--        break;
-     }
- 
-     s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
-@@ -607,6 +612,7 @@ static const AVCodecHWConfigInternal *v4l2_m2m_hw_configs[] = {
-         .p.wrapper_name = "v4l2m2m", \
-         .p.pix_fmts     = (const enum AVPixelFormat[]) { AV_PIX_FMT_DRM_PRIME, \
-                                                          AV_PIX_FMT_NV12, \
-+                                                         AV_PIX_FMT_YUV420P, \
-                                                          AV_PIX_FMT_NONE}, \
-         .hw_configs     = v4l2_m2m_hw_configs, \
-     }
-
-From 9b0d964b727d98271f7f2f4dcdbcb1b41a429e2b Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 10 Jun 2021 18:59:38 +0100
-Subject: [PATCH 019/136] Fix YUV420P output from v4l2m2m
-
-Also put get_width get_height inlines in header as they are generally
-useful.
----
- libavcodec/v4l2_buffers.c | 12 ++++++------
- libavcodec/v4l2_context.c | 22 ++++++----------------
- libavcodec/v4l2_m2m.h     | 12 ++++++++++++
- 3 files changed, 24 insertions(+), 22 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 1ca1128db6..f4c11ca8d0 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -425,17 +425,17 @@ static int v4l2_buffer_buf_to_swframe(AVFrame *frame, V4L2Buffer *avbuf)
-     case AV_PIX_FMT_NV21:
-         if (avbuf->num_planes > 1)
-             break;
--        frame->linesize[1] = avbuf->plane_info[0].bytesperline;
--        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
-+        frame->linesize[1] = frame->linesize[0];
-+        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
-         break;
- 
-     case AV_PIX_FMT_YUV420P:
-         if (avbuf->num_planes > 1)
-             break;
--        frame->linesize[1] = avbuf->plane_info[0].bytesperline >> 1;
--        frame->linesize[2] = avbuf->plane_info[0].bytesperline >> 1;
--        frame->data[1] = frame->buf[0]->data + avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height;
--        frame->data[2] = frame->data[1] + ((avbuf->plane_info[0].bytesperline * avbuf->context->format.fmt.pix_mp.height) >> 2);
-+        frame->linesize[1] = frame->linesize[0] / 2;
-+        frame->linesize[2] = frame->linesize[1];
-+        frame->data[1] = frame->data[0] + frame->linesize[0] * ff_v4l2_get_format_height(&avbuf->context->format);
-+        frame->data[2] = frame->data[1] + frame->linesize[1] * ff_v4l2_get_format_height(&avbuf->context->format) / 2;
-         break;
- 
-     default:
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index be76068af3..6fe2586627 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -55,16 +55,6 @@ static inline AVCodecContext *logger(V4L2Context *ctx)
-     return ctx_to_m2mctx(ctx)->avctx;
- }
- 
--static inline unsigned int v4l2_get_width(struct v4l2_format *fmt)
--{
--    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
--}
--
--static inline unsigned int v4l2_get_height(struct v4l2_format *fmt)
--{
--    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
--}
--
- static AVRational v4l2_get_sar(V4L2Context *ctx)
- {
-     struct AVRational sar = { 0, 1 };
-@@ -96,8 +86,8 @@ static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2
-     if (ret)
-         av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
-             ctx->name,
--            v4l2_get_width(fmt1), v4l2_get_height(fmt1),
--            v4l2_get_width(fmt2), v4l2_get_height(fmt2));
-+            ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
-+            ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
- 
-     return ret;
- }
-@@ -195,8 +185,8 @@ static int do_source_change(V4L2m2mContext * const s)
- 
-     reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
-     if (reinit) {
--        s->capture.height = v4l2_get_height(&cap_fmt);
--        s->capture.width = v4l2_get_width(&cap_fmt);
-+        s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
-+        s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
-     }
-     s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
- 
-@@ -973,8 +963,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
-     av_log(logger(ctx), AV_LOG_DEBUG, "%s: %s %02d buffers initialized: %04ux%04u, sizeimage %08u, bytesperline %08u\n", ctx->name,
-         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? av_fourcc2str(ctx->format.fmt.pix_mp.pixelformat) : av_fourcc2str(ctx->format.fmt.pix.pixelformat),
-         req.count,
--        v4l2_get_width(&ctx->format),
--        v4l2_get_height(&ctx->format),
-+        ff_v4l2_get_format_width(&ctx->format),
-+        ff_v4l2_get_format_height(&ctx->format),
-         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage : ctx->format.fmt.pix.sizeimage,
-         V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ? ctx->format.fmt.pix_mp.plane_fmt[0].bytesperline : ctx->format.fmt.pix.bytesperline);
- 
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 24a9c94864..8f054f2f50 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -160,4 +160,16 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
-  */
- int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
- 
-+
-+static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
-+}
-+
-+static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
-+}
-+
-+
- #endif /* AVCODEC_V4L2_M2M_H */
-
-From 14e9b4bf1b34b3d1e1e6a4fc755cc595416e7d7b Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 10 Jun 2021 19:23:44 +0100
-Subject: [PATCH 020/136] Report buffer overflows in v4l2m2m
-
----
- libavcodec/v4l2_buffers.c | 14 ++++++++++----
- libavcodec/v4l2_context.c |  5 ++++-
- 2 files changed, 14 insertions(+), 5 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index f4c11ca8d0..de31f7ced9 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -364,6 +364,7 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
- static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, int size, int offset)
- {
-     unsigned int bytesused, length;
-+    int rv = 0;
- 
-     if (plane >= out->num_planes)
-         return AVERROR(EINVAL);
-@@ -371,11 +372,16 @@ static int v4l2_bufref_to_buf(V4L2Buffer *out, int plane, const uint8_t* data, i
-     length = out->plane_info[plane].length;
-     bytesused = FFMIN(size+offset, length);
- 
--    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, FFMIN(size, length-offset));
-+    if (size > length - offset) {
-+        size = length - offset;
-+        rv = AVERROR(ENOMEM);
-+    }
-+
-+    memcpy((uint8_t*)out->plane_info[plane].mm_addr+offset, data, size);
- 
-     set_buf_length(out, plane, bytesused, length);
- 
--    return 0;
-+    return rv;
- }
- 
- static AVBufferRef * wrap_avbuf(V4L2Buffer * const avbuf)
-@@ -630,7 +636,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-     }
- 
-     ret = v4l2_bufref_to_buf(out, 0, pkt->data, pkt->size, extlen);
--    if (ret)
-+    if (ret && ret != AVERROR(ENOMEM))
-         return ret;
- 
-     v4l2_set_pts(out, pkt->pts, no_rescale_pts);
-@@ -638,7 +644,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-     if (pkt->flags & AV_PKT_FLAG_KEY)
-         out->flags = V4L2_BUF_FLAG_KEYFRAME;
- 
--    return 0;
-+    return ret;
- }
- 
- int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 6fe2586627..81aced0c2b 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -824,7 +824,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-         return AVERROR(EAGAIN);
- 
-     ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts);
--    if (ret)
-+    if (ret == AVERROR(ENOMEM))
-+        av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
-+               __func__, pkt->size, avbuf->planes[0].length);
-+    else if (ret)
-         return ret;
- 
-     return ff_v4l2_buffer_enqueue(avbuf);
-
-From 072907a7fcf160d12972997d24fdf62641687ea4 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 14 Jun 2021 11:55:16 +0100
-Subject: [PATCH 021/136] Increase V4L2 H264 stateful coded buffer size
-
-Try to set a min size of frame size / 2 for bitbuffers passed to V4L2.
-This fixes a few streams that have large I-frames.  You would hope
-Annex-A gave useful minCR so an appropriate size could be calculated
-but it doesn't really.  It gives good guidance for bits required over
-time but the instantaneous limits are very weak so it is possible
-that even this won't be enough.  The correct long term solution would
-be to have resizable dmabufs but that is a greater rewrite than seems
-sensible now.
----
- libavcodec/v4l2_context.c | 24 +++++++++++++++++++++++-
- libavcodec/v4l2_context.h |  6 ++++++
- libavcodec/v4l2_m2m_dec.c | 24 ++++++++++++++++++++++++
- 3 files changed, 53 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 81aced0c2b..a17ae027a6 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -902,7 +902,29 @@ int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
- 
- int ff_v4l2_context_set_format(V4L2Context* ctx)
- {
--    return ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
-+    int ret;
-+
-+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
-+    if (ret != 0)
-+        return ret;
-+
-+    // Check returned size against min size and if smaller have another go
-+    // Only worry about plane[0] as this is meant to enforce limits for
-+    // encoded streams where we might know a bit more about the shape
-+    // than the driver
-+    if (V4L2_TYPE_IS_MULTIPLANAR(ctx->format.type)) {
-+        if (ctx->min_buf_size <= ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage)
-+            return 0;
-+        ctx->format.fmt.pix_mp.plane_fmt[0].sizeimage = ctx->min_buf_size;
-+    }
-+    else {
-+        if (ctx->min_buf_size <= ctx->format.fmt.pix.sizeimage)
-+            return 0;
-+        ctx->format.fmt.pix.sizeimage = ctx->min_buf_size;
-+    }
-+
-+    ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_S_FMT, &ctx->format);
-+    return ret;
- }
- 
- void ff_v4l2_context_release(V4L2Context* ctx)
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 59009d11d1..37b0431400 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -75,6 +75,12 @@ typedef struct V4L2Context {
-     AVRational sample_aspect_ratio;
-     struct v4l2_rect selection;
- 
-+    /**
-+     * If the default size of buffer is less than this then try to
-+     * set to this.
-+     */
-+    uint32_t min_buf_size;
-+
-     /**
-      * Indexed array of pointers to V4L2Buffers
-      */
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index a4b5a4e7e9..1851acbc93 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -450,6 +450,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- }
- #endif
- 
-+static uint32_t max_coded_size(const AVCodecContext * const avctx)
-+{
-+    uint32_t wxh = avctx->coded_width * avctx->coded_height;
-+    uint32_t size;
-+
-+    // Currently the only thing we try to set our own limits for is H264
-+    if (avctx->codec_id != AV_CODEC_ID_H264)
-+        return 0;
-+
-+    size = wxh * 3 / 2;
-+    // H.264 Annex A table A-1 gives minCR which is either 2 or 4
-+    // unfortunately that doesn't yield an actually useful limit
-+    // and it should be noted that frame 0 is special cased to allow
-+    // a bigger number which really isn't helpful for us. So just pick
-+    // frame_size / 2
-+    size /= 2;
-+    // Add 64k to allow for any overheads and/or encoder hopefulness
-+    // with small WxH
-+    return size + (1 << 16);
-+}
-+
- static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- {
-     V4L2Context *capture, *output;
-@@ -460,6 +481,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- 
-     av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
- 
-+    av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-         return ret;
-@@ -476,9 +498,11 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- 
-     output->av_codec_id = avctx->codec_id;
-     output->av_pix_fmt  = AV_PIX_FMT_NONE;
-+    output->min_buf_size = max_coded_size(avctx);
- 
-     capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
-     capture->av_pix_fmt = avctx->pix_fmt;
-+    capture->min_buf_size = 0;
- 
-     /* the client requests the codec to generate DRM frames:
-      *   - data[0] will therefore point to the returned AVDRMFrameDescriptor
-
-From 6087c8c054e1ff3d2e6e62d5e32705d079928b64 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 28 Jun 2021 12:13:35 +0100
-Subject: [PATCH 022/136] Fix raw video s.t. it respects any remaining cropping
-
-This fixes the long standing CONFWIN_A conformance test failure for drm.
----
- libavcodec/rawenc.c       |  32 ++++++++---
- libavutil/hwcontext_drm.c | 112 ++++++++++++++++++++++++++++++++++++--
- 2 files changed, 130 insertions(+), 14 deletions(-)
-
-diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
-index 594a77c42a..8ca0379e12 100644
---- a/libavcodec/rawenc.c
-+++ b/libavcodec/rawenc.c
-@@ -124,32 +124,41 @@ static int raw_sand30_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
- 
- 
- static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
--                      const AVFrame *frame, int *got_packet)
-+                      const AVFrame *src_frame, int *got_packet)
- {
-     int ret;
-+    AVFrame * frame = NULL;
- 
- #if CONFIG_SAND
--    if (av_rpi_is_sand_frame(frame)) {
--        ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) :
--            av_rpi_is_sand16_frame(frame) ? raw_sand16_as_yuv420(avctx, pkt, frame) :
--            av_rpi_is_sand30_frame(frame) ? raw_sand30_as_yuv420(avctx, pkt, frame) : -1;
-+    if (av_rpi_is_sand_frame(src_frame)) {
-+        ret = av_rpi_is_sand8_frame(src_frame) ? raw_sand8_as_yuv420(avctx, pkt, src_frame) :
-+            av_rpi_is_sand16_frame(src_frame) ? raw_sand16_as_yuv420(avctx, pkt, src_frame) :
-+            av_rpi_is_sand30_frame(src_frame) ? raw_sand30_as_yuv420(avctx, pkt, src_frame) : -1;
-         *got_packet = (ret == 0);
-         return ret;
-     }
- #endif
- 
-+    if ((frame = av_frame_clone(src_frame)) == NULL) {
-+        ret = AVERROR(ENOMEM);
-+        goto fail;
-+    }
-+
-+    if ((ret = av_frame_apply_cropping(frame, AV_FRAME_CROP_UNALIGNED)) < 0)
-+        goto fail;
-+
-     ret = av_image_get_buffer_size(frame->format,
-                                        frame->width, frame->height, 1);
-     if (ret < 0)
--        return ret;
-+        goto fail;
- 
-     if ((ret = ff_get_encode_buffer(avctx, pkt, ret, 0)) < 0)
--        return ret;
-+        goto fail;
-     if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
-                                        (const uint8_t **)frame->data, frame->linesize,
-                                        frame->format,
-                                        frame->width, frame->height, 1)) < 0)
--        return ret;
-+        goto fail;
- 
-     if(avctx->codec_tag == AV_RL32("yuv2") && ret > 0 &&
-        frame->format   == AV_PIX_FMT_YUYV422) {
-@@ -165,8 +174,15 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
-             AV_WB64(&pkt->data[8 * x], v << 48 | v >> 16);
-         }
-     }
-+    pkt->flags |= AV_PKT_FLAG_KEY;
-+    av_frame_free(&frame);
-     *got_packet = 1;
-     return 0;
-+
-+fail:
-+    av_frame_free(&frame);
-+    *got_packet = 0;
-+    return ret;
- }
- 
- const FFCodec ff_rawvideo_encoder = {
-diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
-index 7a9fdbd263..baf18920fa 100644
---- a/libavutil/hwcontext_drm.c
-+++ b/libavutil/hwcontext_drm.c
-@@ -21,6 +21,7 @@
- #include <fcntl.h>
- #include <sys/mman.h>
- #include <unistd.h>
-+#include <sys/ioctl.h>
- 
- /* This was introduced in version 4.6. And may not exist all without an
-  * optional package. So to prevent a hard dependency on needing the Linux
-@@ -31,6 +32,7 @@
- #endif
- 
- #include <drm.h>
-+#include <libdrm/drm_fourcc.h>
- #include <xf86drm.h>
- 
- #include "avassert.h"
-@@ -38,7 +40,9 @@
- #include "hwcontext_drm.h"
- #include "hwcontext_internal.h"
- #include "imgutils.h"
--
-+#if CONFIG_SAND
-+#include "libavutil/rpi_sand_fns.h"
-+#endif
- 
- static void drm_device_free(AVHWDeviceContext *hwdev)
- {
-@@ -53,6 +57,11 @@ static int drm_device_create(AVHWDeviceContext *hwdev, const char *device,
-     AVDRMDeviceContext *hwctx = hwdev->hwctx;
-     drmVersionPtr version;
- 
-+    if (device == NULL) {
-+        hwctx->fd = -1;
-+        return 0;
-+    }
-+
-     hwctx->fd = open(device, O_RDWR);
-     if (hwctx->fd < 0)
-         return AVERROR(errno);
-@@ -139,6 +148,8 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
-     if (flags & AV_HWFRAME_MAP_WRITE)
-         mmap_prot |= PROT_WRITE;
- 
-+    if (dst->format == AV_PIX_FMT_NONE)
-+        dst->format = hwfc->sw_format;
- #if HAVE_LINUX_DMA_BUF_H
-     if (flags & AV_HWFRAME_MAP_READ)
-         map->sync_flags |= DMA_BUF_SYNC_READ;
-@@ -185,6 +196,23 @@ static int drm_map_frame(AVHWFramesContext *hwfc,
- 
-     dst->width  = src->width;
-     dst->height = src->height;
-+    dst->crop_top    = src->crop_top;
-+    dst->crop_bottom = src->crop_bottom;
-+    dst->crop_left   = src->crop_left;
-+    dst->crop_right  = src->crop_right;
-+
-+#if CONFIG_SAND
-+    // Rework for sand frames
-+    if (av_rpi_is_sand_frame(dst)) {
-+        // As it stands the sand formats hold stride2 in linesize[3]
-+        // linesize[0] & [1] contain stride1 which is always 128 for everything we do
-+        // * Arguably this should be reworked s.t. stride2 is in linesize[0] & [1]
-+        dst->linesize[3] = fourcc_mod_broadcom_param(desc->objects[0].format_modifier);
-+        dst->linesize[0] = 128;
-+        dst->linesize[1] = 128;
-+        // *** Are we sure src->height is actually what we want ???
-+    }
-+#endif
- 
-     err = ff_hwframe_map_create(src->hw_frames_ctx, dst, src,
-                                 &drm_unmap_frame, map);
-@@ -212,7 +240,15 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
-     if (!pix_fmts)
-         return AVERROR(ENOMEM);
- 
--    pix_fmts[0] = ctx->sw_format;
-+    // **** Offer native sand too ????
-+    pix_fmts[0] =
-+#if CONFIG_SAND
-+        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
-+            AV_PIX_FMT_YUV420P :
-+        ctx->sw_format == AV_PIX_FMT_RPI4_10 ?
-+            AV_PIX_FMT_YUV420P10LE :
-+#endif
-+            ctx->sw_format;
-     pix_fmts[1] = AV_PIX_FMT_NONE;
- 
-     *formats = pix_fmts;
-@@ -231,18 +267,79 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc,
-     map = av_frame_alloc();
-     if (!map)
-         return AVERROR(ENOMEM);
--    map->format = dst->format;
- 
-+    // Map to default
-+    map->format = AV_PIX_FMT_NONE;
-     err = drm_map_frame(hwfc, map, src, AV_HWFRAME_MAP_READ);
-     if (err)
-         goto fail;
- 
--    map->width  = dst->width;
--    map->height = dst->height;
-+#if 0
-+    av_log(hwfc, AV_LOG_INFO, "%s: src fmt=%d (%d), dst fmt=%d (%d) s=%dx%d l=%d/%d/%d/%d, d=%dx%d l=%d/%d/%d\n", __func__,
-+           hwfc->sw_format, AV_PIX_FMT_RPI4_8, dst->format, AV_PIX_FMT_YUV420P10LE,
-+           map->width, map->height,
-+           map->linesize[0],
-+           map->linesize[1],
-+           map->linesize[2],
-+           map->linesize[3],
-+           dst->width, dst->height,
-+           dst->linesize[0],
-+           dst->linesize[1],
-+           dst->linesize[2]);
-+#endif
-+#if CONFIG_SAND
-+    if (av_rpi_is_sand_frame(map)) {
-+        // Preserve crop - later ffmpeg code assumes that we have in that it
-+        // overwrites any crop that we create with the old values
-+        const unsigned int w = FFMIN(dst->width, map->width);
-+        const unsigned int h = FFMIN(dst->height, map->height);
-+
-+        if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) {
-+            av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                     map->data[0],
-+                                     128, stride2,
-+                                     0, 0, w, h);
-+            av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
-+                                     dst->data[2], dst->linesize[2],
-+                                     map->data[1],
-+                                     128, stride2,
-+                                     0, 0, w / 2, h / 2);
-+        }
-+        else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) {
-+            av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
-+                                     map->data[0],
-+                                     128, stride2,
-+                                     0, 0, w, h);
-+            av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
-+                                     dst->data[2], dst->linesize[2],
-+                                     map->data[1],
-+                                     128, stride2,
-+                                     0, 0, w / 2, h / 2);
-+        }
-+        else
-+        {
-+            av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
-+            err = AVERROR(EINVAL);
-+            goto fail;
-+        }
-+
-+        dst->width = w;
-+        dst->height = h;
-+    }
-+    else
-+#endif
-+    {
-+        // Kludge mapped h/w s.t. frame_copy works
-+        map->width  = dst->width;
-+        map->height = dst->height;
-+        err = av_frame_copy(dst, map);
-+    }
- 
--    err = av_frame_copy(dst, map);
-     if (err)
-+    {
-+        av_log(hwfc, AV_LOG_ERROR, "%s: Copy fail\n", __func__);
-         goto fail;
-+    }
- 
-     err = 0;
- fail:
-@@ -257,7 +354,10 @@ static int drm_transfer_data_to(AVHWFramesContext *hwfc,
-     int err;
- 
-     if (src->width > hwfc->width || src->height > hwfc->height)
-+    {
-+        av_log(hwfc, AV_LOG_ERROR, "%s: H/w mismatch: %d/%d, %d/%d\n", __func__, dst->width, hwfc->width, dst->height, hwfc->height);
-         return AVERROR(EINVAL);
-+    }
- 
-     map = av_frame_alloc();
-     if (!map)
-
-From 597858c11fbfbe0f54c1b68d9683025929258bc1 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 13 Aug 2021 15:38:28 +0100
-Subject: [PATCH 023/136] Set frame interlace from V4L2 buffer field
-
----
- libavcodec/v4l2_buffers.c | 12 ++++++++++++
- 1 file changed, 12 insertions(+)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index de31f7ced9..97b8eb1db3 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -222,6 +222,16 @@ static enum AVColorTransferCharacteristic v4l2_get_color_trc(V4L2Buffer *buf)
-     return AVCOL_TRC_UNSPECIFIED;
- }
- 
-+static int v4l2_buf_is_interlaced(const V4L2Buffer * const buf)
-+{
-+    return V4L2_FIELD_IS_INTERLACED(buf->buf.field);
-+}
-+
-+static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
-+{
-+    return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
-+}
-+
- static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
- {
-     AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
-@@ -576,6 +586,8 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc
-     frame->color_trc = v4l2_get_color_trc(avbuf);
-     frame->pts = v4l2_get_pts(avbuf, no_rescale_pts);
-     frame->pkt_dts = AV_NOPTS_VALUE;
-+    frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
-+    frame->top_field_first = v4l2_buf_is_top_first(avbuf);
- 
-     /* these values are updated also during re-init in v4l2_process_driver_event */
-     frame->height = ctx->height;
-
-From 05906e2086b5087d615485ec9a09b1493dbb32e1 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 13 Aug 2021 16:11:53 +0100
-Subject: [PATCH 024/136] Fix V4L2 stateful to avoid crash if flush before
- start
-
----
- libavcodec/v4l2_context.c | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index a17ae027a6..eb901e8fab 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -713,6 +713,10 @@ static int v4l2_get_coded_format(V4L2Context* ctx, uint32_t *p)
- static void flush_all_buffers_status(V4L2Context* const ctx)
- {
-     int i;
-+
-+    if (!ctx->bufrefs)
-+        return;
-+
-     for (i = 0; i < ctx->num_buffers; ++i) {
-         struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
-         if (buf->status == V4L2BUF_IN_DRIVER)
-
-From 7157b6032e759078a7d751e5dd5762970f3d1e8c Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 9 Sep 2021 17:44:13 +0100
-Subject: [PATCH 025/136] Copy properties from frame to v4l2 buffer
-
-Now copies all the properties in ff_v4l2_buffer_avframe_to_buf that
-ff_v4l2_buffer_buf_to_avframe copies
----
- libavcodec/v4l2_buffers.c | 126 ++++++++++++++++++++++++++++++++++++++
- 1 file changed, 126 insertions(+)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 97b8eb1db3..126d2a17f4 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -128,6 +128,105 @@ static enum AVColorPrimaries v4l2_get_color_primaries(V4L2Buffer *buf)
-     return AVCOL_PRI_UNSPECIFIED;
- }
- 
-+static void v4l2_set_color(V4L2Buffer *buf,
-+                           const enum AVColorPrimaries avcp,
-+                           const enum AVColorSpace avcs,
-+                           const enum AVColorTransferCharacteristic avxc)
-+{
-+    enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
-+    enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
-+    enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
-+
-+    switch (avcp) {
-+    case AVCOL_PRI_BT709:
-+        cs = V4L2_COLORSPACE_REC709;
-+        ycbcr = V4L2_YCBCR_ENC_709;
-+        break;
-+    case AVCOL_PRI_BT470M:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
-+        ycbcr = V4L2_YCBCR_ENC_601;
-+        break;
-+    case AVCOL_PRI_BT470BG:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
-+        break;
-+    case AVCOL_PRI_SMPTE170M:
-+        cs = V4L2_COLORSPACE_SMPTE170M;
-+        break;
-+    case AVCOL_PRI_SMPTE240M:
-+        cs = V4L2_COLORSPACE_SMPTE240M;
-+        break;
-+    case AVCOL_PRI_BT2020:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        break;
-+    case AVCOL_PRI_SMPTE428:
-+    case AVCOL_PRI_SMPTE431:
-+    case AVCOL_PRI_SMPTE432:
-+    case AVCOL_PRI_EBU3213:
-+    case AVCOL_PRI_RESERVED:
-+    case AVCOL_PRI_FILM:
-+    case AVCOL_PRI_UNSPECIFIED:
-+    default:
-+        break;
-+    }
-+
-+    switch (avcs) {
-+    case AVCOL_SPC_RGB:
-+        cs = V4L2_COLORSPACE_SRGB;
-+        break;
-+    case AVCOL_SPC_BT709:
-+        cs = V4L2_COLORSPACE_REC709;
-+        break;
-+    case AVCOL_SPC_FCC:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
-+        break;
-+    case AVCOL_SPC_BT470BG:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
-+        break;
-+    case AVCOL_SPC_SMPTE170M:
-+        cs = V4L2_COLORSPACE_SMPTE170M;
-+        break;
-+    case AVCOL_SPC_SMPTE240M:
-+        cs = V4L2_COLORSPACE_SMPTE240M;
-+        break;
-+    case AVCOL_SPC_BT2020_CL:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
-+        break;
-+    case AVCOL_SPC_BT2020_NCL:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        break;
-+    default:
-+        break;
-+    }
-+
-+    switch (xfer) {
-+    case AVCOL_TRC_BT709:
-+        xfer = V4L2_XFER_FUNC_709;
-+        break;
-+    case AVCOL_TRC_IEC61966_2_1:
-+        xfer = V4L2_XFER_FUNC_SRGB;
-+        break;
-+    case AVCOL_TRC_SMPTE240M:
-+        xfer = V4L2_XFER_FUNC_SMPTE240M;
-+        break;
-+    case AVCOL_TRC_SMPTE2084:
-+        xfer = V4L2_XFER_FUNC_SMPTE2084;
-+        break;
-+    default:
-+        break;
-+    }
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
-+        buf->context->format.fmt.pix_mp.colorspace = cs;
-+        buf->context->format.fmt.pix_mp.ycbcr_enc = ycbcr;
-+        buf->context->format.fmt.pix_mp.xfer_func = xfer;
-+    } else {
-+        buf->context->format.fmt.pix.colorspace = cs;
-+        buf->context->format.fmt.pix.ycbcr_enc = ycbcr;
-+        buf->context->format.fmt.pix.xfer_func = xfer;
-+    }
-+}
-+
- static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
- {
-     enum v4l2_quantization qt;
-@@ -146,6 +245,20 @@ static enum AVColorRange v4l2_get_color_range(V4L2Buffer *buf)
-      return AVCOL_RANGE_UNSPECIFIED;
- }
- 
-+static void v4l2_set_color_range(V4L2Buffer *buf, const enum AVColorRange avcr)
-+{
-+    const enum v4l2_quantization q =
-+        avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
-+        avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
-+            V4L2_QUANTIZATION_DEFAULT;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buf.type)) {
-+        buf->context->format.fmt.pix_mp.quantization = q;
-+    } else {
-+        buf->context->format.fmt.pix.quantization = q;
-+    }
-+}
-+
- static enum AVColorSpace v4l2_get_color_space(V4L2Buffer *buf)
- {
-     enum v4l2_ycbcr_encoding ycbcr;
-@@ -232,6 +345,12 @@ static int v4l2_buf_is_top_first(const V4L2Buffer * const buf)
-     return buf->buf.field == V4L2_FIELD_INTERLACED_TB;
- }
- 
-+static void v4l2_set_interlace(V4L2Buffer * const buf, const int is_interlaced, const int is_tff)
-+{
-+    buf->buf.field = !is_interlaced ? V4L2_FIELD_NONE :
-+        is_tff ? V4L2_FIELD_INTERLACED_TB : V4L2_FIELD_INTERLACED_BT;
-+}
-+
- static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
- {
-     AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
-@@ -561,7 +680,14 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- 
- int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- {
-+    out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME);
-+    // Beware that colour info is held in format rather than the actual
-+    // v4l2 buffer struct so this may not be as useful as you might hope
-+    v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
-+    v4l2_set_color_range(out, frame->color_range);
-+    // PTS & interlace are buffer vars
-     v4l2_set_pts(out, frame->pts, 0);
-+    v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
- 
-     return v4l2_buffer_swframe_to_buf(frame, out);
- }
-
-From 15415ab226f966fd12e70d79fde3cb80f3d09144 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 17 Nov 2021 16:49:01 +0000
-Subject: [PATCH 026/136] ffmpeg: Do not inc DTS on no decode output
-
-V4L2 H264 decode has long latency and sometimes spits out a long stream
-of output without input. In this case incrementing DTS is wrong. There
-may be cases where the condition as written is correct so only "fix" in
-the cases which cause problems
----
- fftools/ffmpeg.c | 7 ++++++-
- 1 file changed, 6 insertions(+), 1 deletion(-)
-
-diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
-index 5dc2cd73c1..ba0c1898cf 100644
---- a/fftools/ffmpeg.c
-+++ b/fftools/ffmpeg.c
-@@ -2609,7 +2609,12 @@ static int process_input_packet(InputStream *ist, const AVPacket *pkt, int no_eo
-         case AVMEDIA_TYPE_VIDEO:
-             ret = decode_video    (ist, repeating ? NULL : avpkt, &got_output, &duration_pts, !pkt,
-                                    &decode_failed);
--            if (!repeating || !pkt || got_output) {
-+            // Pi: Do not inc dts if no_cvt_hw set
-+            // V4L2 H264 decode has long latency and sometimes spits out a long
-+            // stream of output without input. In this case incrementing DTS is wrong.
-+            // There may be cases where the condition as written is correct so only
-+            // "fix" in the cases which cause problems
-+            if (!repeating || !pkt || (got_output && !no_cvt_hw)) {
-                 if (pkt && pkt->duration) {
-                     duration_dts = av_rescale_q(pkt->duration, ist->st->time_base, AV_TIME_BASE_Q);
-                 } else if(ist->dec_ctx->framerate.num != 0 && ist->dec_ctx->framerate.den != 0) {
-
-From 7bf6c062ed8a1e635aa5722c0072724f236daf00 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 17 Nov 2021 17:32:59 +0000
-Subject: [PATCH 027/136] v4l2_m2m_dec: Adjust timebase if H264
-
-Adjust AVCodecContext time_base if H264 in the same way that the
-software decoder does.
----
- libavcodec/v4l2_m2m_dec.c | 10 ++++++++++
- 1 file changed, 10 insertions(+)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 1851acbc93..aa1e5c1597 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -481,6 +481,16 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- 
-     av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
- 
-+    if (avctx->codec_id == AV_CODEC_ID_H264) {
-+        if (avctx->ticks_per_frame == 1) {
-+            if(avctx->time_base.den < INT_MAX/2) {
-+                avctx->time_base.den *= 2;
-+            } else
-+                avctx->time_base.num /= 2;
-+        }
-+        avctx->ticks_per_frame = 2;
-+    }
-+
-     av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-
-From 3cd23a761397ae75ed032c1687da5d6b76ddaaaa Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 17 Nov 2021 17:38:27 +0000
-Subject: [PATCH 028/136] v4l2_m2m_dec: Produce best guess PTSs if none
- supplied
-
-Filter scheduling gets confused by missing PTSs and makes poor guesses
-more often than not.  Try to generate plausible timestamps where we are
-missing them.
----
- libavcodec/v4l2_m2m.h     | 12 ++++++++
- libavcodec/v4l2_m2m_dec.c | 64 +++++++++++++++++++++++++++++++++++++--
- 2 files changed, 74 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 8f054f2f50..82feb0afdb 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -52,6 +52,16 @@ typedef struct V4L2m2mTrackEl {
-     int64_t track_pts;
- } V4L2m2mTrackEl;
- 
-+typedef struct pts_stats_s
-+{
-+    void * logctx;
-+    const char * name;  // For debug
-+    unsigned int last_count;
-+    unsigned int last_interval;
-+    int64_t last_pts;
-+    int64_t guess;
-+} pts_stats_t;
-+
- typedef struct V4L2m2mContext {
-     char devname[PATH_MAX];
-     int fd;
-@@ -91,6 +101,8 @@ typedef struct V4L2m2mContext {
-     unsigned int track_no;
-     V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
- 
-+    pts_stats_t pts_stat;
-+
-     /* req pkt */
-     int req_pkt;
- 
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index aa1e5c1597..a5a2afbd27 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -42,6 +42,62 @@
- #include "v4l2_m2m.h"
- #include "v4l2_fmt.h"
- 
-+// Pick 64 for max last count - that is >1sec at 60fps
-+#define STATS_LAST_COUNT_MAX 64
-+#define STATS_INTERVAL_MAX (1 << 30)
-+
-+static int64_t pts_stats_guess(const pts_stats_t * const stats)
-+{
-+    if (stats->last_pts == AV_NOPTS_VALUE ||
-+            stats->last_interval == 0 ||
-+            stats->last_count >= STATS_LAST_COUNT_MAX)
-+        return AV_NOPTS_VALUE;
-+    return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
-+}
-+
-+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
-+{
-+    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
-+        if (stats->last_count < STATS_LAST_COUNT_MAX)
-+            ++stats->last_count;
-+        return;
-+    }
-+
-+    if (stats->last_pts != AV_NOPTS_VALUE) {
-+        const int64_t interval = pts - stats->last_pts;
-+
-+        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
-+            stats->last_count >= STATS_LAST_COUNT_MAX) {
-+            if (stats->last_interval != 0)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
-+                       __func__, stats->name, interval, stats->last_count);
-+            stats->last_interval = 0;
-+        }
-+        else {
-+            const int64_t frame_time = interval / (int64_t)stats->last_count;
-+
-+            if (frame_time != stats->last_interval)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
-+                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
-+            stats->last_interval = frame_time;
-+        }
-+    }
-+
-+    stats->last_pts = pts;
-+    stats->last_count = 1;
-+}
-+
-+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
-+{
-+    *stats = (pts_stats_t){
-+        .logctx = logctx,
-+        .name = name,
-+        .last_count = 1,
-+        .last_interval = 0,
-+        .last_pts = AV_NOPTS_VALUE
-+    };
-+}
-+
- static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
- {
-     int ret;
-@@ -244,9 +300,11 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons
-         return -1;
-     }
- 
--    frame->best_effort_timestamp = frame->pts;
-+    pts_stats_add(&s->pts_stat, frame->pts);
-+
-+    frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat);
-     frame->pkt_dts               = frame->pts;  // We can't emulate what s/w does in a useful manner?
--    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 ", DTS=%" PRId64 "\n", frame->pts, frame->pkt_dts);
-+    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
-     return 0;
- }
- 
-@@ -496,6 +554,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     if (ret < 0)
-         return ret;
- 
-+    pts_stats_init(&s->pts_stat, avctx, "decoder");
-+
-     capture = &s->capture;
-     output = &s->output;
- 
-
-From ee8be1e900f98212b6c4940980cc7a80becfc07c Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 17 Nov 2021 17:59:27 +0000
-Subject: [PATCH 029/136] v4l2_m2m_dec: Try harder to get an initial frame
-
-If the input Q is full then wait on a short timeout for a capture frame
-rather than stuffing yet still another frame into the input if we could
-do that first. This attempts to restrict the sometimes daft initial
-buffering that ends up confusing the rest of the system.
----
- libavcodec/v4l2_context.c | 2 +-
- libavcodec/v4l2_m2m_dec.c | 2 +-
- 2 files changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index eb901e8fab..ee5dc7b8d4 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -381,7 +381,7 @@ static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
- start:
-     if (is_capture) {
-         /* no need to listen to requests for more input while draining */
--        if (ctx_to_m2mctx(ctx)->draining)
-+        if (ctx_to_m2mctx(ctx)->draining || timeout > 0)
-             pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
-     } else {
-         pfd.events =  POLLOUT | POLLWRNORM;
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index a5a2afbd27..b49f470c0a 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -442,7 +442,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                 // when discarding
-                 // This returns AVERROR(EAGAIN) if there isn't a frame ready yet
-                 // but there is room in the input Q
--                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, -1, 1);
-+                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1);
- 
-                 if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-                     av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-
-From 72da14331c2160a12b69d666d493e0e74c5e8914 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 17 Nov 2021 18:04:56 +0000
-Subject: [PATCH 030/136] Add a V4L2 M2M deinterlace filter
-
-Add a V4L2 deinterlace filter that will accept DRMPRIME frames.
-
-Multiple people have contributed to this:
-Jernej Skrabec <jernej.skrabec@siol.net>
-Alex Bee <knaerzche@gmail.com>
-popcornmix <popcornmix@gmail.com>
-John Cox <jc@kynesim.co.uk>
-
-There is an unknown delay through the filter of typically one or three
-fields which translates to 1 or 2 frames. Frames that are delayed are
-lost at end of stream as the V4L2 filter has no flush control.
----
- libavcodec/v4l2_context.c            |    4 +-
- libavfilter/Makefile                 |    1 +
- libavfilter/allfilters.c             |    1 +
- libavfilter/vf_deinterlace_v4l2m2m.c | 1269 ++++++++++++++++++++++++++
- 4 files changed, 1273 insertions(+), 2 deletions(-)
- create mode 100644 libavfilter/vf_deinterlace_v4l2m2m.c
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index ee5dc7b8d4..440dfaaba5 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -498,10 +498,10 @@ dequeue:
-             return NULL;
-         }
-         --ctx->q_count;
--        av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d\n",
-+        av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n",
-                ctx->name, buf.index,
-                buf.timestamp.tv_sec, buf.timestamp.tv_usec,
--               ctx->q_count, ++ctx->dq_count);
-+               ctx->q_count, ++ctx->dq_count, buf.field);
- 
-         avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
-         avbuf->status = V4L2BUF_AVAILABLE;
-diff --git a/libavfilter/Makefile b/libavfilter/Makefile
-index c14fc995a0..0e7b5856bd 100644
---- a/libavfilter/Makefile
-+++ b/libavfilter/Makefile
-@@ -262,6 +262,7 @@ OBJS-$(CONFIG_DEFLATE_FILTER)                += vf_neighbor.o
- OBJS-$(CONFIG_DEFLICKER_FILTER)              += vf_deflicker.o
- OBJS-$(CONFIG_DEINTERLACE_QSV_FILTER)        += vf_vpp_qsv.o
- OBJS-$(CONFIG_DEINTERLACE_VAAPI_FILTER)      += vf_deinterlace_vaapi.o vaapi_vpp.o
-+OBJS-$(CONFIG_DEINTERLACE_V4L2M2M_FILTER)    += vf_deinterlace_v4l2m2m.o
- OBJS-$(CONFIG_DEJUDDER_FILTER)               += vf_dejudder.o
- OBJS-$(CONFIG_DELOGO_FILTER)                 += vf_delogo.o
- OBJS-$(CONFIG_DENOISE_VAAPI_FILTER)          += vf_misc_vaapi.o vaapi_vpp.o
-diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
-index b990a00152..357ff61ca8 100644
---- a/libavfilter/allfilters.c
-+++ b/libavfilter/allfilters.c
-@@ -248,6 +248,7 @@ extern const AVFilter ff_vf_derain;
- extern const AVFilter ff_vf_deshake;
- extern const AVFilter ff_vf_deshake_opencl;
- extern const AVFilter ff_vf_despill;
-+extern const AVFilter ff_vf_deinterlace_v4l2m2m;
- extern const AVFilter ff_vf_detelecine;
- extern const AVFilter ff_vf_dilation;
- extern const AVFilter ff_vf_dilation_opencl;
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-new file mode 100644
-index 0000000000..1a933b7e0a
---- /dev/null
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -0,0 +1,1269 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * deinterlace video filter - V4L2 M2M
-+ */
-+
-+#include <drm_fourcc.h>
-+
-+#include <linux/videodev2.h>
-+
-+#include <dirent.h>
-+#include <fcntl.h>
-+#include <poll.h>
-+#include <stdatomic.h>
-+#include <stdio.h>
-+#include <string.h>
-+#include <sys/ioctl.h>
-+#include <sys/mman.h>
-+#include <unistd.h>
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/avstring.h"
 +#include "libavutil/common.h"
-+#include "libavutil/hwcontext.h"
-+#include "libavutil/hwcontext_drm.h"
 +#include "libavutil/internal.h"
-+#include "libavutil/mathematics.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/time.h"
-+
-+#define FF_INTERNAL_FIELDS 1
-+#include "framequeue.h"
-+#include "filters.h"
-+#include "avfilter.h"
-+#include "formats.h"
-+#include "internal.h"
-+#include "video.h"
-+
-+typedef struct V4L2Queue V4L2Queue;
-+typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
-+
-+typedef struct V4L2PlaneInfo {
-+    int bytesperline;
-+    size_t length;
-+} V4L2PlaneInfo;
-+
-+typedef struct V4L2Buffer {
-+    int enqueued;
-+    int reenqueue;
-+    int fd;
-+    struct v4l2_buffer buffer;
-+    AVFrame frame;
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
-+    int num_planes;
-+    V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
-+    AVDRMFrameDescriptor drm_frame;
-+    V4L2Queue *q;
-+} V4L2Buffer;
-+
-+typedef struct V4L2Queue {
-+    struct v4l2_format format;
-+    int num_buffers;
-+    V4L2Buffer *buffers;
-+    DeintV4L2M2MContextShared *ctx;
-+} V4L2Queue;
-+
-+typedef struct pts_stats_s
-+{
-+    void * logctx;
-+    const char * name;  // For debug
-+    unsigned int last_count;
-+    unsigned int last_interval;
-+    int64_t last_pts;
-+} pts_stats_t;
-+
-+#define PTS_TRACK_SIZE 32
-+typedef struct pts_track_el_s
-+{
-+    uint32_t n;
-+    unsigned int interval;
-+    AVFrame * props;
-+} pts_track_el_t;
-+
-+typedef struct pts_track_s
-+{
-+    uint32_t n;
-+    uint32_t last_n;
-+    int got_2;
-+    void * logctx;
-+    pts_stats_t stats;
-+    pts_track_el_t a[PTS_TRACK_SIZE];
-+} pts_track_t;
-+
-+typedef struct DeintV4L2M2MContextShared {
-+    void * logctx;  // For logging - will be NULL when done
-+
-+    int fd;
-+    int done;
-+    int width;
-+    int height;
-+    int orig_width;
-+    int orig_height;
-+    atomic_uint refcount;
-+
-+    AVBufferRef *hw_frames_ctx;
-+
-+    unsigned int field_order;
-+
-+    pts_track_t track;
-+
-+    V4L2Queue output;
-+    V4L2Queue capture;
-+} DeintV4L2M2MContextShared;
-+
-+typedef struct DeintV4L2M2MContext {
-+    const AVClass *class;
-+
-+    DeintV4L2M2MContextShared *shared;
-+} DeintV4L2M2MContext;
-+
-+static unsigned int pts_stats_interval(const pts_stats_t * const stats)
-+{
-+    return stats->last_interval;
-+}
-+
-+// Pick 64 for max last count - that is >1sec at 60fps
-+#define STATS_LAST_COUNT_MAX 64
-+#define STATS_INTERVAL_MAX (1 << 30)
-+static void pts_stats_add(pts_stats_t * const stats, int64_t pts)
-+{
-+    if (pts == AV_NOPTS_VALUE || pts == stats->last_pts) {
-+        if (stats->last_count < STATS_LAST_COUNT_MAX)
-+            ++stats->last_count;
-+        return;
-+    }
-+
-+    if (stats->last_pts != AV_NOPTS_VALUE) {
-+        const int64_t interval = pts - stats->last_pts;
-+
-+        if (interval < 0 || interval >= STATS_INTERVAL_MAX ||
-+            stats->last_count >= STATS_LAST_COUNT_MAX) {
-+            if (stats->last_interval != 0)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: Bad interval: %" PRId64 "/%d\n",
-+                       __func__, stats->name, interval, stats->last_count);
-+            stats->last_interval = 0;
-+        }
-+        else {
-+            const int64_t frame_time = interval / (int64_t)stats->last_count;
-+
-+            if (frame_time != stats->last_interval)
-+                av_log(stats->logctx, AV_LOG_DEBUG, "%s: %s: New interval: %u->%" PRId64 "/%d=%" PRId64 "\n",
-+                       __func__, stats->name, stats->last_interval, interval, stats->last_count, frame_time);
-+            stats->last_interval = frame_time;
-+        }
-+    }
-+
-+    stats->last_pts = pts;
-+    stats->last_count = 1;
-+}
-+
-+static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char * name)
-+{
-+    *stats = (pts_stats_t){
-+        .logctx = logctx,
-+        .name = name,
-+        .last_count = 1,
-+        .last_interval = 0,
-+        .last_pts = AV_NOPTS_VALUE
-+    };
-+}
-+
-+static inline uint32_t pts_track_next_n(pts_track_t * const trk)
-+{
-+    if (++trk->n == 0)
-+        trk->n = 1;
-+    return trk->n;
-+}
-+
-+static int pts_track_get_frame(pts_track_t * const trk, const struct timeval tv, AVFrame * const dst)
-+{
-+    uint32_t n = (uint32_t)(tv.tv_usec / 2 + tv.tv_sec * 500000);
-+    pts_track_el_t * t;
-+
-+    // As a first guess assume that n==0 means last frame
-+    if (n == 0) {
-+        n = trk->last_n;
-+        if (n == 0)
-+            goto fail;
-+    }
-+
-+    t = trk->a + (n & (PTS_TRACK_SIZE - 1));
-+
-+    if (t->n != n) {
-+        av_log(trk->logctx, AV_LOG_ERROR, "%s: track failure: got %u, expected %u\n", __func__, n, trk->n);
-+        goto fail;
-+    }
-+
-+    // 1st frame is simple - just believe it
-+    if (n != trk->last_n) {
-+        trk->last_n = n;
-+        trk->got_2 = 0;
-+        return av_frame_copy_props(dst, t->props);
-+    }
-+
-+    // Only believe in a single interpolated frame
-+    if (trk->got_2)
-+        goto fail;
-+    trk->got_2 = 1;
-+
-+    av_frame_copy_props(dst, t->props);
-+
-+
-+    // If we can't guess - don't
-+    if (t->interval == 0) {
-+        dst->best_effort_timestamp = AV_NOPTS_VALUE;
-+        dst->pts = AV_NOPTS_VALUE;
-+        dst->pkt_dts = AV_NOPTS_VALUE;
-+    }
-+    else {
-+        if (dst->best_effort_timestamp != AV_NOPTS_VALUE)
-+            dst->best_effort_timestamp += t->interval / 2;
-+        if (dst->pts != AV_NOPTS_VALUE)
-+            dst->pts += t->interval / 2;
-+        if (dst->pkt_dts != AV_NOPTS_VALUE)
-+            dst->pkt_dts += t->interval / 2;
-+    }
-+
-+    return 0;
-+
-+fail:
-+    trk->last_n = 0;
-+    trk->got_2 = 0;
-+    dst->pts = AV_NOPTS_VALUE;
-+    dst->pkt_dts = AV_NOPTS_VALUE;
-+    return 0;
-+}
-+
-+static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
-+{
-+    const uint32_t n = pts_track_next_n(trk);
-+    pts_track_el_t * const t = trk->a + (n & (PTS_TRACK_SIZE - 1));
-+
-+    pts_stats_add(&trk->stats, src->pts);
-+
-+    t->n = n;
-+    t->interval = pts_stats_interval(&trk->stats); // guess that next interval is the same as the last
-+    av_frame_unref(t->props);
-+    av_frame_copy_props(t->props, src);
-+
-+    // We now know what the previous interval was, rather than having to guess,
-+    // so set it.  There is a better than decent chance that this is before
-+    // we use it.
-+    if (t->interval != 0) {
-+        pts_track_el_t * const prev_t = trk->a + ((n - 1) & (PTS_TRACK_SIZE - 1));
-+        prev_t->interval = t->interval;
-+    }
-+
-+    // In case deinterlace interpolates frames use every other usec
-+    return (struct timeval){.tv_sec = n / 500000, .tv_usec = (n % 500000) * 2};
-+}
-+
-+static void pts_track_uninit(pts_track_t * const trk)
-+{
-+    unsigned int i;
-+    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
-+        trk->a[i].n = 0;
-+        av_frame_free(&trk->a[i].props);
-+    }
-+}
-+
-+static int pts_track_init(pts_track_t * const trk, void *logctx)
-+{
-+    unsigned int i;
-+    trk->n = 1;
-+    pts_stats_init(&trk->stats, logctx, "track");
-+    for (i = 0; i != PTS_TRACK_SIZE; ++i) {
-+        trk->a[i].n = 0;
-+        if ((trk->a[i].props = av_frame_alloc()) == NULL) {
-+            pts_track_uninit(trk);
-+            return AVERROR(ENOMEM);
-+        }
-+    }
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
-+{
-+    struct v4l2_capability cap;
-+    int ret;
-+
-+    memset(&cap, 0, sizeof(cap));
-+    ret = ioctl(ctx->fd, VIDIOC_QUERYCAP, &cap);
-+    if (ret < 0)
-+        return ret;
-+
-+    if (!(cap.capabilities & V4L2_CAP_STREAMING))
-+        return AVERROR(EINVAL);
-+
-+    if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
-+        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-+        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
-+
-+        return 0;
-+    }
-+
-+    if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
-+        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
-+        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
-+
-+        return 0;
-+    }
-+
-+    return AVERROR(EINVAL);
-+}
-+
-+static int deint_v4l2m2m_try_format(V4L2Queue *queue)
-+{
-+    struct v4l2_format *fmt        = &queue->format;
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    int ret, field;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
-+
-+    if (V4L2_TYPE_IS_OUTPUT(fmt->type))
-+        field = V4L2_FIELD_INTERLACED_TB;
-+    else
-+        field = V4L2_FIELD_NONE;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
-+        fmt->fmt.pix_mp.field = field;
-+        fmt->fmt.pix_mp.width = ctx->width;
-+        fmt->fmt.pix_mp.height = ctx->height;
-+    } else {
-+        fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
-+        fmt->fmt.pix.field = field;
-+        fmt->fmt.pix.width = ctx->width;
-+        fmt->fmt.pix.height = ctx->height;
-+    }
-+
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
-+		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
-+		 fmt->fmt.pix_mp.pixelformat,
-+		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
-+
-+    ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt);
-+    if (ret)
-+        return AVERROR(EINVAL);
-+
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
-+		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
-+		 fmt->fmt.pix_mp.pixelformat,
-+		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 ||
-+            fmt->fmt.pix_mp.field != field) {
-+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
-+
-+            return AVERROR(EINVAL);
-+        }
-+    } else {
-+        if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 ||
-+            fmt->fmt.pix.field != field) {
-+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
-+
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize)
-+{
-+    struct v4l2_format *fmt        = &queue->format;
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    int ret;
-+
-+    struct v4l2_selection sel = {
-+        .type = fmt->type,
-+        .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
-+    };
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.field = field;
-+        fmt->fmt.pix_mp.width = width;
-+        fmt->fmt.pix_mp.height = ysize / pitch;
-+        fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
-+        fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
-+    } else {
-+        fmt->fmt.pix.field = field;
-+        fmt->fmt.pix.width = width;
-+        fmt->fmt.pix.height = height;
-+        fmt->fmt.pix.sizeimage = 0;
-+        fmt->fmt.pix.bytesperline = 0;
-+    }
-+
-+    ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
-+
-+    ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret);
-+
-+    sel.r.width = width;
-+    sel.r.height = height;
-+    sel.r.left = 0;
-+    sel.r.top = 0;
-+    sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
-+    sel.flags = V4L2_SEL_FLAG_LE;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
-+    if (ret)
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret);
-+
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
-+{
-+    int ret;
-+
-+    ctx->fd = open(node, O_RDWR | O_NONBLOCK, 0);
-+    if (ctx->fd < 0)
-+        return AVERROR(errno);
-+
-+    ret = deint_v4l2m2m_prepare_context(ctx);
-+    if (ret)
-+        goto fail;
-+
-+    ret = deint_v4l2m2m_try_format(&ctx->capture);
-+    if (ret)
-+        goto fail;
-+
-+    ret = deint_v4l2m2m_try_format(&ctx->output);
-+    if (ret)
-+        goto fail;
-+
-+    return 0;
-+
-+fail:
-+    close(ctx->fd);
-+    ctx->fd = -1;
-+
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_find_device(DeintV4L2M2MContextShared *ctx)
-+{
-+    int ret = AVERROR(EINVAL);
-+    struct dirent *entry;
-+    char node[PATH_MAX];
-+    DIR *dirp;
-+
-+    dirp = opendir("/dev");
-+    if (!dirp)
-+        return AVERROR(errno);
-+
-+    for (entry = readdir(dirp); entry; entry = readdir(dirp)) {
-+
-+        if (strncmp(entry->d_name, "video", 5))
-+            continue;
-+
-+        snprintf(node, sizeof(node), "/dev/%s", entry->d_name);
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "probing device %s\n", node);
-+        ret = deint_v4l2m2m_probe_device(ctx, node);
-+        if (!ret)
-+            break;
-+    }
-+
-+    closedir(dirp);
-+
-+    if (ret) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "Could not find a valid device\n");
-+        ctx->fd = -1;
-+
-+        return ret;
-+    }
-+
-+    av_log(ctx->logctx, AV_LOG_INFO, "Using device %s\n", node);
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
-+{
-+    int ret;
-+
-+    ret = ioctl(buf->q->ctx->fd, VIDIOC_QBUF, &buf->buffer);
-+    if (ret < 0)
-+        return AVERROR(errno);
-+
-+    buf->enqueued = 1;
-+
-+    return 0;
-+}
-+
-+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
-+{
-+    struct v4l2_exportbuffer expbuf;
-+    int i, ret;
-+
-+    for (i = 0; i < avbuf->num_planes; i++) {
-+        memset(&expbuf, 0, sizeof(expbuf));
-+
-+        expbuf.index = avbuf->buffer.index;
-+        expbuf.type = avbuf->buffer.type;
-+        expbuf.plane = i;
-+
-+        ret = ioctl(avbuf->q->ctx->fd, VIDIOC_EXPBUF, &expbuf);
-+        if (ret < 0)
-+            return AVERROR(errno);
-+
-+        avbuf->fd = expbuf.fd;
-+
-+        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
-+            avbuf->drm_frame.objects[i].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        } else {
-+            /* drm frame */
-+            avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
-+            avbuf->drm_frame.objects[0].fd = expbuf.fd;
-+            avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-+{
-+    struct v4l2_format *fmt = &queue->format;
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    struct v4l2_requestbuffers req;
-+    int ret, i, j, multiplanar;
-+    uint32_t memory;
-+
-+    memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
-+        V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
-+
-+    multiplanar = V4L2_TYPE_IS_MULTIPLANAR(fmt->type);
-+
-+    memset(&req, 0, sizeof(req));
-+    req.count = queue->num_buffers;
-+    req.memory = memory;
-+    req.type = fmt->type;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_REQBUFS, &req);
-+    if (ret < 0) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_REQBUFS failed: %s\n", strerror(errno));
-+
-+        return AVERROR(errno);
-+    }
-+
-+    queue->num_buffers = req.count;
-+    queue->buffers = av_mallocz(queue->num_buffers * sizeof(V4L2Buffer));
-+    if (!queue->buffers) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "malloc enomem\n");
-+
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    for (i = 0; i < queue->num_buffers; i++) {
-+        V4L2Buffer *buf = &queue->buffers[i];
-+
-+        buf->enqueued = 0;
-+        buf->fd = -1;
-+        buf->q = queue;
-+
-+        buf->buffer.type = fmt->type;
-+        buf->buffer.memory = memory;
-+        buf->buffer.index = i;
-+
-+        if (multiplanar) {
-+            buf->buffer.length = VIDEO_MAX_PLANES;
-+            buf->buffer.m.planes = buf->planes;
-+        }
-+
-+        ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
-+        if (ret < 0) {
-+            ret = AVERROR(errno);
-+
-+            goto fail;
-+        }
-+
-+        if (multiplanar)
-+            buf->num_planes = buf->buffer.length;
-+        else
-+            buf->num_planes = 1;
-+
-+        for (j = 0; j < buf->num_planes; j++) {
-+            V4L2PlaneInfo *info = &buf->plane_info[j];
-+
-+            if (multiplanar) {
-+                info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
-+                info->length = buf->buffer.m.planes[j].length;
-+            } else {
-+                info->bytesperline = fmt->fmt.pix.bytesperline;
-+                info->length = buf->buffer.length;
-+            }
-+        }
-+
-+        if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
-+            ret = deint_v4l2m2m_enqueue_buffer(buf);
-+            if (ret)
-+                goto fail;
-+
-+            ret = v4l2_buffer_export_drm(buf);
-+            if (ret)
-+                goto fail;
-+        }
-+    }
-+
-+    return 0;
-+
-+fail:
-+    for (i = 0; i < queue->num_buffers; i++)
-+        if (queue->buffers[i].fd >= 0)
-+            close(queue->buffers[i].fd);
-+    av_free(queue->buffers);
-+    queue->buffers = NULL;
-+
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_streamon(V4L2Queue *queue)
-+{
-+    DeintV4L2M2MContextShared * const ctx = queue->ctx;
-+    int type = queue->format.type;
-+    int ret;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_STREAMON, &type);
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
-+    if (ret < 0)
-+        return AVERROR(errno);
-+
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_streamoff(V4L2Queue *queue)
-+{
-+    DeintV4L2M2MContextShared * const ctx = queue->ctx;
-+    int type = queue->format.type;
-+    int ret;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_STREAMOFF, &type);
-+    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: type:%d ret:%d errno:%d\n", __func__, type, ret, AVERROR(errno));
-+    if (ret < 0)
-+        return AVERROR(errno);
-+
-+    return 0;
-+}
-+
-+// timeout in ms
-+static V4L2Buffer* deint_v4l2m2m_dequeue_buffer(V4L2Queue *queue, int timeout)
-+{
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    struct v4l2_buffer buf = { 0 };
-+    V4L2Buffer* avbuf = NULL;
-+    struct pollfd pfd;
-+    short events;
-+    int ret;
-+
-+    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
-+        events =  POLLOUT | POLLWRNORM;
-+    else
-+        events = POLLIN | POLLRDNORM;
-+
-+    pfd.events = events;
-+    pfd.fd = ctx->fd;
-+
-+    for (;;) {
-+        ret = poll(&pfd, 1, timeout);
-+        if (ret > 0)
-+            break;
-+        if (errno == EINTR)
-+            continue;
-+        return NULL;
-+    }
-+
-+    if (pfd.revents & POLLERR)
-+        return NULL;
-+
-+    if (pfd.revents & events) {
-+        memset(&buf, 0, sizeof(buf));
-+        buf.memory = V4L2_MEMORY_MMAP;
-+        buf.type = queue->format.type;
-+        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
-+            memset(planes, 0, sizeof(planes));
-+            buf.length = VIDEO_MAX_PLANES;
-+            buf.m.planes = planes;
-+        }
-+
-+        ret = ioctl(ctx->fd, VIDIOC_DQBUF, &buf);
-+        if (ret) {
-+            if (errno != EAGAIN)
-+                av_log(ctx->logctx, AV_LOG_DEBUG, "VIDIOC_DQBUF, errno (%s)\n",
-+                       av_err2str(AVERROR(errno)));
-+            return NULL;
-+        }
-+
-+        avbuf = &queue->buffers[buf.index];
-+        avbuf->enqueued = 0;
-+        avbuf->buffer = buf;
-+        if (V4L2_TYPE_IS_MULTIPLANAR(queue->format.type)) {
-+            memcpy(avbuf->planes, planes, sizeof(planes));
-+            avbuf->buffer.m.planes = avbuf->planes;
-+        }
-+        return avbuf;
-+    }
-+
-+    return NULL;
-+}
-+
-+static V4L2Buffer *deint_v4l2m2m_find_free_buf(V4L2Queue *queue)
-+{
-+    int i;
-+    V4L2Buffer *buf = NULL;
-+
-+    for (i = 0; i < queue->num_buffers; i++)
-+        if (!queue->buffers[i].enqueued) {
-+            buf = &queue->buffers[i];
-+            break;
-+        }
-+    return buf;
-+}
-+
-+static void deint_v4l2m2m_unref_queued(V4L2Queue *queue)
-+{
-+    int i;
-+    V4L2Buffer *buf = NULL;
-+
-+    if (!queue || !queue->buffers)
-+        return;
-+    for (i = 0; i < queue->num_buffers; i++) {
-+        buf = &queue->buffers[i];
-+        if (queue->buffers[i].enqueued)
-+            av_frame_unref(&buf->frame);
-+    }
-+}
-+
-+static void recycle_q(V4L2Queue * const queue)
-+{
-+    V4L2Buffer* avbuf;
-+    while (avbuf = deint_v4l2m2m_dequeue_buffer(queue, 0), avbuf) {
-+        av_frame_unref(&avbuf->frame);
-+    }
-+}
-+
-+static int count_enqueued(V4L2Queue *queue)
-+{
-+    int i;
-+    int n = 0;
-+
-+    if (queue->buffers == NULL)
-+        return 0;
-+
-+    for (i = 0; i < queue->num_buffers; i++)
-+        if (queue->buffers[i].enqueued)
-+            ++n;
-+    return n;
-+}
-+
-+static int deint_v4l2m2m_enqueue_frame(V4L2Queue * const queue, AVFrame * const frame)
-+{
-+    DeintV4L2M2MContextShared *const ctx = queue->ctx;
-+    AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)frame->data[0];
-+    V4L2Buffer *buf;
-+    int i;
-+
-+    if (V4L2_TYPE_IS_OUTPUT(queue->format.type))
-+        recycle_q(queue);
-+
-+    buf = deint_v4l2m2m_find_free_buf(queue);
-+    if (!buf) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d finding free buf\n", __func__, 0);
-+        return AVERROR(EAGAIN);
-+    }
-+    if (V4L2_TYPE_IS_MULTIPLANAR(buf->buffer.type))
-+        for (i = 0; i < drm_desc->nb_objects; i++)
-+            buf->buffer.m.planes[i].m.fd = drm_desc->objects[i].fd;
-+    else
-+        buf->buffer.m.fd = drm_desc->objects[0].fd;
-+
-+    buf->buffer.field = !frame->interlaced_frame ? V4L2_FIELD_NONE :
-+        frame->top_field_first ? V4L2_FIELD_INTERLACED_TB :
-+            V4L2_FIELD_INTERLACED_BT;
-+
-+    if (ctx->field_order != buf->buffer.field) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Field changed: %d->%d\n", __func__, ctx->field_order, buf->buffer.field);
-+        ctx->field_order = buf->buffer.field;
-+    }
-+
-+    buf->buffer.timestamp = pts_track_add_frame(&ctx->track, frame);
-+
-+    buf->drm_frame.objects[0].fd = drm_desc->objects[0].fd;
-+
-+    av_frame_move_ref(&buf->frame, frame);
-+
-+    return deint_v4l2m2m_enqueue_buffer(buf);
-+}
-+
-+static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
-+{
-+    if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
-+        V4L2Queue *capture = &ctx->capture;
-+        V4L2Queue *output  = &ctx->output;
-+        int i;
-+
-+        av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
-+
-+        if (ctx->fd >= 0) {
-+            deint_v4l2m2m_streamoff(capture);
-+            deint_v4l2m2m_streamoff(output);
-+        }
-+
-+        if (capture->buffers)
-+            for (i = 0; i < capture->num_buffers; i++) {
-+                capture->buffers[i].q = NULL;
-+                if (capture->buffers[i].fd >= 0)
-+                    close(capture->buffers[i].fd);
-+            }
-+
-+        deint_v4l2m2m_unref_queued(output);
-+
-+        av_buffer_unref(&ctx->hw_frames_ctx);
-+
-+        if (capture->buffers)
-+            av_free(capture->buffers);
-+
-+        if (output->buffers)
-+            av_free(output->buffers);
-+
-+        if (ctx->fd >= 0) {
-+            close(ctx->fd);
-+            ctx->fd = -1;
-+        }
-+
-+        av_free(ctx);
-+    }
-+}
-+
-+static void v4l2_free_buffer(void *opaque, uint8_t *unused)
-+{
-+    V4L2Buffer *buf                = opaque;
-+    DeintV4L2M2MContextShared *ctx = buf->q->ctx;
-+
-+    if (!ctx->done)
-+        deint_v4l2m2m_enqueue_buffer(buf);
-+
-+    deint_v4l2m2m_destroy_context(ctx);
-+}
-+
-+static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
-+{
-+    int av_pix_fmt = AV_PIX_FMT_YUV420P;
-+    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
-+    AVDRMLayerDescriptor *layer;
-+
-+    /* fill the DRM frame descriptor */
-+    drm_desc->nb_objects = avbuf->num_planes;
-+    drm_desc->nb_layers = 1;
-+
-+    layer = &drm_desc->layers[0];
-+    layer->nb_planes = avbuf->num_planes;
-+
-+    for (int i = 0; i < avbuf->num_planes; i++) {
-+        layer->planes[i].object_index = i;
-+        layer->planes[i].offset = 0;
-+        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
-+    }
-+
-+    switch (av_pix_fmt) {
-+    case AV_PIX_FMT_YUYV422:
-+
-+        layer->format = DRM_FORMAT_YUYV;
-+        layer->nb_planes = 1;
-+
-+        break;
-+
-+    case AV_PIX_FMT_NV12:
-+    case AV_PIX_FMT_NV21:
-+
-+        layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ?
-+            DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 2;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
-+        break;
-+
-+    case AV_PIX_FMT_YUV420P:
-+
-+        layer->format = DRM_FORMAT_YUV420;
-+
-+        if (avbuf->num_planes > 1)
-+            break;
-+
-+        layer->nb_planes = 3;
-+
-+        layer->planes[1].object_index = 0;
-+        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
-+            height;
-+        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+
-+        layer->planes[2].object_index = 0;
-+        layer->planes[2].offset = layer->planes[1].offset +
-+            ((avbuf->plane_info[0].bytesperline *
-+              height) >> 2);
-+        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
-+        break;
-+
-+    default:
-+        drm_desc->nb_layers = 0;
-+        break;
-+    }
-+
-+    return (uint8_t *) drm_desc;
-+}
-+
-+// timeout in ms
-+static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
-+{
-+    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    V4L2Buffer* avbuf;
-+
-+    av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+
-+    avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
-+    if (!avbuf) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
-+        return AVERROR(EAGAIN);
-+    }
-+
-+    // Fill in PTS and anciliary info from src frame
-+    // we will want to overwrite some fields as only the pts/dts
-+    // fields are updated with new timing in this fn
-+    pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
-+
-+    frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
-+                            sizeof(avbuf->drm_frame), v4l2_free_buffer,
-+                            avbuf, AV_BUFFER_FLAG_READONLY);
-+    if (!frame->buf[0]) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "%s: error %d creating buffer\n", __func__, 0);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    atomic_fetch_add(&ctx->refcount, 1);
-+
-+    frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
-+    frame->format = AV_PIX_FMT_DRM_PRIME;
-+    if (ctx->hw_frames_ctx)
-+        frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
-+    frame->height = ctx->height;
-+    frame->width = ctx->width;
-+
-+    // Not interlaced now
-+    frame->interlaced_frame = 0;
-+    frame->top_field_first = 0;
-+    // Pkt duration halved
-+    frame->pkt_duration /= 2;
-+
-+    if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
-+        frame->decode_error_flags |= FF_DECODE_ERROR_INVALID_BITSTREAM;
-+    }
-+
-+    av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: PTS=%"PRId64"\n", __func__, frame->pts);
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
-+{
-+    AVFilterLink *inlink           = outlink->src->inputs[0];
-+    AVFilterContext *avctx         = outlink->src;
-+    DeintV4L2M2MContext *priv      = avctx->priv;
-+    DeintV4L2M2MContextShared *ctx = priv->shared;
-+    int ret;
-+
-+    ctx->height = avctx->inputs[0]->h;
-+    ctx->width = avctx->inputs[0]->w;
-+
-+    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
-+
-+    outlink->time_base           = inlink->time_base;
-+    outlink->w                   = inlink->w;
-+    outlink->h                   = inlink->h;
-+    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
-+    outlink->format              = inlink->format;
-+    outlink->frame_rate = (AVRational) {1, 0};  // Deny knowledge of frame rate
-+
-+    ret = deint_v4l2m2m_find_device(ctx);
-+    if (ret)
-+        return ret;
-+
-+    if (inlink->hw_frames_ctx) {
-+        ctx->hw_frames_ctx = av_buffer_ref(inlink->hw_frames_ctx);
-+        if (!ctx->hw_frames_ctx)
-+            return AVERROR(ENOMEM);
-+    }
-+    return 0;
-+}
-+
-+static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-+{
-+    AVFilterContext *avctx         = link->dst;
-+    DeintV4L2M2MContext *priv      = avctx->priv;
-+    DeintV4L2M2MContextShared *ctx = priv->shared;
-+    V4L2Queue *capture             = &ctx->capture;
-+    V4L2Queue *output              = &ctx->output;
-+    int ret;
-+
-+    av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
-+          __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
-+    av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
-+           avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
-+
-+    if (ctx->field_order == V4L2_FIELD_ANY) {
-+        AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0];
-+        ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
-+        ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
-+
-+        av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
-+           drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
-+
-+        if (in->top_field_first)
-+            ctx->field_order = V4L2_FIELD_INTERLACED_TB;
-+        else
-+            ctx->field_order = V4L2_FIELD_INTERLACED_BT;
-+
-+        ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_allocate_buffers(capture);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_streamon(capture);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_allocate_buffers(output);
-+        if (ret)
-+            return ret;
-+
-+        ret = deint_v4l2m2m_streamon(output);
-+        if (ret)
-+            return ret;
-+    }
-+
-+    ret = deint_v4l2m2m_enqueue_frame(output, in);
-+
-+    av_log(priv, AV_LOG_TRACE, ">>> %s: %s\n", __func__, av_err2str(ret));
-+    return ret;
-+}
-+
-+static int deint_v4l2m2m_activate(AVFilterContext *avctx)
-+{
-+    DeintV4L2M2MContext * const priv = avctx->priv;
-+    DeintV4L2M2MContextShared *const s = priv->shared;
-+    AVFilterLink * const outlink = avctx->outputs[0];
-+    AVFilterLink * const inlink = avctx->inputs[0];
-+    int n = 0;
-+    int cn = 99;
-+    int instatus = 0;
-+    int64_t inpts = 0;
-+    int did_something = 0;
-+
-+    av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
-+
-+    FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
-+
-+    ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
-+
-+    if (!ff_outlink_frame_wanted(outlink)) {
-+        av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
-+    }
-+    else if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
-+    {
-+        AVFrame * frame = av_frame_alloc();
-+        int rv;
-+
-+again:
-+        recycle_q(&s->output);
-+        n = count_enqueued(&s->output);
-+
-+        if (frame == NULL) {
-+            av_log(priv, AV_LOG_ERROR, "%s: error allocating frame\n", __func__);
-+            return AVERROR(ENOMEM);
-+        }
-+
-+        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
-+        if (rv != 0) {
-+            av_frame_free(&frame);
-+            if (rv != AVERROR(EAGAIN)) {
-+                av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
-+                return rv;
-+            }
-+        }
-+        else {
-+            frame->interlaced_frame = 0;
-+            // frame is always consumed by filter_frame - even on error despite
-+            // a somewhat confusing comment in the header
-+            rv = ff_filter_frame(outlink, frame);
-+
-+            if (instatus != 0) {
-+                av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
-+                goto again;
-+            }
-+
-+            av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
-+            did_something = 1;
-+        }
-+
-+        cn = count_enqueued(&s->capture);
-+    }
-+
-+    if (instatus != 0) {
-+        ff_outlink_set_status(outlink, instatus, inpts);
-+        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
-+        return 0;
-+    }
-+
-+    {
-+        AVFrame * frame;
-+        int rv;
-+
-+        recycle_q(&s->output);
-+        n = count_enqueued(&s->output);
-+
-+        while (n < 6) {
-+            if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
-+                av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
-+                return rv;
-+            }
-+
-+            if (frame == NULL) {
-+                av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
-+                break;
-+            }
-+
-+            deint_v4l2m2m_filter_frame(inlink, frame);
-+            av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
-+            ++n;
-+        }
-+    }
-+
-+    if (n < 6) {
-+        ff_inlink_request_frame(inlink);
-+        did_something = 1;
-+        av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
-+    }
-+
-+    if (n > 4 && ff_outlink_frame_wanted(outlink)) {
-+        ff_filter_set_ready(avctx, 1);
-+        did_something = 1;
-+        av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
-+    }
-+
-+    av_log(priv, AV_LOG_TRACE, ">>> %s: OK (n=%d, cn=%d)\n", __func__, n, cn);
-+    return did_something ? 0 : FFERROR_NOT_READY;
-+}
-+
-+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
-+{
-+    DeintV4L2M2MContext * const priv = avctx->priv;
-+    DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
-+
-+    if (!ctx) {
-+        av_log(priv, AV_LOG_ERROR, "%s: error %d allocating context\n", __func__, 0);
-+        return AVERROR(ENOMEM);
-+    }
-+    priv->shared = ctx;
-+    ctx->logctx = priv;
-+    ctx->fd = -1;
-+    ctx->output.ctx = ctx;
-+    ctx->output.num_buffers = 8;
-+    ctx->capture.ctx = ctx;
-+    ctx->capture.num_buffers = 12;
-+    ctx->done = 0;
-+    ctx->field_order = V4L2_FIELD_ANY;
-+
-+    pts_track_init(&ctx->track, priv);
-+
-+    atomic_init(&ctx->refcount, 1);
-+
-+    return 0;
-+}
-+
-+static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
-+{
-+    DeintV4L2M2MContext *priv = avctx->priv;
-+    DeintV4L2M2MContextShared *ctx = priv->shared;
-+
-+    ctx->done = 1;
-+    ctx->logctx = NULL;  // Log to NULL works, log to missing crashes
-+    pts_track_uninit(&ctx->track);
-+    deint_v4l2m2m_destroy_context(ctx);
-+}
-+
-+static const AVOption deinterlace_v4l2m2m_options[] = {
-+    { NULL },
-+};
-+
-+AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
-+
-+static const AVFilterPad deint_v4l2m2m_inputs[] = {
-+    {
-+        .name         = "default",
-+        .type         = AVMEDIA_TYPE_VIDEO,
-+    },
-+};
-+
-+static const AVFilterPad deint_v4l2m2m_outputs[] = {
-+    {
-+        .name          = "default",
-+        .type          = AVMEDIA_TYPE_VIDEO,
-+        .config_props  = deint_v4l2m2m_config_props,
-+    },
-+};
-+
-+AVFilter ff_vf_deinterlace_v4l2m2m = {
-+    .name           = "deinterlace_v4l2m2m",
-+    .description    = NULL_IF_CONFIG_SMALL("V4L2 M2M deinterlacer"),
-+    .priv_size      = sizeof(DeintV4L2M2MContext),
-+    .init           = &deint_v4l2m2m_init,
-+    .uninit         = &deint_v4l2m2m_uninit,
-+    FILTER_INPUTS(deint_v4l2m2m_inputs),
-+    FILTER_OUTPUTS(deint_v4l2m2m_outputs),
-+    FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME),
-+    .priv_class     = &deinterlace_v4l2m2m_class,
-+    .activate       = deint_v4l2m2m_activate,
-+};
-
-From 0fb00e51d1ca40eed22bfc66b7f309fdc56229bc Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 2 Dec 2021 17:49:55 +0000
-Subject: [PATCH 031/136] Put no_pts_rescale in context which makes more sense
- than an arg
-
----
- libavcodec/v4l2_buffers.c | 28 ++++++++++++++--------------
- libavcodec/v4l2_buffers.h |  5 ++---
- libavcodec/v4l2_context.c |  8 ++++----
- libavcodec/v4l2_context.h | 13 +++++++++----
- libavcodec/v4l2_m2m_dec.c |  9 +++++----
- 5 files changed, 34 insertions(+), 29 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 126d2a17f4..22da6bd722 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -39,7 +39,7 @@
- #define USEC_PER_SEC 1000000
- static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
- 
--static inline V4L2m2mContext *buf_to_m2mctx(V4L2Buffer *buf)
-+static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
- {
-     return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
-         container_of(buf->context, V4L2m2mContext, output) :
-@@ -51,34 +51,34 @@ static inline AVCodecContext *logger(V4L2Buffer *buf)
-     return buf_to_m2mctx(buf)->avctx;
- }
- 
--static inline AVRational v4l2_get_timebase(V4L2Buffer *avbuf)
-+static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
- {
--    V4L2m2mContext *s = buf_to_m2mctx(avbuf);
-+    const V4L2m2mContext *s = buf_to_m2mctx(avbuf);
-     const AVRational tb = s->avctx->pkt_timebase.num ?
-         s->avctx->pkt_timebase :
-         s->avctx->time_base;
-     return tb.num && tb.den ? tb : v4l2_timebase;
- }
- 
--static inline void v4l2_set_pts(V4L2Buffer *out, int64_t pts, int no_rescale)
-+static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
- {
-     /* convert pts to v4l2 timebase */
-     const int64_t v4l2_pts =
--        no_rescale ? pts :
-+        out->context->no_pts_rescale ? pts :
-         pts == AV_NOPTS_VALUE ? 0 :
-             av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
-     out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
-     out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
- }
- 
--static inline int64_t v4l2_get_pts(V4L2Buffer *avbuf, int no_rescale)
-+static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
- {
-     /* convert pts back to encoder timebase */
-     const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
-                         avbuf->buf.timestamp.tv_usec;
- 
-     return
--        no_rescale ? v4l2_pts :
-+        avbuf->context->no_pts_rescale ? v4l2_pts :
-         v4l2_pts == 0 ? AV_NOPTS_VALUE :
-             av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
- }
-@@ -686,13 +686,13 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-     v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
-     v4l2_set_color_range(out, frame->color_range);
-     // PTS & interlace are buffer vars
--    v4l2_set_pts(out, frame->pts, 0);
-+    v4l2_set_pts(out, frame->pts);
-     v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
- 
-     return v4l2_buffer_swframe_to_buf(frame, out);
- }
- 
--int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_rescale_pts)
-+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
- {
-     int ret;
-     V4L2Context * const ctx = avbuf->context;
-@@ -710,7 +710,7 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf, int no_resc
-     frame->colorspace = v4l2_get_color_space(avbuf);
-     frame->color_range = v4l2_get_color_range(avbuf);
-     frame->color_trc = v4l2_get_color_trc(avbuf);
--    frame->pts = v4l2_get_pts(avbuf, no_rescale_pts);
-+    frame->pts = v4l2_get_pts(avbuf);
-     frame->pkt_dts = AV_NOPTS_VALUE;
-     frame->interlaced_frame = v4l2_buf_is_interlaced(avbuf);
-     frame->top_field_first = v4l2_buf_is_top_first(avbuf);
-@@ -757,13 +757,13 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
-         pkt->flags |= AV_PKT_FLAG_CORRUPT;
-     }
- 
--    pkt->dts = pkt->pts = v4l2_get_pts(avbuf, 0);
-+    pkt->dts = pkt->pts = v4l2_get_pts(avbuf);
- 
-     return 0;
- }
- 
- int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
--                                    const void *extdata, size_t extlen, int no_rescale_pts)
-+                                    const void *extdata, size_t extlen)
- {
-     int ret;
- 
-@@ -777,7 +777,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-     if (ret && ret != AVERROR(ENOMEM))
-         return ret;
- 
--    v4l2_set_pts(out, pkt->pts, no_rescale_pts);
-+    v4l2_set_pts(out, pkt->pts);
- 
-     if (pkt->flags & AV_PKT_FLAG_KEY)
-         out->flags = V4L2_BUF_FLAG_KEYFRAME;
-@@ -787,7 +787,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
- 
- int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
- {
--    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
-+    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0);
- }
- 
- 
-diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
-index 111526aee3..641e0e147b 100644
---- a/libavcodec/v4l2_buffers.h
-+++ b/libavcodec/v4l2_buffers.h
-@@ -83,12 +83,11 @@ typedef struct V4L2Buffer {
-  *
-  * @param[in] frame The AVFRame to push the information to
-  * @param[in] buf The V4L2Buffer to get the information from
-- * @param[in] no_rescale_pts If non-zero do not rescale PTS
-  *
-  * @returns 0 in case of success, AVERROR(EINVAL) if the number of planes is incorrect,
-  * AVERROR(ENOMEM) if the AVBufferRef can't be created.
-  */
--int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf, int no_rescale_pts);
-+int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *buf);
- 
- /**
-  * Extracts the data from a V4L2Buffer to an AVPacket
-@@ -113,7 +112,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
- int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
- 
- int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
--                                    const void *extdata, size_t extlen, int no_rescale_pts);
-+                                    const void *extdata, size_t extlen);
- 
- /**
-  * Extracts the data from an AVFrame to a V4L2Buffer
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 440dfaaba5..64540a37b3 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -808,7 +808,7 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
- }
- 
- int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
--                                   const void * extdata, size_t extlen, int no_rescale_pts)
-+                                   const void * extdata, size_t extlen)
- {
-     V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-     V4L2Buffer* avbuf;
-@@ -827,7 +827,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-     if (!avbuf)
-         return AVERROR(EAGAIN);
- 
--    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, no_rescale_pts);
-+    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen);
-     if (ret == AVERROR(ENOMEM))
-         av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
-                __func__, pkt->size, avbuf->planes[0].length);
-@@ -837,7 +837,7 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-     return ff_v4l2_buffer_enqueue(avbuf);
- }
- 
--int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout, int no_rescale_pts)
-+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
- {
-     V4L2Buffer *avbuf;
- 
-@@ -854,7 +854,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout,
-         return AVERROR(EAGAIN);
-     }
- 
--    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf, no_rescale_pts);
-+    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
- }
- 
- int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 37b0431400..4cc164886c 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -102,6 +102,13 @@ typedef struct V4L2Context {
-      */
-     int done;
- 
-+    /**
-+     * PTS rescale not wanted
-+     * If the PTS is just a dummy frame count then rescale is
-+     * actively harmful
-+     */
-+    int no_pts_rescale;
-+
-     AVBufferRef *frames_ref;
-     int q_count;
-     int dq_count;
-@@ -172,12 +179,10 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
-  * @param[in] ctx The V4L2Context to dequeue from.
-  * @param[inout] f The AVFrame to dequeue to.
-  * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
-- * @param[in] no_rescale_pts (0 rescale pts, 1 use pts as
-- *       timestamp directly)
-  *
-  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
-  */
--int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int no_rescale_pts);
-+int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
- 
- /**
-  * Enqueues a buffer to a V4L2Context from an AVPacket
-@@ -189,7 +194,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout, int
-  * @param[in] pkt A pointer to an AVPacket.
-  * @return 0 in case of success, a negative error otherwise.
-  */
--int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size, int no_rescale_pts);
-+int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const void * ext_data, size_t ext_size);
- 
- /**
-  * Enqueues a buffer to a V4L2Context from an AVFrame
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index b49f470c0a..36754b314a 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -360,7 +360,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-             if (!s->draining) {
-                 // Calling enqueue with an empty pkt starts drain
-                 av_assert0(s->buf_pkt.size == 0);
--                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0, 1);
-+                ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
-                 if (ret) {
-                     av_log(avctx, AV_LOG_ERROR, "Failed to start drain: ret=%d\n", ret);
-                     return ret;
-@@ -381,8 +381,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-         return ret;
- 
-     ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt,
--                                         avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size,
--                                         1);
-+                                         avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size);
- 
-     if (ret == AVERROR(EAGAIN)) {
-         // Out of input buffers - keep packet
-@@ -442,7 +441,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                 // when discarding
-                 // This returns AVERROR(EAGAIN) if there isn't a frame ready yet
-                 // but there is room in the input Q
--                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1, 1);
-+                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1);
- 
-                 if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-                     av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-@@ -569,10 +568,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     output->av_codec_id = avctx->codec_id;
-     output->av_pix_fmt  = AV_PIX_FMT_NONE;
-     output->min_buf_size = max_coded_size(avctx);
-+    output->no_pts_rescale = 1;
- 
-     capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
-     capture->av_pix_fmt = avctx->pix_fmt;
-     capture->min_buf_size = 0;
-+    capture->no_pts_rescale = 1;
- 
-     /* the client requests the codec to generate DRM frames:
-      *   - data[0] will therefore point to the returned AVDRMFrameDescriptor
-
-From 5e36908e6f2f06b68e85873cbcd421c0973f6409 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 8 Dec 2021 15:00:37 +0000
-Subject: [PATCH 032/136] Use bitbuf min size for all streams
-
----
- libavcodec/v4l2_m2m_dec.c | 5 +----
- 1 file changed, 1 insertion(+), 4 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 36754b314a..48a6810d18 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -507,15 +507,12 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- }
- #endif
- 
-+// This heuristic is for H264 but use for everything
- static uint32_t max_coded_size(const AVCodecContext * const avctx)
- {
-     uint32_t wxh = avctx->coded_width * avctx->coded_height;
-     uint32_t size;
- 
--    // Currently the only thing we try to set our own limits for is H264
--    if (avctx->codec_id != AV_CODEC_ID_H264)
--        return 0;
--
-     size = wxh * 3 / 2;
-     // H.264 Annex A table A-1 gives minCR which is either 2 or 4
-     // unfortunately that doesn't yield an actually useful limit
-
-From 5fcbcd31761eea31dc0157793f558eaaadfe2ac3 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 3 Dec 2021 12:54:18 +0000
-Subject: [PATCH 033/136] Track pending frames in v4l2 stateful
-
-Track which frames are pending decode in the v4l2 stateful decoder.
-This relies on DTS & PTS having some relationship to reality, so
-any use of this code must cope with the results being wrong.
-
-Also moves the xlat state vars out of the main context and into their
-own structure.
----
- libavcodec/v4l2_m2m.h     |  15 ++++--
- libavcodec/v4l2_m2m_dec.c | 100 +++++++++++++++++++++++++++++---------
- 2 files changed, 89 insertions(+), 26 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 82feb0afdb..3f86809623 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -44,8 +44,10 @@
- #define FF_V4L2_M2M_TRACK_SIZE 128
- typedef struct V4L2m2mTrackEl {
-     int     discard;   // If we see this buffer its been flushed, so discard
-+    int     pending;
-     int     pkt_size;
-     int64_t pts;
-+    int64_t dts;
-     int64_t reordered_opaque;
-     int64_t pkt_pos;
-     int64_t pkt_duration;
-@@ -62,6 +64,14 @@ typedef struct pts_stats_s
-     int64_t guess;
- } pts_stats_t;
- 
-+typedef struct xlat_track_s {
-+    unsigned int track_no;
-+    int64_t last_pts;
-+    int64_t last_pkt_dts;
-+    int64_t last_opaque;
-+    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
-+} xlat_track_t;
-+
- typedef struct V4L2m2mContext {
-     char devname[PATH_MAX];
-     int fd;
-@@ -96,10 +106,7 @@ typedef struct V4L2m2mContext {
-     int output_drm;
- 
-     /* Frame tracking */
--    int64_t last_pkt_dts;
--    int64_t last_opaque;
--    unsigned int track_no;
--    V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
-+    xlat_track_t xlat;
- 
-     pts_stats_t pts_stat;
- 
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 48a6810d18..d8ebb466cd 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -242,22 +242,24 @@ static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts
- // buffer of all the things we want preserved (including the original PTS)
- // indexed by the tracking no.
- static void
--xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *const avpkt)
-+xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt)
- {
-     int64_t track_pts;
- 
-     // Avoid 0
--    if (++s->track_no == 0)
--        s->track_no = 1;
-+    if (++x->track_no == 0)
-+        x->track_no = 1;
- 
--    track_pts = track_to_pts(avctx, s->track_no);
-+    track_pts = track_to_pts(avctx, x->track_no);
- 
--    av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, s->track_no);
--    s->last_pkt_dts = avpkt->dts;
--    s->track_els[s->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-+    av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
-+    x->last_pkt_dts = avpkt->dts;
-+    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-         .discard          = 0,
-+        .pending          = 1,
-         .pkt_size         = avpkt->size,
-         .pts              = avpkt->pts,
-+        .dts              = avpkt->dts,
-         .reordered_opaque = avctx->reordered_opaque,
-         .pkt_pos          = avpkt->pos,
-         .pkt_duration     = avpkt->duration,
-@@ -268,31 +270,36 @@ xlat_pts_in(AVCodecContext *const avctx, V4L2m2mContext *const s, AVPacket *cons
- 
- // Returns -1 if we should discard the frame
- static int
--xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *const frame)
-+xlat_pts_out(AVCodecContext *const avctx,
-+             xlat_track_t * const x,
-+             pts_stats_t * const ps,
-+             AVFrame *const frame)
- {
-     unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
--    const V4L2m2mTrackEl *const t = s->track_els + n;
-+    V4L2m2mTrackEl *const t = x->track_els + n;
-     if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
-     {
-         av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-         frame->pts              = AV_NOPTS_VALUE;
--        frame->pkt_dts          = s->last_pkt_dts;
--        frame->reordered_opaque = s->last_opaque;
-+        frame->pkt_dts          = x->last_pkt_dts;
-+        frame->reordered_opaque = x->last_opaque;
-         frame->pkt_pos          = -1;
-         frame->pkt_duration     = 0;
-         frame->pkt_size         = -1;
-     }
-     else if (!t->discard)
-     {
--        frame->pts              = t->pts;
--        frame->pkt_dts          = s->last_pkt_dts;
-+        frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
-+        frame->pkt_dts          = x->last_pkt_dts;
-         frame->reordered_opaque = t->reordered_opaque;
-         frame->pkt_pos          = t->pkt_pos;
-         frame->pkt_duration     = t->pkt_duration;
-         frame->pkt_size         = t->pkt_size;
- 
--        s->last_opaque = s->track_els[n].reordered_opaque;
--        s->track_els[n].pts = AV_NOPTS_VALUE;  // If we hit this again deny accurate knowledge of PTS
-+        x->last_opaque = x->track_els[n].reordered_opaque;
-+        if (frame->pts != AV_NOPTS_VALUE)
-+            x->last_pts = frame->pts;
-+        t->pending = 0;
-     }
-     else
-     {
-@@ -300,14 +307,62 @@ xlat_pts_out(AVCodecContext *const avctx, V4L2m2mContext *const s, AVFrame *cons
-         return -1;
-     }
- 
--    pts_stats_add(&s->pts_stat, frame->pts);
-+    pts_stats_add(ps, frame->pts);
- 
--    frame->best_effort_timestamp = pts_stats_guess(&s->pts_stat);
-+    frame->best_effort_timestamp = pts_stats_guess(ps);
-     frame->pkt_dts               = frame->pts;  // We can't emulate what s/w does in a useful manner?
-     av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
-     return 0;
- }
- 
-+static void
-+xlat_flush(xlat_track_t * const x)
-+{
-+    unsigned int i;
-+    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
-+        x->track_els[i].pending = 0;
-+        x->track_els[i].discard = 1;
-+    }
-+    x->last_pts = AV_NOPTS_VALUE;
-+}
-+
-+static void
-+xlat_init(xlat_track_t * const x)
-+{
-+    memset(x, 0, sizeof(*x));
-+    x->last_pts = AV_NOPTS_VALUE;
-+}
-+
-+static int
-+xlat_pending(const xlat_track_t * const x)
-+{
-+    unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
-+    unsigned int i;
-+    int r = 0;
-+    int64_t now = AV_NOPTS_VALUE;
-+
-+    for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
-+        const V4L2m2mTrackEl * const t = x->track_els + n;
-+
-+        if (!t->pending)
-+            continue;
-+
-+        if (now == AV_NOPTS_VALUE)
-+            now = t->dts;
-+
-+        if (t->pts == AV_NOPTS_VALUE ||
-+            ((now == AV_NOPTS_VALUE || t->pts <= now) &&
-+             (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
-+            ++r;
-+    }
-+
-+    // If we never get any ideas about PTS vs DTS allow a lot more buffer
-+    if (now == AV_NOPTS_VALUE)
-+        r -= 16;
-+
-+    return r;
-+}
-+
- static inline int stream_started(const V4L2m2mContext * const s) {
-     return s->capture.streamon && s->output.streamon;
- }
-@@ -374,7 +429,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-             return ret;
-         }
- 
--        xlat_pts_in(avctx, s, &s->buf_pkt);
-+        xlat_pts_in(avctx, &s->xlat, &s->buf_pkt);
-     }
- 
-     if ((ret = check_output_streamon(avctx, s)) != 0)
-@@ -417,6 +472,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-     int dst_rv = 1;  // Non-zero (done), non-negative (error) number
- 
-     do {
-+        av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat));
-         src_rv = try_enqueue_src(avctx, s);
- 
-         // If we got a frame last time and we have nothing to enqueue then
-@@ -451,7 +507,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                            s->draining, s->capture.done, dst_rv);
- 
-                 // Go again if we got a frame that we need to discard
--            } while (dst_rv == 0 && xlat_pts_out(avctx, s, frame));
-+            } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame));
-         }
- 
-         // Continue trying to enqueue packets if either
-@@ -550,6 +606,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     if (ret < 0)
-         return ret;
- 
-+    xlat_init(&s->xlat);
-     pts_stats_init(&s->pts_stat, avctx, "decoder");
- 
-     capture = &s->capture;
-@@ -632,7 +689,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
-     V4L2m2mContext * const s = priv->context;
-     V4L2Context * const output = &s->output;
-     V4L2Context * const capture = &s->capture;
--    int ret, i;
-+    int ret;
- 
-     av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
- 
-@@ -646,8 +703,7 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
- 
-     // V4L2 makes no guarantees about whether decoded frames are flushed or not
-     // so mark all frames we are tracking to be discarded if they appear
--    for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i)
--        s->track_els[i].discard = 1;
-+    xlat_flush(&s->xlat);
- 
-     // resend extradata
-     s->extdata_sent = 0;
-
-From 6fae7b3f42c8e9e431a59323c0faa6c88fe951d9 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 15 Dec 2021 17:58:21 +0000
-Subject: [PATCH 034/136] Use pending tracking to reduce v4l2 latency
-
-If there are more than 5 pending decodes outstanding then add a small
-timeout to the capture poll to reduce the rate at which frames are
-added.
----
- libavcodec/v4l2_m2m_dec.c | 58 ++++++++++++++++++++++++---------------
- 1 file changed, 36 insertions(+), 22 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index d8ebb466cd..7e7e4729d0 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -370,16 +370,19 @@ static inline int stream_started(const V4L2m2mContext * const s) {
- #define NQ_OK        0
- #define NQ_Q_FULL    1
- #define NQ_SRC_EMPTY 2
--#define NQ_DRAINING  3
--#define NQ_DEAD      4
-+#define NQ_NONE      3
-+#define NQ_DRAINING  4
-+#define NQ_DEAD      5
- 
- #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
-+#define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
- 
- // AVERROR_EOF     Flushing an already flushed stream
- // -ve             Error (all errors except EOF are unexpected)
- // NQ_OK (0)       OK
- // NQ_Q_FULL       Dst full (retry if we think V4L2 Q has space now)
- // NQ_SRC_EMPTY    Src empty (do not retry)
-+// NQ_NONE         Enqueue not attempted
- // NQ_DRAINING     At EOS, dQ dest until EOS there too
- // NQ_DEAD         Not running (do not retry, do not attempt capture dQ)
- 
-@@ -468,23 +471,28 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
- static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- {
-     V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    int src_rv;
-+    int src_rv = NQ_NONE;
-     int dst_rv = 1;  // Non-zero (done), non-negative (error) number
-+    unsigned int i = 0;
- 
-     do {
--        av_log(avctx, AV_LOG_INFO, "Pending=%d\n", xlat_pending(&s->xlat));
--        src_rv = try_enqueue_src(avctx, s);
--
--        // If we got a frame last time and we have nothing to enqueue then
--        // return now. rv will be AVERROR(EAGAIN) indicating that we want more input
--        // This should mean that once decode starts we enter a stable state where
--        // we alternately ask for input and produce output
--        if (s->req_pkt && src_rv == NQ_SRC_EMPTY)
--            break;
--
--        if (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) {
--            av_log(avctx, AV_LOG_WARNING, "Poll says src Q has space but enqueue fail");
--            src_rv = NQ_SRC_EMPTY;  // If we can't enqueue pretend that there is nothing to enqueue
-+        const int pending = xlat_pending(&s->xlat);
-+        const int prefer_dq = (pending > 5);
-+
-+        // Enqueue another pkt for decode if
-+        // (a) We don't have a lot of stuff in the buffer already OR
-+        // (b) ... we (think we) do but we've failed to get a frame already OR
-+        // (c) We've dequeued a lot of frames without asking for input
-+        if (!prefer_dq || i != 0 || s->req_pkt > 2) {
-+            src_rv = try_enqueue_src(avctx, s);
-+
-+            // If we got a frame last time or we've already tried to get a frame and
-+            // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
-+            // indicating that we want more input.
-+            // This should mean that once decode starts we enter a stable state where
-+            // we alternately ask for input and produce output
-+            if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
-+                break;
-         }
- 
-         // Try to get a new frame if
-@@ -495,9 +503,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                 // Dequeue frame will unref any previous contents of frame
-                 // if it returns success so we don't need an explicit unref
-                 // when discarding
--                // This returns AVERROR(EAGAIN) if there isn't a frame ready yet
--                // but there is room in the input Q
--                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, src_rv == NQ_Q_FULL ? 100 : -1);
-+                // This returns AVERROR(EAGAIN) on timeout or if
-+                // there is room in the input Q and timeout == -1
-+                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 5 : -1);
- 
-                 if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-                     av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-@@ -510,10 +518,16 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-             } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame));
-         }
- 
-+        ++i;
-+        if (i >= 256) {
-+            av_log(avctx, AV_LOG_ERROR, "Unexpectedly large retry count: %d\n", i);
-+            src_rv = AVERROR(EIO);
-+        }
-+
-         // Continue trying to enqueue packets if either
-         // (a) we succeeded last time OR
--        // (b) enqueue failed due to input Q full AND there is now room
--    } while (src_rv == NQ_OK || (src_rv == NQ_Q_FULL && dst_rv == AVERROR(EAGAIN)) );
-+        // (b) we didn't ret a frame and we can retry the input
-+    } while (src_rv == NQ_OK || (dst_rv == AVERROR(EAGAIN) && RETRY_NQ(src_rv)));
- 
-     // Ensure that the frame contains nothing if we aren't returning a frame
-     // (might happen when discarding)
-@@ -521,7 +535,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-         av_frame_unref(frame);
- 
-     // If we got a frame this time ask for a pkt next time
--    s->req_pkt = (dst_rv == 0);
-+    s->req_pkt = (dst_rv == 0) ? s->req_pkt + 1 : 0;
- 
- #if 0
-     if (dst_rv == 0)
-
-From 175abd2eb961a3718a660e1f9eda08b37b01b309 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 15 Dec 2021 12:23:54 +0000
-Subject: [PATCH 035/136] Allow logger() to take const ctx
-
----
- libavcodec/v4l2_buffers.c | 2 +-
- libavcodec/v4l2_context.c | 4 ++--
- 2 files changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 22da6bd722..39c0094aec 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -46,7 +46,7 @@ static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
-         container_of(buf->context, V4L2m2mContext, capture);
- }
- 
--static inline AVCodecContext *logger(V4L2Buffer *buf)
-+static inline AVCodecContext *logger(const V4L2Buffer * const buf)
- {
-     return buf_to_m2mctx(buf)->avctx;
- }
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 64540a37b3..d3df48aed4 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -43,14 +43,14 @@ struct v4l2_format_update {
-     int update_avfmt;
- };
- 
--static inline V4L2m2mContext *ctx_to_m2mctx(V4L2Context *ctx)
-+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
- {
-     return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
-         container_of(ctx, V4L2m2mContext, output) :
-         container_of(ctx, V4L2m2mContext, capture);
- }
- 
--static inline AVCodecContext *logger(V4L2Context *ctx)
-+static inline AVCodecContext *logger(const V4L2Context *ctx)
- {
-     return ctx_to_m2mctx(ctx)->avctx;
- }
-
-From 21d4f3f644c45084c621cb5aa577169bf5c15017 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 15 Dec 2021 13:00:27 +0000
-Subject: [PATCH 036/136] Track numbere of bufs qed with an atomic
-
-Safer and faster than counting status
----
- libavcodec/v4l2_buffers.c | 6 +++---
- libavcodec/v4l2_context.c | 3 ++-
- libavcodec/v4l2_context.h | 3 +--
- 3 files changed, 6 insertions(+), 6 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 39c0094aec..2cf7be6632 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -922,6 +922,7 @@ fail:
- int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
- {
-     int ret;
-+    int qc;
- 
-     avbuf->buf.flags = avbuf->flags;
- 
-@@ -941,11 +942,10 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
-         return AVERROR(err);
-     }
- 
--    ++avbuf->context->q_count;
-+    qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
-     av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
-            avbuf->context->name, avbuf->buf.index,
--           avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec,
--           avbuf->context->q_count);
-+           avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
- 
-     avbuf->status = V4L2BUF_IN_DRIVER;
- 
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index d3df48aed4..268a057e53 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -599,7 +599,7 @@ static int v4l2_release_buffers(V4L2Context* ctx)
-                     "  2. drmIoctl(.., DRM_IOCTL_GEM_CLOSE,... )\n");
-         }
-     }
--    ctx->q_count = 0;
-+    atomic_store(&ctx->q_count, 0);
- 
-     return ret;
- }
-@@ -1019,6 +1019,7 @@ int ff_v4l2_context_init(V4L2Context* ctx)
-     }
- 
-     ff_mutex_init(&ctx->lock, NULL);
-+    atomic_init(&ctx->q_count, 0);
- 
-     if (s->output_drm) {
-         AVHWFramesContext *hwframes;
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 4cc164886c..a4176448d5 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -110,8 +110,7 @@ typedef struct V4L2Context {
-     int no_pts_rescale;
- 
-     AVBufferRef *frames_ref;
--    int q_count;
--    int dq_count;
-+    atomic_int q_count;
-     struct ff_weak_link_master *wl_master;
- 
-     AVMutex lock;
-
-From b2fa4ab3d63924597b8c3659123b145a786a2c13 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 9 Dec 2021 12:01:25 +0000
-Subject: [PATCH 037/136] Clear pkt_buf on flush
-
----
- libavcodec/v4l2_m2m_dec.c | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 7e7e4729d0..09ec496351 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -715,6 +715,9 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
-     if (ret < 0)
-         av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret);
- 
-+    // Clear any buffered input packet
-+    av_packet_unref(&s->buf_pkt);
-+
-     // V4L2 makes no guarantees about whether decoded frames are flushed or not
-     // so mark all frames we are tracking to be discarded if they appear
-     xlat_flush(&s->xlat);
-
-From 16cf94cb5e1d11f4c3a6b8a43557383ce78112e0 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 15 Dec 2021 12:52:56 +0000
-Subject: [PATCH 038/136] Rework v4l2 buffer dequeue
-
----
- libavcodec/v4l2_context.c | 543 ++++++++++++++++++--------------------
- libavcodec/v4l2_context.h |   2 +
- libavcodec/v4l2_m2m.c     |   1 -
- libavcodec/v4l2_m2m.h     |  16 +-
- libavcodec/v4l2_m2m_dec.c | 138 ++++------
- 5 files changed, 327 insertions(+), 373 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 268a057e53..d765181645 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -73,19 +73,27 @@ static AVRational v4l2_get_sar(V4L2Context *ctx)
-     return sar;
- }
- 
--static inline unsigned int v4l2_resolution_changed(V4L2Context *ctx, struct v4l2_format *fmt2)
-+static inline int ctx_buffers_alloced(const V4L2Context * const ctx)
- {
--    struct v4l2_format *fmt1 = &ctx->format;
--    int ret =  V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
--        fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
--        fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
--        :
--        fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
--        fmt1->fmt.pix.height != fmt2->fmt.pix.height;
-+    return ctx->bufrefs != NULL;
-+}
-+
-+// Width/Height changed or we don't have an alloc in the first place?
-+static int ctx_resolution_changed(const V4L2Context *ctx, const struct v4l2_format *fmt2)
-+{
-+    const struct v4l2_format *fmt1 = &ctx->format;
-+    int ret = !ctx_buffers_alloced(ctx) ||
-+        (V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
-+            fmt1->fmt.pix_mp.width != fmt2->fmt.pix_mp.width ||
-+            fmt1->fmt.pix_mp.height != fmt2->fmt.pix_mp.height
-+            :
-+            fmt1->fmt.pix.width != fmt2->fmt.pix.width ||
-+            fmt1->fmt.pix.height != fmt2->fmt.pix.height);
- 
-     if (ret)
--        av_log(logger(ctx), AV_LOG_DEBUG, "%s changed (%dx%d) -> (%dx%d)\n",
-+        av_log(logger(ctx), AV_LOG_DEBUG, "V4L2 %s changed: alloc=%d (%dx%d) -> (%dx%d)\n",
-             ctx->name,
-+            ctx_buffers_alloced(ctx),
-             ff_v4l2_get_format_width(fmt1), ff_v4l2_get_format_height(fmt1),
-             ff_v4l2_get_format_width(fmt2), ff_v4l2_get_format_height(fmt2));
- 
-@@ -167,10 +175,8 @@ static int do_source_change(V4L2m2mContext * const s)
- 
-     int ret;
-     int reinit;
--    int full_reinit;
-     struct v4l2_format cap_fmt = s->capture.format;
- 
--    s->resize_pending = 0;
-     s->capture.done = 0;
- 
-     ret = ioctl(s->fd, VIDIOC_G_FMT, &cap_fmt);
-@@ -179,15 +185,21 @@ static int do_source_change(V4L2m2mContext * const s)
-         return 0;
-     }
- 
--    s->output.sample_aspect_ratio = v4l2_get_sar(&s->output);
--
-     get_default_selection(&s->capture, &s->capture.selection);
- 
--    reinit = v4l2_resolution_changed(&s->capture, &cap_fmt);
-+    reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
-+    s->capture.format = cap_fmt;
-     if (reinit) {
-         s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
-         s->capture.width = ff_v4l2_get_format_width(&cap_fmt);
-     }
-+
-+    // If we don't support selection (or it is bust) and we obviously have HD then kludge
-+    if ((s->capture.selection.width == 0 || s->capture.selection.height == 0) &&
-+        (s->capture.height == 1088 && s->capture.width == 1920)) {
-+        s->capture.selection = (struct v4l2_rect){.width = 1920, .height = 1080};
-+    }
-+
-     s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
- 
-     av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n",
-@@ -195,11 +207,11 @@ static int do_source_change(V4L2m2mContext * const s)
-            s->capture.selection.width, s->capture.selection.height,
-            s->capture.selection.left, s->capture.selection.top);
- 
--    s->reinit = 1;
--
-     if (reinit) {
-         if (avctx)
--            ret = ff_set_dimensions(s->avctx, s->capture.width, s->capture.height);
-+            ret = ff_set_dimensions(s->avctx,
-+                                    s->capture.selection.width != 0 ? s->capture.selection.width : s->capture.width,
-+                                    s->capture.selection.height != 0 ? s->capture.selection.height : s->capture.height);
-         if (ret < 0)
-             av_log(avctx, AV_LOG_WARNING, "update avcodec height and width failed\n");
- 
-@@ -208,11 +220,22 @@ static int do_source_change(V4L2m2mContext * const s)
-             av_log(avctx, AV_LOG_ERROR, "v4l2_m2m_codec_reinit failed\n");
-             return AVERROR(EINVAL);
-         }
-+
-+        // Update pixel format - should only actually do something on initial change
-+        s->capture.av_pix_fmt =
-+        ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
-+        if (s->output_drm) {
-+            avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-+            avctx->sw_pix_fmt = s->capture.av_pix_fmt;
-+        }
-+        else
-+            avctx->pix_fmt = s->capture.av_pix_fmt;
-+
-         goto reinit_run;
-     }
- 
-     /* Buffers are OK so just stream off to ack */
--    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only\n", __func__);
-+    av_log(avctx, AV_LOG_DEBUG, "%s: Parameters only - restart decode\n", __func__);
- 
-     ret = ff_v4l2_context_set_status(&s->capture, VIDIOC_STREAMOFF);
-     if (ret)
-@@ -225,54 +248,6 @@ reinit_run:
-     return 1;
- }
- 
--static int ctx_done(V4L2Context * const ctx)
--{
--    int rv = 0;
--    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
--
--    ctx->done = 1;
--
--    if (s->resize_pending && !V4L2_TYPE_IS_OUTPUT(ctx->type))
--        rv = do_source_change(s);
--
--    return rv;
--}
--
--/**
-- * handle resolution change event and end of stream event
-- * returns 1 if reinit was successful, negative if it failed
-- * returns 0 if reinit was not executed
-- */
--static int v4l2_handle_event(V4L2Context *ctx)
--{
--    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
--    struct v4l2_event evt = { 0 };
--    int ret;
--
--    ret = ioctl(s->fd, VIDIOC_DQEVENT, &evt);
--    if (ret < 0) {
--        av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_DQEVENT\n", ctx->name);
--        return 0;
--    }
--
--    av_log(logger(ctx), AV_LOG_INFO, "Dq event %d\n", evt.type);
--
--    if (evt.type == V4L2_EVENT_EOS) {
--//        ctx->done = 1;
--        av_log(logger(ctx), AV_LOG_TRACE, "%s VIDIOC_EVENT_EOS\n", ctx->name);
--        return 0;
--    }
--
--    if (evt.type != V4L2_EVENT_SOURCE_CHANGE)
--        return 0;
--
--    s->resize_pending = 1;
--    if (!ctx->done)
--        return 0;
--
--    return do_source_change(s);
--}
--
- static int v4l2_stop_decode(V4L2Context *ctx)
- {
-     struct v4l2_decoder_cmd cmd = {
-@@ -313,243 +288,252 @@ static int v4l2_stop_encode(V4L2Context *ctx)
-     return 0;
- }
- 
--static int count_in_driver(const V4L2Context * const ctx)
-+// DQ a buffer
-+// Amalgamates all the various ways there are of signalling EOS/Event to
-+// generate a consistant EPIPE.
-+//
-+// Sets ctx->flag_last if next dq would produce EPIPE (i.e. stream has stopped)
-+//
-+// Returns:
-+//  0               Success
-+//  AVERROR(EPIPE)  Nothing more to read
-+//  *               AVERROR(..)
-+
-+ static int
-+dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
- {
--    int i;
--    int n = 0;
-+    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
-+    AVCodecContext * const avctx = m->avctx;
-+    V4L2Buffer * avbuf;
-+    const int is_mp = V4L2_TYPE_IS_MULTIPLANAR(ctx->type);
- 
--    if (!ctx->bufrefs)
--        return -1;
--
--    for (i = 0; i < ctx->num_buffers; ++i) {
--        V4L2Buffer *const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
--        if (avbuf->status == V4L2BUF_IN_DRIVER)
--            ++n;
--    }
--    return n;
--}
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
- 
--static V4L2Buffer* v4l2_dequeue_v4l2buf(V4L2Context *ctx, int timeout)
--{
--    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
--    const int is_capture = !V4L2_TYPE_IS_OUTPUT(ctx->type);
--    struct v4l2_plane planes[VIDEO_MAX_PLANES];
--    struct v4l2_buffer buf = { 0 };
--    V4L2Buffer *avbuf;
--    struct pollfd pfd = {
--        .events =  POLLIN | POLLRDNORM | POLLPRI | POLLOUT | POLLWRNORM, /* default blocking capture */
--        .fd = ctx_to_m2mctx(ctx)->fd,
-+    struct v4l2_buffer buf = {
-+        .type = ctx->type,
-+        .memory = V4L2_MEMORY_MMAP,
-     };
--    int i, ret;
--    int no_rx_means_done = 0;
--
--    if (is_capture && ctx->bufrefs) {
--        for (i = 0; i < ctx->num_buffers; i++) {
--            avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
--            if (avbuf->status == V4L2BUF_IN_DRIVER)
--                break;
--        }
--        if (i == ctx->num_buffers)
--            av_log(logger(ctx), AV_LOG_WARNING, "All capture buffers (%d) returned to "
--                                                "userspace. Increase num_capture_buffers "
--                                                "to prevent device deadlock or dropped "
--                                                "packets/frames.\n", i);
-+
-+    *ppavbuf = NULL;
-+
-+    if (ctx->flag_last)
-+        return AVERROR(EPIPE);
-+
-+    if (is_mp) {
-+        buf.length = VIDEO_MAX_PLANES;
-+        buf.m.planes = planes;
-     }
- 
--#if 0
--    // I think this is true but pointless
--    // we will get some other form of EOF signal
--
--    /* if we are draining and there are no more capture buffers queued in the driver we are done */
--    if (is_capture && ctx_to_m2mctx(ctx)->draining) {
--        for (i = 0; i < ctx->num_buffers; i++) {
--            /* capture buffer initialization happens during decode hence
--             * detection happens at runtime
--             */
--            if (!ctx->bufrefs)
--                break;
--
--            avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
--            if (avbuf->status == V4L2BUF_IN_DRIVER)
--                goto start;
-+    while (ioctl(m->fd, VIDIOC_DQBUF, &buf) != 0) {
-+        const int err = errno;
-+        av_assert0(AVERROR(err) < 0);
-+        if (err != EINTR) {
-+            av_log(avctx, AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
-+                ctx->name, av_err2str(AVERROR(err)));
-+
-+            if (err == EPIPE)
-+                ctx->flag_last = 1;
-+
-+            return AVERROR(err);
-         }
--        ctx->done = 1;
--        return NULL;
-     }
--#endif
--
--start:
--    if (is_capture) {
--        /* no need to listen to requests for more input while draining */
--        if (ctx_to_m2mctx(ctx)->draining || timeout > 0)
--            pfd.events =  POLLIN | POLLRDNORM | POLLPRI;
--    } else {
--        pfd.events =  POLLOUT | POLLWRNORM;
-+    atomic_fetch_sub(&ctx->q_count, 1);
-+
-+    avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
-+    avbuf->status = V4L2BUF_AVAILABLE;
-+    avbuf->buf = buf;
-+    if (is_mp) {
-+        memcpy(avbuf->planes, planes, sizeof(planes));
-+        avbuf->buf.m.planes = avbuf->planes;
-     }
--    no_rx_means_done = s->resize_pending && is_capture;
- 
--    for (;;) {
--        // If we have a resize pending then all buffers should be Qed
--        // With a resize pending we should be in drain but evidence suggests
--        // that not all decoders do this so poll to clear
--        int t2 = no_rx_means_done ? 0 : timeout < 0 ? 3000 : timeout;
--        const int e = pfd.events;
--
--        ret = poll(&pfd, 1, t2);
-+    if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
-+        // Zero length cap buffer return == EOS
-+        if ((is_mp ? buf.m.planes[0].bytesused : buf.bytesused) == 0) {
-+            av_log(avctx, AV_LOG_DEBUG, "Buffer empty - reQ\n");
- 
--        if (ret > 0)
--            break;
-+            // Must reQ so we don't leak
-+            // May not matter if the next thing we do is release all the
-+            // buffers but better to be tidy.
-+            ff_v4l2_buffer_enqueue(avbuf);
- 
--        if (ret < 0) {
--            int err = errno;
--            if (err == EINTR)
--                continue;
--            av_log(logger(ctx), AV_LOG_ERROR, "=== poll error %d (%s): events=%#x, cap buffers=%d\n",
--                   err, strerror(err),
--                   e, count_in_driver(ctx));
--            return NULL;
-+            ctx->flag_last = 1;
-+            return AVERROR(EPIPE);
-         }
- 
--        // ret == 0 (timeout)
--        if (no_rx_means_done) {
--            av_log(logger(ctx), AV_LOG_DEBUG, "Ctx done on timeout\n");
--            ret = ctx_done(ctx);
--            if (ret > 0)
--                goto start;
--        }
--        if (timeout == -1)
--            av_log(logger(ctx), AV_LOG_ERROR, "=== poll unexpected TIMEOUT: events=%#x, cap buffers=%d\n", e, count_in_driver(ctx));;
--        return NULL;
-+#ifdef V4L2_BUF_FLAG_LAST
-+        // If flag_last set then this contains data but is the last frame
-+        // so remember that but return OK
-+        if ((buf.flags & V4L2_BUF_FLAG_LAST) != 0)
-+            ctx->flag_last = 1;
-+#endif
-     }
- 
--    /* 0. handle errors */
--    if (pfd.revents & POLLERR) {
--        /* if we are trying to get free buffers but none have been queued yet
--           no need to raise a warning */
--        if (timeout == 0) {
--            for (i = 0; i < ctx->num_buffers; i++) {
--                avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
--                if (avbuf->status != V4L2BUF_AVAILABLE)
--                    av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
--            }
--        }
--        else
--            av_log(logger(ctx), AV_LOG_WARNING, "%s POLLERR\n", ctx->name);
-+    *ppavbuf = avbuf;
-+    return 0;
-+}
- 
--        return NULL;
--    }
-+/**
-+ * handle resolution change event and end of stream event
-+ * Expects to be called after the stream has stopped
-+ *
-+ * returns 1 if reinit was successful, negative if it failed
-+ * returns 0 if reinit was not executed
-+ */
-+static int
-+get_event(V4L2m2mContext * const m)
-+{
-+    AVCodecContext * const avctx = m->avctx;
-+    struct v4l2_event evt = { 0 };
- 
--    /* 1. handle resolution changes */
--    if (pfd.revents & POLLPRI) {
--        ret = v4l2_handle_event(ctx);
--        if (ret < 0) {
--            /* if re-init failed, abort */
--            ctx->done = 1;
--            return NULL;
-+    while (ioctl(m->fd, VIDIOC_DQEVENT, &evt) != 0) {
-+        const int rv = AVERROR(errno);
-+        if (rv == AVERROR(EINTR))
-+            continue;
-+        if (rv == AVERROR(EAGAIN)) {
-+            av_log(avctx, AV_LOG_WARNING, "V4L2 failed to get expected event - assume EOS\n");
-+            return AVERROR_EOF;
-         }
--        if (ret > 0)
--            goto start;
-+        av_log(avctx, AV_LOG_ERROR, "V4L2 VIDIOC_DQEVENT: %s\n", av_err2str(rv));
-+        return rv;
-+    }
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Dq event %d\n", evt.type);
-+
-+    if (evt.type == V4L2_EVENT_EOS) {
-+        av_log(avctx, AV_LOG_TRACE, "V4L2 VIDIOC_EVENT_EOS\n");
-+        return AVERROR_EOF;
-     }
- 
--    /* 2. dequeue the buffer */
--    if (pfd.revents & (POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM)) {
-+    if (evt.type == V4L2_EVENT_SOURCE_CHANGE)
-+        return do_source_change(m);
- 
--        if (is_capture) {
--            /* there is a capture buffer ready */
--            if (pfd.revents & (POLLIN | POLLRDNORM))
--                goto dequeue;
-+    return 0;
-+}
- 
--            // CAPTURE Q drained
--            if (no_rx_means_done) {
--                if (ctx_done(ctx) > 0)
--                    goto start;
--                return NULL;
--            }
- 
--            /* the driver is ready to accept more input; instead of waiting for the capture
--             * buffer to complete we return NULL so input can proceed (we are single threaded)
--             */
--            if (pfd.revents & (POLLOUT | POLLWRNORM))
--                return NULL;
-+// Get a buffer
-+// If output then just gets the buffer in the expected way
-+// If capture then runs the capture state m/c to deal with res change etc.
-+// If return value == 0 then *ppavbuf != NULL
-+
-+static int
-+get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout)
-+{
-+    V4L2m2mContext * const m = ctx_to_m2mctx(ctx);
-+    AVCodecContext * const avctx = m->avctx;
-+    const int is_cap = V4L2_TYPE_IS_CAPTURE(ctx->type);
-+
-+    const unsigned int poll_cap = (POLLIN | POLLRDNORM);
-+    const unsigned int poll_out = (POLLOUT | POLLWRNORM);
-+    const unsigned int poll_event = POLLPRI;
-+
-+    *ppavbuf = NULL;
-+
-+    for (;;) {
-+        struct pollfd pfd = {
-+            .fd = m->fd,
-+            // If capture && stream not started then assume we are waiting for the initial event
-+            .events = !is_cap ? poll_out :
-+                !ff_v4l2_ctx_eos(ctx) && ctx->streamon ? poll_cap :
-+                    poll_event,
-+        };
-+        int ret;
-+
-+        if (ctx->done) {
-+            av_log(avctx, AV_LOG_TRACE, "V4L2 %s already done\n", ctx->name);
-+            return AVERROR_EOF;
-         }
- 
--dequeue:
--        memset(&buf, 0, sizeof(buf));
--        buf.memory = V4L2_MEMORY_MMAP;
--        buf.type = ctx->type;
--        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
--            memset(planes, 0, sizeof(planes));
--            buf.length = VIDEO_MAX_PLANES;
--            buf.m.planes = planes;
-+        // If capture && timeout == -1 then also wait for rx buffer free
-+        if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
-+            pfd.events |= poll_out;
-+
-+        // If nothing Qed all we will get is POLLERR - avoid that
-+        if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
-+            (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
-+            (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
-+            av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
-+            return AVERROR(EAGAIN);
-         }
- 
--        while ((ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_DQBUF, &buf)) == -1) {
--            const int err = errno;
--            if (err == EINTR)
-+        // Timeout kludged s.t. "forever" eventually gives up & produces logging
-+        // If waiting for an event when we have seen a last_frame then we expect
-+        //   it to be ready already so force a short timeout
-+        ret = poll(&pfd, 1,
-+                   ff_v4l2_ctx_eos(ctx) ? 10 :
-+                   timeout == -1 ? 3000 : timeout);
-+        if (ret < 0) {
-+            ret = AVERROR(errno);  // Remember errno before logging etc.
-+            av_assert0(ret < 0);
-+        }
-+
-+        av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s ret=%d, timeout=%d, events=%#x, revents=%#x\n",
-+               ctx->name, ret, timeout, pfd.events, pfd.revents);
-+
-+        if (ret < 0) {
-+            if (ret == AVERROR(EINTR))
-                 continue;
--            if (err != EAGAIN) {
--                // EPIPE on CAPTURE can be used instead of BUF_FLAG_LAST
--                if (err != EPIPE || !is_capture)
--                    av_log(logger(ctx), AV_LOG_DEBUG, "%s VIDIOC_DQBUF, errno (%s)\n",
--                        ctx->name, av_err2str(AVERROR(err)));
--                if (ctx_done(ctx) > 0)
--                    goto start;
-+            av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll error %d (%s)\n", ctx->name, AVUNERROR(ret), av_err2str(ret));
-+            return ret;
-+        }
-+
-+        if (ret == 0) {
-+            if (timeout == -1)
-+                av_log(avctx, AV_LOG_ERROR, "V4L2 %s poll unexpected timeout: events=%#x\n", ctx->name, pfd.events);
-+            if (ff_v4l2_ctx_eos(ctx)) {
-+                av_log(avctx, AV_LOG_WARNING, "V4L2 %s poll event timeout\n", ctx->name);
-+                ret = get_event(m);
-+                if (ret < 0) {
-+                    ctx->done = 1;
-+                    return ret;
-+                }
-             }
--            return NULL;
-+            return AVERROR(EAGAIN);
-         }
--        --ctx->q_count;
--        av_log(logger(ctx), AV_LOG_DEBUG, "--- %s VIDIOC_DQBUF OK: index=%d, ts=%ld.%06ld, count=%d, dq=%d field=%d\n",
--               ctx->name, buf.index,
--               buf.timestamp.tv_sec, buf.timestamp.tv_usec,
--               ctx->q_count, ++ctx->dq_count, buf.field);
--
--        avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
--        avbuf->status = V4L2BUF_AVAILABLE;
--        avbuf->buf = buf;
--        if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
--            memcpy(avbuf->planes, planes, sizeof(planes));
--            avbuf->buf.m.planes = avbuf->planes;
-+
-+        if ((pfd.revents & POLLERR) != 0) {
-+            av_log(avctx, AV_LOG_WARNING, "V4L2 %s POLLERR\n", ctx->name);
-+            return AVERROR_UNKNOWN;
-         }
- 
--        if (ctx_to_m2mctx(ctx)->draining && is_capture) {
--            int bytesused = V4L2_TYPE_IS_MULTIPLANAR(buf.type) ?
--                            buf.m.planes[0].bytesused : buf.bytesused;
--            if (bytesused == 0) {
--                av_log(logger(ctx), AV_LOG_DEBUG, "Buffer empty - reQ\n");
-+        if ((pfd.revents & poll_event) != 0) {
-+            ret = get_event(m);
-+            if (ret < 0) {
-+                ctx->done = 1;
-+                return ret;
-+            }
-+            continue;
-+        }
- 
--                // Must reQ so we don't leak
--                // May not matter if the next thing we do is release all the
--                // buffers but better to be tidy.
--                ff_v4l2_buffer_enqueue(avbuf);
-+        if ((pfd.revents & poll_cap) != 0) {
-+            ret = dq_buf(ctx, ppavbuf);
-+            if (ret == AVERROR(EPIPE))
-+                continue;
-+            return ret;
-+        }
- 
--                if (ctx_done(ctx) > 0)
--                    goto start;
--                return NULL;
--            }
--#ifdef V4L2_BUF_FLAG_LAST
--            if (buf.flags & V4L2_BUF_FLAG_LAST) {
--                av_log(logger(ctx), AV_LOG_TRACE, "FLAG_LAST set\n");
--                avbuf->status = V4L2BUF_IN_USE;  // Avoid flushing this buffer
--                ctx_done(ctx);
--            }
--#endif
-+        if ((pfd.revents & poll_out) != 0) {
-+            if (is_cap)
-+                return AVERROR(EAGAIN);
-+            return dq_buf(ctx, ppavbuf);
-         }
- 
--        return avbuf;
-+        av_log(avctx, AV_LOG_ERROR, "V4L2 poll unexpected events=%#x, revents=%#x\n", pfd.events, pfd.revents);
-+        return AVERROR_UNKNOWN;
-     }
--
--    return NULL;
- }
- 
- static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
- {
--    int timeout = 0; /* return when no more buffers to dequeue */
-     int i;
- 
-     /* get back as many output buffers as possible */
-     if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
--          do {
--          } while (v4l2_dequeue_v4l2buf(ctx, timeout));
-+        V4L2Buffer * avbuf;
-+        do {
-+            get_qbuf(ctx, &avbuf, 0);
-+        } while (avbuf);
-     }
- 
-     for (i = 0; i < ctx->num_buffers; i++) {
-@@ -722,7 +706,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx)
-         if (buf->status == V4L2BUF_IN_DRIVER)
-             buf->status = V4L2BUF_AVAILABLE;
-     }
--    ctx->q_count = 0;
-+    atomic_store(&ctx->q_count, 0);
- }
- 
- static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
-@@ -755,6 +739,10 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
-     int ret;
-     AVCodecContext * const avctx = logger(ctx);
- 
-+    // Avoid doing anything if there is nothing we can do
-+    if (cmd == VIDIOC_STREAMOFF && !ctx_buffers_alloced(ctx) && !ctx->streamon)
-+        return 0;
-+
-     ff_mutex_lock(&ctx->lock);
- 
-     if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
-@@ -777,6 +765,9 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
-                cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF");
-     }
- 
-+    // Both stream off & on effectively clear flag_last
-+    ctx->flag_last = 0;
-+
-     ff_mutex_unlock(&ctx->lock);
- 
-     return ret;
-@@ -840,19 +831,10 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
- int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
- {
-     V4L2Buffer *avbuf;
-+    int rv;
- 
--    /*
--     * timeout=-1 blocks until:
--     *  1. decoded frame available
--     *  2. an input buffer is ready to be dequeued
--     */
--    avbuf = v4l2_dequeue_v4l2buf(ctx, timeout);
--    if (!avbuf) {
--        if (ctx->done)
--            return AVERROR_EOF;
--
--        return AVERROR(EAGAIN);
--    }
-+    if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
-+        return rv;
- 
-     return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
- }
-@@ -860,19 +842,10 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
- int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
- {
-     V4L2Buffer *avbuf;
-+    int rv;
- 
--    /*
--     * blocks until:
--     *  1. encoded packet available
--     *  2. an input buffer ready to be dequeued
--     */
--    avbuf = v4l2_dequeue_v4l2buf(ctx, -1);
--    if (!avbuf) {
--        if (ctx->done)
--            return AVERROR_EOF;
--
--        return AVERROR(EAGAIN);
--    }
-+    if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
-+        return rv;
- 
-     return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
- }
-@@ -956,6 +929,8 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
-     int ret;
-     int i;
- 
-+    av_assert0(ctx->bufrefs == NULL);
-+
-     memset(&req, 0, sizeof(req));
-     req.count = req_buffers;
-     req.memory = V4L2_MEMORY_MMAP;
-@@ -1033,8 +1008,8 @@ int ff_v4l2_context_init(V4L2Context* ctx)
-         hwframes = (AVHWFramesContext*)ctx->frames_ref->data;
-         hwframes->format = AV_PIX_FMT_DRM_PRIME;
-         hwframes->sw_format = ctx->av_pix_fmt;
--        hwframes->width = ctx->width;
--        hwframes->height = ctx->height;
-+        hwframes->width = ctx->width != 0 ? ctx->width : s->avctx->width;
-+        hwframes->height = ctx->height != 0 ? ctx->height : s->avctx->height;
-         ret = av_hwframe_ctx_init(ctx->frames_ref);
-         if (ret < 0)
-             goto fail_unref_hwframes;
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index a4176448d5..565858a1ed 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -102,6 +102,8 @@ typedef struct V4L2Context {
-      */
-     int done;
- 
-+    int flag_last;
-+
-     /**
-      * PTS rescale not wanted
-      * If the PTS is just a dummy frame count then rescale is
-diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index 516e6d9858..e26bd74c3e 100644
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -235,7 +235,6 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *s)
- 
-     /* 5. complete reinit */
-     s->draining = 0;
--    s->reinit = 0;
- 
-     return 0;
- }
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 3f86809623..d71f6b721c 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -84,8 +84,6 @@ typedef struct V4L2m2mContext {
-     AVCodecContext *avctx;
-     sem_t refsync;
-     atomic_uint refcount;
--    int reinit;
--    int resize_pending;
- 
-     /* null frame/packet received */
-     int draining;
-@@ -180,15 +178,25 @@ int ff_v4l2_m2m_codec_reinit(V4L2m2mContext *ctx);
- int ff_v4l2_m2m_codec_full_reinit(V4L2m2mContext *ctx);
- 
- 
--static inline unsigned int ff_v4l2_get_format_width(struct v4l2_format *fmt)
-+static inline unsigned int ff_v4l2_get_format_width(const struct v4l2_format * const fmt)
- {
-     return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
- }
- 
--static inline unsigned int ff_v4l2_get_format_height(struct v4l2_format *fmt)
-+static inline unsigned int ff_v4l2_get_format_height(const struct v4l2_format * const fmt)
- {
-     return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
- }
- 
-+static inline uint32_t ff_v4l2_get_format_pixelformat(const struct v4l2_format * const fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
-+}
-+
-+static inline int ff_v4l2_ctx_eos(const V4L2Context * const ctx)
-+{
-+    return ctx->flag_last;
-+}
-+
- 
- #endif /* AVCODEC_V4L2_M2M_H */
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 09ec496351..e4b6569ba5 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -113,9 +113,6 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co
-     if (ret < 0)
-         av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n");
- 
--    if (!s->capture.streamon || ret < 0)
--        return ret;
--
-     ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd);
-     if (ret < 0)
-         av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno);
-@@ -127,69 +124,12 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co
- 
- static int v4l2_try_start(AVCodecContext *avctx)
- {
--    V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    V4L2Context *const capture = &s->capture;
--    struct v4l2_selection selection = { 0 };
-+    V4L2m2mContext * const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     int ret;
- 
-     /* 1. start the output process */
-     if ((ret = check_output_streamon(avctx, s)) != 0)
-         return ret;
--
--    if (capture->streamon)
--        return 0;
--
--    /* 2. get the capture format */
--    capture->format.type = capture->type;
--    ret = ioctl(s->fd, VIDIOC_G_FMT, &capture->format);
--    if (ret) {
--        av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_FMT ioctl\n");
--        return ret;
--    }
--
--    /* 2.1 update the AVCodecContext */
--    capture->av_pix_fmt =
--        ff_v4l2_format_v4l2_to_avfmt(capture->format.fmt.pix_mp.pixelformat, AV_CODEC_ID_RAWVIDEO);
--    if (s->output_drm) {
--        avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
--        avctx->sw_pix_fmt = capture->av_pix_fmt;
--    }
--    else
--        avctx->pix_fmt = capture->av_pix_fmt;
--
--    /* 3. set the crop parameters */
--#if 1
--    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
--    selection.target = V4L2_SEL_TGT_CROP_DEFAULT;
--    ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
--    av_log(avctx, AV_LOG_INFO, "Post G selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
--#else
--    selection.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
--    selection.r.height = avctx->coded_height;
--    selection.r.width = avctx->coded_width;
--    av_log(avctx, AV_LOG_INFO, "Try selection %dx%d\n", avctx->coded_width, avctx->coded_height);
--    ret = ioctl(s->fd, VIDIOC_S_SELECTION, &selection);
--    av_log(avctx, AV_LOG_INFO, "Post S selection ret=%d, err=%d %dx%d\n", ret, errno, selection.r.width, selection.r.height);
--    if (1) {
--        ret = ioctl(s->fd, VIDIOC_G_SELECTION, &selection);
--        if (ret) {
--            av_log(avctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION ioctl\n");
--        } else {
--            av_log(avctx, AV_LOG_DEBUG, "crop output %dx%d\n", selection.r.width, selection.r.height);
--            /* update the size of the resulting frame */
--            capture->height = selection.r.height;
--            capture->width  = selection.r.width;
--        }
--    }
--#endif
--
--    /* 5. start the capture process */
--    ret = ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
--    if (ret) {
--        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_STREAMON, on capture context\n");
--        return ret;
--    }
--
-     return 0;
- }
- 
-@@ -364,7 +304,7 @@ xlat_pending(const xlat_track_t * const x)
- }
- 
- static inline int stream_started(const V4L2m2mContext * const s) {
--    return s->capture.streamon && s->output.streamon;
-+    return s->output.streamon;
- }
- 
- #define NQ_OK        0
-@@ -377,6 +317,9 @@ static inline int stream_started(const V4L2m2mContext * const s) {
- #define TRY_DQ(nq_status) ((nq_status) >= NQ_OK && (nq_status) <= NQ_DRAINING)
- #define RETRY_NQ(nq_status) ((nq_status) == NQ_Q_FULL || (nq_status) == NQ_NONE)
- 
-+// do_not_get      If true then no new packet will be got but status will
-+//                  be set appropriately
-+
- // AVERROR_EOF     Flushing an already flushed stream
- // -ve             Error (all errors except EOF are unexpected)
- // NQ_OK (0)       OK
-@@ -386,14 +329,14 @@ static inline int stream_started(const V4L2m2mContext * const s) {
- // NQ_DRAINING     At EOS, dQ dest until EOS there too
- // NQ_DEAD         Not running (do not retry, do not attempt capture dQ)
- 
--static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s)
-+static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const s, const int do_not_get)
- {
-     int ret;
- 
-     // If we don't already have a coded packet - get a new one
-     // We will already have a coded pkt if the output Q was full last time we
-     // tried to Q it
--    if (!s->buf_pkt.size) {
-+    if (!s->buf_pkt.size && !do_not_get) {
-         ret = ff_decode_get_packet(avctx, &s->buf_pkt);
- 
-         if (ret == AVERROR(EAGAIN)) {
-@@ -435,6 +378,17 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-         xlat_pts_in(avctx, &s->xlat, &s->buf_pkt);
-     }
- 
-+    if (s->draining) {
-+        if (s->buf_pkt.size) {
-+            av_log(avctx, AV_LOG_WARNING, "Unexpected input whilst draining\n");
-+            av_packet_unref(&s->buf_pkt);
-+        }
-+        return NQ_DRAINING;
-+    }
-+
-+    if (!s->buf_pkt.size)
-+        return NQ_NONE;
-+
-     if ((ret = check_output_streamon(avctx, s)) != 0)
-         return ret;
- 
-@@ -471,7 +425,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
- static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- {
-     V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    int src_rv = NQ_NONE;
-+    int src_rv;
-     int dst_rv = 1;  // Non-zero (done), non-negative (error) number
-     unsigned int i = 0;
- 
-@@ -483,31 +437,40 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-         // (a) We don't have a lot of stuff in the buffer already OR
-         // (b) ... we (think we) do but we've failed to get a frame already OR
-         // (c) We've dequeued a lot of frames without asking for input
--        if (!prefer_dq || i != 0 || s->req_pkt > 2) {
--            src_rv = try_enqueue_src(avctx, s);
--
--            // If we got a frame last time or we've already tried to get a frame and
--            // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
--            // indicating that we want more input.
--            // This should mean that once decode starts we enter a stable state where
--            // we alternately ask for input and produce output
--            if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
--                break;
--        }
-+        src_rv = try_enqueue_src(avctx, s, !(!prefer_dq || i != 0 || s->req_pkt > 2));
-+
-+        // If we got a frame last time or we've already tried to get a frame and
-+        // we have nothing to enqueue then return now. rv will be AVERROR(EAGAIN)
-+        // indicating that we want more input.
-+        // This should mean that once decode starts we enter a stable state where
-+        // we alternately ask for input and produce output
-+        if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
-+            break;
- 
-         // Try to get a new frame if
-         // (a) we haven't already got one AND
-         // (b) enqueue returned a status indicating that decode should be attempted
-         if (dst_rv != 0 && TRY_DQ(src_rv)) {
-+            // Pick a timeout depending on state
-+            const int t =
-+                src_rv == NQ_DRAINING ? 300 :
-+                prefer_dq ? 5 :
-+                src_rv == NQ_Q_FULL ? -1 : 0;
-+
-             do {
-                 // Dequeue frame will unref any previous contents of frame
-                 // if it returns success so we don't need an explicit unref
-                 // when discarding
-                 // This returns AVERROR(EAGAIN) on timeout or if
-                 // there is room in the input Q and timeout == -1
--                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, prefer_dq ? 5 : -1);
-+                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
- 
--                if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-+                if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
-+                    av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
-+                    dst_rv = AVERROR_EOF;
-+                    s->capture.done = 1;
-+                }
-+                else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-                     av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-                            s->draining, s->capture.done);
-                 else if (dst_rv && dst_rv != AVERROR(EAGAIN))
-@@ -630,8 +593,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-      * by the v4l2 driver; this event will trigger a full pipeline reconfig and
-      * the proper values will be retrieved from the kernel driver.
-      */
--    output->height = capture->height = avctx->coded_height;
--    output->width = capture->width = avctx->coded_width;
-+//    output->height = capture->height = avctx->coded_height;
-+//    output->width = capture->width = avctx->coded_width;
-+    output->height = capture->height = 0;
-+    output->width = capture->width = 0;
- 
-     output->av_codec_id = avctx->codec_id;
-     output->av_pix_fmt  = AV_PIX_FMT_NONE;
-@@ -703,7 +668,6 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
-     V4L2m2mContext * const s = priv->context;
-     V4L2Context * const output = &s->output;
-     V4L2Context * const capture = &s->capture;
--    int ret;
- 
-     av_log(avctx, AV_LOG_TRACE, "<<< %s: streamon=%d\n", __func__, output->streamon);
- 
-@@ -711,13 +675,19 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
-     // states like EOS processing so don't try to optimize out (having got it
-     // wrong once)
- 
--    ret = ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
--    if (ret < 0)
--        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMOFF %s error: %d\n", output->name, ret);
-+    ff_v4l2_context_set_status(output, VIDIOC_STREAMOFF);
- 
-     // Clear any buffered input packet
-     av_packet_unref(&s->buf_pkt);
- 
-+    // Clear a pending EOS
-+    if (ff_v4l2_ctx_eos(capture)) {
-+        // Arguably we could delay this but this is easy and doesn't require
-+        // thought or extra vars
-+        ff_v4l2_context_set_status(capture, VIDIOC_STREAMOFF);
-+        ff_v4l2_context_set_status(capture, VIDIOC_STREAMON);
-+    }
-+
-     // V4L2 makes no guarantees about whether decoded frames are flushed or not
-     // so mark all frames we are tracking to be discarded if they appear
-     xlat_flush(&s->xlat);
-
-From a2519f7a512edde7433aced70de4464e21805693 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 9 Dec 2021 18:51:00 +0000
-Subject: [PATCH 039/136] Honor result of ff_get_format if possible
-
----
- libavcodec/v4l2_m2m_dec.c | 6 +++++-
- 1 file changed, 5 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index e4b6569ba5..c9655bcc3b 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -615,15 +615,19 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-      *       check the v4l2_get_drm_frame function.
-      */
- 
-+    avctx->sw_pix_fmt = avctx->pix_fmt;
-     gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
-     av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n",
-            avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
- 
--    s->output_drm = 0;
-     if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
-         avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-         s->output_drm = 1;
-     }
-+    else {
-+        capture->av_pix_fmt = gf_pix_fmt;
-+        s->output_drm = 0;
-+    }
- 
-     s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
-     if (!s->device_ref) {
-
-From a1cd1cb98e48c631392b385ccac5ab7b09bb5ee9 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 14 Dec 2021 16:11:10 +0000
-Subject: [PATCH 040/136] Add an always-reinit quirk
-
----
- libavcodec/v4l2_context.c |  7 +++++--
- libavcodec/v4l2_m2m.h     |  5 +++++
- libavcodec/v4l2_m2m_dec.c | 33 ++++++++++++++++++++++++++++++++-
- 3 files changed, 42 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index d765181645..c11b5e6863 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -188,6 +188,9 @@ static int do_source_change(V4L2m2mContext * const s)
-     get_default_selection(&s->capture, &s->capture.selection);
- 
-     reinit = ctx_resolution_changed(&s->capture, &cap_fmt);
-+    if ((s->quirks & FF_V4L2_QUIRK_REINIT_ALWAYS) != 0)
-+        reinit = 1;
-+
-     s->capture.format = cap_fmt;
-     if (reinit) {
-         s->capture.height = ff_v4l2_get_format_height(&cap_fmt);
-@@ -202,10 +205,10 @@ static int do_source_change(V4L2m2mContext * const s)
- 
-     s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
- 
--    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d\n",
-+    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n",
-            s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
-            s->capture.selection.width, s->capture.selection.height,
--           s->capture.selection.left, s->capture.selection.top);
-+           s->capture.selection.left, s->capture.selection.top, reinit);
- 
-     if (reinit) {
-         if (avctx)
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index d71f6b721c..f1923bb26d 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -113,6 +113,11 @@ typedef struct V4L2m2mContext {
- 
-     /* Ext data sent */
-     int extdata_sent;
-+
-+#define FF_V4L2_QUIRK_REINIT_ALWAYS     1
-+    /* Quirks */
-+    unsigned int quirks;
-+
- } V4L2m2mContext;
- 
- typedef struct V4L2m2mPriv {
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index c9655bcc3b..e2b10f5e3a 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -540,6 +540,34 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- }
- #endif
- 
-+static int
-+get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
-+{
-+    struct v4l2_capability cap;
-+
-+    memset(&cap, 0, sizeof(cap));
-+    while (ioctl(s->fd, VIDIOC_QUERYCAP, &cap) != 0) {
-+        int err = errno;
-+        if (err == EINTR)
-+            continue;
-+        av_log(avctx, AV_LOG_ERROR, "V4L2: Failed to get capabilities: %s\n", strerror(err));
-+        return AVERROR(err);
-+    }
-+
-+    // Could be made table driven if we have a few more but right now there
-+    // seems no point
-+
-+    // Meson (amlogic) always gives a resolution changed event after output
-+    // streamon and userspace must (re)allocate capture buffers and streamon
-+    // capture to clear the event even if the capture buffers were the right
-+    // size in the first place.
-+    if (strcmp(cap.driver, "meson-vdec") == 0)
-+        s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS;
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
-+    return 0;
-+}
-+
- // This heuristic is for H264 but use for everything
- static uint32_t max_coded_size(const AVCodecContext * const avctx)
- {
-@@ -646,7 +674,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-         return ret;
-     }
- 
--    return v4l2_prepare_decoder(s);
-+    if ((ret = v4l2_prepare_decoder(s)) < 0)
-+        return ret;
-+
-+    return get_quirks(avctx, s);
- }
- 
- static av_cold int v4l2_decode_close(AVCodecContext *avctx)
-
-From 2470968adf0d28bbaf310e782720dd00d57d7bf6 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 4 Jan 2022 16:58:31 +0000
-Subject: [PATCH 041/136] v4l2_buffers: rework flags for keyframe
-
-Previously flags could become confused and keyframe info could be lost.
-This fixes that and removes the duplicate flags field in V4L2Buffer.
----
- libavcodec/v4l2_buffers.c | 15 ++++++++++-----
- libavcodec/v4l2_buffers.h |  1 -
- libavcodec/v4l2_context.c | 18 +++++++++++++++++-
- 3 files changed, 27 insertions(+), 7 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 2cf7be6632..62d1c26053 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -680,7 +680,9 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- 
- int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- {
--    out->buf.flags = frame->key_frame ? (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME) : (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME);
-+    out->buf.flags = frame->key_frame ?
-+        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
-+        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
-     // Beware that colour info is held in format rather than the actual
-     // v4l2 buffer struct so this may not be as useful as you might hope
-     v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
-@@ -706,6 +708,10 @@ int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
- 
-     /* 2. get frame information */
-     frame->key_frame = !!(avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME);
-+    frame->pict_type = frame->key_frame ? AV_PICTURE_TYPE_I :
-+        (avbuf->buf.flags & V4L2_BUF_FLAG_PFRAME) != 0 ? AV_PICTURE_TYPE_P :
-+        (avbuf->buf.flags & V4L2_BUF_FLAG_BFRAME) != 0 ? AV_PICTURE_TYPE_B :
-+            AV_PICTURE_TYPE_NONE;
-     frame->color_primaries = v4l2_get_color_primaries(avbuf);
-     frame->colorspace = v4l2_get_color_space(avbuf);
-     frame->color_range = v4l2_get_color_range(avbuf);
-@@ -779,8 +785,9 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
- 
-     v4l2_set_pts(out, pkt->pts);
- 
--    if (pkt->flags & AV_PKT_FLAG_KEY)
--        out->flags = V4L2_BUF_FLAG_KEYFRAME;
-+    out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
-+        (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
-+        (out->buf.flags & ~V4L2_BUF_FLAG_KEYFRAME);
- 
-     return ret;
- }
-@@ -924,8 +931,6 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
-     int ret;
-     int qc;
- 
--    avbuf->buf.flags = avbuf->flags;
--
-     if (avbuf->buf.timestamp.tv_sec || avbuf->buf.timestamp.tv_usec) {
-         av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s pre VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
-                avbuf->context->name, avbuf->buf.index,
-diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
-index 641e0e147b..3b7ca4d99e 100644
---- a/libavcodec/v4l2_buffers.h
-+++ b/libavcodec/v4l2_buffers.h
-@@ -73,7 +73,6 @@ typedef struct V4L2Buffer {
-     struct v4l2_buffer buf;
-     struct v4l2_plane planes[VIDEO_MAX_PLANES];
- 
--    int flags;
-     enum V4L2Buffer_status status;
- 
- } V4L2Buffer;
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index c11b5e6863..53b522d43e 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -527,6 +527,22 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout
-     }
- }
- 
-+// Clear out flags and timestamps that should should be set by the user
-+// Returns the passed avbuf
-+static V4L2Buffer *
-+clean_v4l2_buffer(V4L2Buffer * const avbuf)
-+{
-+    struct v4l2_buffer *const buf = &avbuf->buf;
-+
-+    buf->flags = 0;
-+    buf->field = V4L2_FIELD_ANY;
-+    buf->timestamp = (struct timeval){0};
-+    buf->timecode = (struct v4l2_timecode){0};
-+    buf->sequence = 0;
-+
-+    return avbuf;
-+}
-+
- static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
- {
-     int i;
-@@ -542,7 +558,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
-     for (i = 0; i < ctx->num_buffers; i++) {
-         V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-         if (avbuf->status == V4L2BUF_AVAILABLE)
--            return avbuf;
-+            return clean_v4l2_buffer(avbuf);
-     }
- 
-     return NULL;
-
-From 5dc38f5d088beea4da57e82969643cc831c40cf0 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 22 Mar 2022 11:44:30 +0000
-Subject: [PATCH 042/136] v4l2m2m: Rework decode to wait for missing buffer,
- add dynamic pending
-
-Previously receive_frame exited with EAGAIN if no capture buffer
-availble in the Q.  Now it waits in the hope that another thread will
-post one.
-
-The prefer dQ logic is now dynamic to help with cases where PTS/DTS
-lies.  If it looks like we are never getting a frame then the
-threshold is increased.  It then slowly decays over time to cope with
-false alarms.
----
- libavcodec/v4l2_buffers.c |  6 +++--
- libavcodec/v4l2_context.c |  7 +++--
- libavcodec/v4l2_context.h |  3 +++
- libavcodec/v4l2_m2m.h     |  2 ++
- libavcodec/v4l2_m2m_dec.c | 57 +++++++++++++++++++++++++++++++++++++--
- 5 files changed, 69 insertions(+), 6 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 62d1c26053..8c4f18dbed 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -947,12 +947,14 @@ int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf)
-         return AVERROR(err);
-     }
- 
-+    // Lock not wanted - if called from buffer free then lock already obtained
-     qc = atomic_fetch_add(&avbuf->context->q_count, 1) + 1;
-+    avbuf->status = V4L2BUF_IN_DRIVER;
-+    pthread_cond_broadcast(&avbuf->context->cond);
-+
-     av_log(logger(avbuf), AV_LOG_DEBUG, "--- %s VIDIOC_QBUF: index %d, ts=%ld.%06ld count=%d\n",
-            avbuf->context->name, avbuf->buf.index,
-            avbuf->buf.timestamp.tv_sec, avbuf->buf.timestamp.tv_usec, qc);
- 
--    avbuf->status = V4L2BUF_IN_DRIVER;
--
-     return 0;
- }
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 53b522d43e..7ddb759810 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -300,6 +300,7 @@ static int v4l2_stop_encode(V4L2Context *ctx)
- // Returns:
- //  0               Success
- //  AVERROR(EPIPE)  Nothing more to read
-+//  AVERROR(ENOSPC) No buffers in Q to put result in
- //  *               AVERROR(..)
- 
-  static int
-@@ -457,7 +458,7 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout
-             (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
-             (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
-             av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
--            return AVERROR(EAGAIN);
-+            return AVERROR(ENOSPC);
-         }
- 
-         // Timeout kludged s.t. "forever" eventually gives up & produces logging
-@@ -864,7 +865,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
-     int rv;
- 
-     if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
--        return rv;
-+        return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
- 
-     return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
- }
-@@ -938,6 +939,7 @@ void ff_v4l2_context_release(V4L2Context* ctx)
-     av_buffer_unref(&ctx->frames_ref);
- 
-     ff_mutex_destroy(&ctx->lock);
-+    pthread_cond_destroy(&ctx->cond);
- }
- 
- 
-@@ -1013,6 +1015,7 @@ int ff_v4l2_context_init(V4L2Context* ctx)
-     }
- 
-     ff_mutex_init(&ctx->lock, NULL);
-+    pthread_cond_init(&ctx->cond, NULL);
-     atomic_init(&ctx->q_count, 0);
- 
-     if (s->output_drm) {
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 565858a1ed..0efff58f18 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -116,6 +116,7 @@ typedef struct V4L2Context {
-     struct ff_weak_link_master *wl_master;
- 
-     AVMutex lock;
-+    pthread_cond_t cond;
- } V4L2Context;
- 
- /**
-@@ -182,6 +183,8 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
-  * @param[in] timeout The timeout for dequeue (-1 to block, 0 to return immediately, or milliseconds)
-  *
-  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
-+ *                AVERROR(ENOSPC) if no buffer availible to put
-+ *                the frame in
-  */
- int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* f, int timeout);
- 
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index f1923bb26d..9a20447030 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -105,6 +105,8 @@ typedef struct V4L2m2mContext {
- 
-     /* Frame tracking */
-     xlat_track_t xlat;
-+    int pending_hw;
-+    int pending_n;
- 
-     pts_stats_t pts_stat;
- 
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index e2b10f5e3a..2e30449dfc 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -251,7 +251,8 @@ xlat_pts_out(AVCodecContext *const avctx,
- 
-     frame->best_effort_timestamp = pts_stats_guess(ps);
-     frame->pkt_dts               = frame->pts;  // We can't emulate what s/w does in a useful manner?
--    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n", frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
-+    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
-+           frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
-     return 0;
- }
- 
-@@ -422,6 +423,36 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-     return ret;
- }
- 
-+static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
-+{
-+    int rv = 0;
-+
-+    ff_mutex_lock(&ctx->lock);
-+
-+    while (atomic_load(&ctx->q_count) == 0 && ctx->streamon) {
-+        if (pthread_cond_wait(&ctx->cond, &ctx->lock) != 0) {
-+            rv = AVERROR(errno);
-+            av_log(avctx, AV_LOG_ERROR, "Cond wait failure: %s\n", av_err2str(rv));
-+            break;
-+        }
-+    }
-+
-+    ff_mutex_unlock(&ctx->lock);
-+    return rv;
-+}
-+
-+// Number of frames over what xlat_pending returns that we keep *16
-+// This is a min value - if it appears to be too small the threshold should
-+// adjust dynamically.
-+#define PENDING_HW_MIN      (3 * 16)
-+// Offset to use when setting dynamically
-+// Set to %16 == 15 to avoid the threshold changing immediately as we relax
-+#define PENDING_HW_OFFSET   (PENDING_HW_MIN - 1)
-+// Number of consecutive times we've failed to get a frame when we prefer it
-+// before we increase the prefer threshold (5ms * N = max expected decode
-+// time)
-+#define PENDING_N_THRESHOLD 6
-+
- static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- {
-     V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-@@ -431,7 +462,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- 
-     do {
-         const int pending = xlat_pending(&s->xlat);
--        const int prefer_dq = (pending > 5);
-+        const int prefer_dq = (pending > s->pending_hw / 16);
- 
-         // Enqueue another pkt for decode if
-         // (a) We don't have a lot of stuff in the buffer already OR
-@@ -465,6 +496,27 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                 // there is room in the input Q and timeout == -1
-                 dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
- 
-+                // Failure due to no buffer in Q?
-+                if (dst_rv == AVERROR(ENOSPC)) {
-+                    // Wait & retry
-+                    if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
-+                        dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
-+                    }
-+                }
-+
-+                // Adjust dynamic pending threshold
-+                if (dst_rv == 0) {
-+                    if (--s->pending_hw < PENDING_HW_MIN)
-+                        s->pending_hw = PENDING_HW_MIN;
-+                    s->pending_n = 0;
-+                }
-+                else if (dst_rv == AVERROR(EAGAIN)) {
-+                    if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
-+                        s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
-+                        s->pending_n = 0;
-+                    }
-+                }
-+
-                 if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
-                     av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
-                     dst_rv = AVERROR_EOF;
-@@ -613,6 +665,7 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- 
-     xlat_init(&s->xlat);
-     pts_stats_init(&s->pts_stat, avctx, "decoder");
-+    s->pending_hw = PENDING_HW_MIN;
- 
-     capture = &s->capture;
-     output = &s->output;
-
-From 33765b769b4301e03f31b65e225fcdb0eff4c0e4 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 25 Mar 2022 15:37:58 +0000
-Subject: [PATCH 043/136] v4l2_m2m2_dec: Avoid loop if unable to resize buffers
-
-If source change signals a buffer size that cannot be honored give up
-rather than looping indefinitely.  This happens on Pi if (say) a
-2560x1440 h264 stream is presented to the decode.
----
- libavcodec/v4l2_context.c | 13 +++++++++++--
- 1 file changed, 11 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 7ddb759810..007a58c8f1 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -205,8 +205,9 @@ static int do_source_change(V4L2m2mContext * const s)
- 
-     s->capture.sample_aspect_ratio = v4l2_get_sar(&s->capture);
- 
--    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, crop %dx%d @ %d,%d, reinit=%d\n",
-+    av_log(avctx, AV_LOG_DEBUG, "Source change: SAR: %d/%d, wxh %dx%d crop %dx%d @ %d,%d, reinit=%d\n",
-            s->capture.sample_aspect_ratio.num, s->capture.sample_aspect_ratio.den,
-+           s->capture.width, s->capture.height,
-            s->capture.selection.width, s->capture.selection.height,
-            s->capture.selection.left, s->capture.selection.top, reinit);
- 
-@@ -224,9 +225,17 @@ static int do_source_change(V4L2m2mContext * const s)
-             return AVERROR(EINVAL);
-         }
- 
-+        if (s->capture.width > ff_v4l2_get_format_width(&s->capture.format) ||
-+            s->capture.height > ff_v4l2_get_format_height(&s->capture.format)) {
-+            av_log(avctx, AV_LOG_ERROR, "Format post reinit too small: wanted %dx%d > got %dx%d\n",
-+                   s->capture.width, s->capture.height,
-+                   ff_v4l2_get_format_width(&s->capture.format), ff_v4l2_get_format_height(&s->capture.format));
-+            return AVERROR(EINVAL);
-+        }
-+
-         // Update pixel format - should only actually do something on initial change
-         s->capture.av_pix_fmt =
--        ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
-+            ff_v4l2_format_v4l2_to_avfmt(ff_v4l2_get_format_pixelformat(&s->capture.format), AV_CODEC_ID_RAWVIDEO);
-         if (s->output_drm) {
-             avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-             avctx->sw_pix_fmt = s->capture.av_pix_fmt;
-
-From bb7ad2392ce83149a1ba40ecacb36e051b6bf785 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 25 Mar 2022 18:14:40 +0000
-Subject: [PATCH 044/136] v4l2dec: Improve size/format validation on init
-
----
- libavcodec/v4l2_m2m_dec.c      | 84 ++++++++++++++++++++++++++++++++--
- libavcodec/v4l2_request_hevc.c | 11 +++++
- 2 files changed, 92 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 2e30449dfc..8dcadf461b 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -592,6 +592,76 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- }
- #endif
- 
-+static int
-+check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
-+{
-+    unsigned int i;
-+    const uint32_t fcc = ff_v4l2_get_format_pixelformat(&s->capture.format);
-+    const uint32_t w = avctx->coded_width;
-+    const uint32_t h = avctx->coded_height;
-+
-+    if (w == 0 || h == 0 || fcc == 0) {
-+        av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
-+        return 0;
-+    }
-+
-+    for (i = 0;; ++i) {
-+        struct v4l2_frmsizeenum fs = {
-+            .index = i,
-+            .pixel_format = fcc,
-+        };
-+
-+        while (ioctl(s->fd, VIDIOC_ENUM_FRAMESIZES, &fs) != 0) {
-+            const int err = AVERROR(errno);
-+            if (err == AVERROR(EINTR))
-+                continue;
-+            if (i == 0 && err == AVERROR(ENOTTY)) {
-+                av_log(avctx, AV_LOG_DEBUG, "Framesize enum not supported\n");
-+                return 0;
-+            }
-+            if (err != AVERROR(EINVAL)) {
-+                av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
-+                return err;
-+            }
-+            av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n",
-+                   w, h, av_fourcc2str(fcc));
-+            return err;
-+        }
-+
-+        switch (fs.type) {
-+            case V4L2_FRMSIZE_TYPE_DISCRETE:
-+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Discrete: %dx%d\n", __func__, i,
-+                       fs.discrete.width,fs.discrete.height);
-+                if (w == fs.discrete.width && h == fs.discrete.height)
-+                    return 0;
-+                break;
-+            case V4L2_FRMSIZE_TYPE_STEPWISE:
-+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Stepwise: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
-+                       fs.stepwise.min_width, fs.stepwise.min_height,
-+                       fs.stepwise.max_width, fs.stepwise.max_height,
-+                       fs.stepwise.step_width,fs.stepwise.step_height);
-+                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
-+                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height &&
-+                    (w - fs.stepwise.min_width) % fs.stepwise.step_width == 0 &&
-+                    (h - fs.stepwise.min_height) % fs.stepwise.step_height == 0)
-+                    return 0;
-+                break;
-+            case V4L2_FRMSIZE_TYPE_CONTINUOUS:
-+                av_log(avctx, AV_LOG_TRACE, "%s[%d]: Continuous: Min: %dx%d Max: %dx%d, Step: %dx%d\n", __func__, i,
-+                       fs.stepwise.min_width, fs.stepwise.min_height,
-+                       fs.stepwise.max_width, fs.stepwise.max_height,
-+                       fs.stepwise.step_width,fs.stepwise.step_height);
-+                if (w >= fs.stepwise.min_width && w <= fs.stepwise.max_width &&
-+                    h >= fs.stepwise.min_height && h <= fs.stepwise.max_height)
-+                    return 0;
-+                break;
-+            default:
-+                av_log(avctx, AV_LOG_ERROR, "Unexpected framesize enum: %d", fs.type);
-+                return AVERROR(EINVAL);
-+        }
-+    }
-+}
-+
- static int
- get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
- {
-@@ -698,8 +768,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- 
-     avctx->sw_pix_fmt = avctx->pix_fmt;
-     gf_pix_fmt = ff_get_format(avctx, avctx->codec->pix_fmts);
--    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s); get_format requested=%d (%s)\n",
--           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt), gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
-+    av_log(avctx, AV_LOG_DEBUG, "avctx requested=%d (%s) %dx%d; get_format requested=%d (%s)\n",
-+           avctx->pix_fmt, av_get_pix_fmt_name(avctx->pix_fmt),
-+           avctx->coded_width, avctx->coded_height,
-+           gf_pix_fmt, av_get_pix_fmt_name(gf_pix_fmt));
- 
-     if (gf_pix_fmt == AV_PIX_FMT_DRM_PRIME || avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME) {
-         avctx->pix_fmt = AV_PIX_FMT_DRM_PRIME;
-@@ -730,7 +802,13 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     if ((ret = v4l2_prepare_decoder(s)) < 0)
-         return ret;
- 
--    return get_quirks(avctx, s);
-+    if ((ret = get_quirks(avctx, s)) != 0)
-+        return ret;
-+
-+    if ((ret = check_size(avctx, s)) != 0)
-+        return ret;
-+
-+    return 0;
- }
- 
- static av_cold int v4l2_decode_close(AVCodecContext *avctx)
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-index b0a5930844..76ab0916cd 100644
---- a/libavcodec/v4l2_request_hevc.c
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -147,6 +147,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
- 
-     av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
- 
-+    // Give up immediately if this is something that we have no code to deal with
-+    if (h->ps.sps->chroma_format_idc != 1) {
-+        av_log(avctx, AV_LOG_WARNING, "chroma_format_idc(%d) != 1: Not implemented\n", h->ps.sps->chroma_format_idc);
-+        return AVERROR_PATCHWELCOME;
-+    }
-+    if (!(h->ps.sps->bit_depth == 10 || h->ps.sps->bit_depth == 8) ||
-+        h->ps.sps->bit_depth != h->ps.sps->bit_depth_chroma) {
-+        av_log(avctx, AV_LOG_WARNING, "Bit depth Y:%d C:%d: Not implemented\n", h->ps.sps->bit_depth, h->ps.sps->bit_depth_chroma);
-+        return AVERROR_PATCHWELCOME;
-+    }
-+
-     if ((ret = devscan_build(avctx, &ctx->devscan)) != 0) {
-         av_log(avctx, AV_LOG_WARNING, "Failed to find any V4L2 devices\n");
-         return (AVERROR(-ret));
-
-From 4646b558c0e45f506578a5a452820f55983abc82 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 13 Apr 2022 16:05:56 +0000
-Subject: [PATCH 045/136] v4l2 stateless hevc: Add another API variation for
- linux 5.18
-
-This is probably going to be a short lived variation and may end up
-being reverted if no release using it ever ends up in the wild.
----
- libavcodec/Makefile            |   2 +-
- libavcodec/hevc-ctrls-v3.h     | 255 +++++++++++++++++++++++++++++++++
- libavcodec/v4l2_req_hevc_v3.c  |   3 +
- libavcodec/v4l2_req_hevc_vx.c  |  17 +++
- libavcodec/v4l2_req_media.c    |  15 +-
- libavcodec/v4l2_req_media.h    |   3 +
- libavcodec/v4l2_request_hevc.c |   6 +-
- libavcodec/v4l2_request_hevc.h |   1 +
- 8 files changed, 295 insertions(+), 7 deletions(-)
- create mode 100644 libavcodec/hevc-ctrls-v3.h
- create mode 100644 libavcodec/v4l2_req_hevc_v3.c
-
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index e1aa0ba014..2b3c16185d 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
- OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
- OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec.o
- OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o\
--                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o
-+                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
- OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
-diff --git a/libavcodec/hevc-ctrls-v3.h b/libavcodec/hevc-ctrls-v3.h
-new file mode 100644
-index 0000000000..4e35bd583d
---- /dev/null
-+++ b/libavcodec/hevc-ctrls-v3.h
-@@ -0,0 +1,255 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * These are the HEVC state controls for use with stateless HEVC
-+ * codec drivers.
-+ *
-+ * It turns out that these structs are not stable yet and will undergo
-+ * more changes. So keep them private until they are stable and ready to
-+ * become part of the official public API.
-+ */
-+
-+#ifndef _HEVC_CTRLS_H_
-+#define _HEVC_CTRLS_H_
-+
-+#include <linux/videodev2.h>
-+
-+/* The pixel format isn't stable at the moment and will likely be renamed. */
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SPS		(V4L2_CID_CODEC_BASE + 1008)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_PPS		(V4L2_CID_CODEC_BASE + 1009)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_BASE + 1010)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_BASE + 1011)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_BASE + 1012)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE	(V4L2_CID_CODEC_BASE + 1015)
-+#define V4L2_CID_MPEG_VIDEO_HEVC_START_CODE	(V4L2_CID_CODEC_BASE + 1016)
-+
-+/* enum v4l2_ctrl_type type values */
-+#define V4L2_CTRL_TYPE_HEVC_SPS 0x0120
-+#define V4L2_CTRL_TYPE_HEVC_PPS 0x0121
-+#define V4L2_CTRL_TYPE_HEVC_SLICE_PARAMS 0x0122
-+#define V4L2_CTRL_TYPE_HEVC_SCALING_MATRIX 0x0123
-+#define V4L2_CTRL_TYPE_HEVC_DECODE_PARAMS 0x0124
-+
-+enum v4l2_mpeg_video_hevc_decode_mode {
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED,
-+	V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_mpeg_video_hevc_start_code {
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE,
-+	V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B	0
-+#define V4L2_HEVC_SLICE_TYPE_P	1
-+#define V4L2_HEVC_SLICE_TYPE_I	2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-+
-+/* The controls are not stable at the moment and will likely be reworked. */
-+struct v4l2_ctrl_hevc_sps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Sequence parameter set */
-+	__u16	pic_width_in_luma_samples;
-+	__u16	pic_height_in_luma_samples;
-+	__u8	bit_depth_luma_minus8;
-+	__u8	bit_depth_chroma_minus8;
-+	__u8	log2_max_pic_order_cnt_lsb_minus4;
-+	__u8	sps_max_dec_pic_buffering_minus1;
-+	__u8	sps_max_num_reorder_pics;
-+	__u8	sps_max_latency_increase_plus1;
-+	__u8	log2_min_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_luma_coding_block_size;
-+	__u8	log2_min_luma_transform_block_size_minus2;
-+	__u8	log2_diff_max_min_luma_transform_block_size;
-+	__u8	max_transform_hierarchy_depth_inter;
-+	__u8	max_transform_hierarchy_depth_intra;
-+	__u8	pcm_sample_bit_depth_luma_minus1;
-+	__u8	pcm_sample_bit_depth_chroma_minus1;
-+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-+	__u8	num_short_term_ref_pic_sets;
-+	__u8	num_long_term_ref_pics_sps;
-+	__u8	chroma_format_idc;
-+	__u8	sps_max_sub_layers_minus1;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
-+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
-+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
-+
-+struct v4l2_ctrl_hevc_pps {
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture parameter set */
-+	__u8	num_extra_slice_header_bits;
-+	__u8	num_ref_idx_l0_default_active_minus1;
-+	__u8	num_ref_idx_l1_default_active_minus1;
-+	__s8	init_qp_minus26;
-+	__u8	diff_cu_qp_delta_depth;
-+	__s8	pps_cb_qp_offset;
-+	__s8	pps_cr_qp_offset;
-+	__u8	num_tile_columns_minus1;
-+	__u8	num_tile_rows_minus1;
-+	__u8	column_width_minus1[20];
-+	__u8	row_height_minus1[22];
-+	__s8	pps_beta_offset_div2;
-+	__s8	pps_tc_offset_div2;
-+	__u8	log2_parallel_merge_level_minus2;
-+
-+	__u8	padding[4];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
-+
-+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-+
-+struct v4l2_hevc_dpb_entry {
-+	__u64	timestamp;
-+	__u8	flags;
-+	__u8	field_pic;
-+	__u16	pic_order_cnt[2];
-+	__u8	padding[2];
-+};
-+
-+struct v4l2_hevc_pred_weight_table {
-+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__u8	padding[6];
-+
-+	__u8	luma_log2_weight_denom;
-+	__s8	delta_chroma_log2_weight_denom;
-+};
-+
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
-+
-+struct v4l2_ctrl_hevc_slice_params {
-+	__u32	bit_size;
-+	__u32	data_bit_offset;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u32	slice_segment_addr;
-+	__u32	num_entry_point_offsets;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+	__u8	nal_unit_type;
-+	__u8	nuh_temporal_id_plus1;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	slice_type;
-+	__u8	colour_plane_id;
-+	__u16	slice_pic_order_cnt;
-+	__u8	num_ref_idx_l0_active_minus1;
-+	__u8	num_ref_idx_l1_active_minus1;
-+	__u8	collocated_ref_idx;
-+	__u8	five_minus_max_num_merge_cand;
-+	__s8	slice_qp_delta;
-+	__s8	slice_cb_qp_offset;
-+	__s8	slice_cr_qp_offset;
-+	__s8	slice_act_y_qp_offset;
-+	__s8	slice_act_cb_qp_offset;
-+	__s8	slice_act_cr_qp_offset;
-+	__s8	slice_beta_offset_div2;
-+	__s8	slice_tc_offset_div2;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+	__u8	pic_struct;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+
-+	__u8	padding[5];
-+
-+	__u32	entry_point_offset_minus1[256];
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-+	struct v4l2_hevc_pred_weight_table pred_weight_table;
-+
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
-+
-+struct v4l2_ctrl_hevc_decode_params {
-+	__s32	pic_order_cnt_val;
-+	__u8	num_active_dpb_entries;
-+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	num_poc_st_curr_before;
-+	__u8	num_poc_st_curr_after;
-+	__u8	num_poc_lt_curr;
-+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u64	flags;
-+};
-+
-+struct v4l2_ctrl_hevc_scaling_matrix {
-+	__u8	scaling_list_4x4[6][16];
-+	__u8	scaling_list_8x8[6][64];
-+	__u8	scaling_list_16x16[6][64];
-+	__u8	scaling_list_32x32[2][64];
-+	__u8	scaling_list_dc_coef_16x16[6];
-+	__u8	scaling_list_dc_coef_32x32[2];
-+};
-+
-+/*  MPEG-class control IDs specific to the Hantro driver as defined by V4L2 */
-+#define V4L2_CID_CODEC_HANTRO_BASE				(V4L2_CTRL_CLASS_CODEC | 0x1200)
-+/*
-+ * V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP -
-+ * the number of data (in bits) to skip in the
-+ * slice segment header.
-+ * If non-IDR, the bits to be skipped go from syntax element "pic_output_flag"
-+ * to before syntax element "slice_temporal_mvp_enabled_flag".
-+ * If IDR, the skipped bits are just "pic_output_flag"
-+ * (separate_colour_plane_flag is not supported).
-+ */
-+#define V4L2_CID_HANTRO_HEVC_SLICE_HEADER_SKIP	(V4L2_CID_CODEC_HANTRO_BASE + 0)
-+
-+#endif
-diff --git a/libavcodec/v4l2_req_hevc_v3.c b/libavcodec/v4l2_req_hevc_v3.c
-new file mode 100644
-index 0000000000..dcc8d95632
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_v3.c
-@@ -0,0 +1,3 @@
-+#define HEVC_CTRLS_VERSION 3
-+#include "v4l2_req_hevc_vx.c"
-+
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index 0ae03b10c4..611fa21cc3 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -16,6 +16,8 @@
- 
- #elif HEVC_CTRLS_VERSION == 2
- #include "hevc-ctrls-v2.h"
-+#elif HEVC_CTRLS_VERSION == 3
-+#include "hevc-ctrls-v3.h"
- #else
- #error Unknown HEVC_CTRLS_VERSION
- #endif
-@@ -147,6 +149,7 @@ static void fill_pred_table(const HEVCContext *h, struct v4l2_hevc_pred_weight_t
-     }
- }
- 
-+#if HEVC_CTRLS_VERSION <= 2
- static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
- {
-     const HEVCFrame *frame;
-@@ -172,6 +175,7 @@ static int find_frame_rps_type(const HEVCContext *h, uint64_t timestamp)
- 
-     return 0;
- }
-+#endif
- 
- static unsigned int
- get_ref_pic_index(const HEVCContext *h, const HEVCFrame *frame,
-@@ -247,7 +251,12 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const
-             struct v4l2_hevc_dpb_entry * const entry = entries + n++;
- 
-             entry->timestamp = frame_capture_dpb(frame->frame);
-+#if HEVC_CTRLS_VERSION <= 2
-             entry->rps = find_frame_rps_type(h, entry->timestamp);
-+#else
-+            entry->flags = (frame->flags & HEVC_FRAME_FLAG_LONG_REF) == 0 ? 0 :
-+                V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE;
-+#endif
-             entry->field_pic = frame->frame->interlaced_frame;
- 
-             /* TODO: Interleaved: Get the POC for each field. */
-@@ -1011,6 +1020,14 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-     };
-     const unsigned int noof_ctrls = FF_ARRAY_ELEMS(qc);
- 
-+#if HEVC_CTRLS_VERSION == 2
-+    if (mediabufs_ctl_driver_version(ctx->mbufs) >= MEDIABUFS_DRIVER_VERSION(5, 18, 0))
-+        return AVERROR(EINVAL);
-+#elif HEVC_CTRLS_VERSION == 3
-+    if (mediabufs_ctl_driver_version(ctx->mbufs) < MEDIABUFS_DRIVER_VERSION(5, 18, 0))
-+        return AVERROR(EINVAL);
-+#endif
-+
-     if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
-         av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
-         return AVERROR(EINVAL);
-diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
-index eb00ecb406..980b306b8a 100644
---- a/libavcodec/v4l2_req_media.c
-+++ b/libavcodec/v4l2_req_media.c
-@@ -604,6 +604,7 @@ struct mediabufs_ctl {
- 
-     struct v4l2_format src_fmt;
-     struct v4l2_format dst_fmt;
-+    struct v4l2_capability capability;
- };
- 
- static int qe_v4l2_queue(struct qent_base *const be,
-@@ -1498,20 +1499,24 @@ void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc)
-     mediabufs_ctl_delete(mbc);
- }
- 
-+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc)
-+{
-+    return mbc->capability.version;
-+}
-+
- static int set_capabilities(struct mediabufs_ctl *const mbc)
- {
--    struct v4l2_capability capability = { 0 };
-     uint32_t caps;
- 
--    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &capability)) {
-+    if (ioctl(mbc->vfd, VIDIOC_QUERYCAP, &mbc->capability)) {
-         int err = errno;
-         request_err(mbc->dc, "Failed to get capabilities: %s\n", strerror(err));
-         return -err;
-     }
- 
--    caps = (capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
--            capability.device_caps :
--            capability.capabilities;
-+    caps = (mbc->capability.capabilities & V4L2_CAP_DEVICE_CAPS) != 0 ?
-+            mbc->capability.device_caps :
-+            mbc->capability.capabilities;
- 
-     if ((caps & V4L2_CAP_VIDEO_M2M_MPLANE) != 0) {
-         mbc->src_fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
-diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
-index 2f826cfb14..0307a831de 100644
---- a/libavcodec/v4l2_req_media.h
-+++ b/libavcodec/v4l2_req_media.h
-@@ -142,6 +142,9 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
-                   struct dmabufs_ctl * const dbsc,
-                   unsigned int n);
- 
-+#define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
-+unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
-+
- struct mediabufs_ctl * mediabufs_ctl_new(void * const dc,
-                      const char *vpath, struct pollqueue *const pq);
- void mediabufs_ctl_unref(struct mediabufs_ctl **const pmbc);
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-index 76ab0916cd..20e4e0ab15 100644
---- a/libavcodec/v4l2_request_hevc.c
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-         goto fail4;
-     }
- 
--    if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
-+    if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
-+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
-+        ctx->fns = &V2(ff_v4l2_req_hevc, 3);
-+    }
-+    else if (V2(ff_v4l2_req_hevc, 2).probe(avctx, ctx) == 0) {
-         av_log(avctx, AV_LOG_DEBUG, "HEVC API version 2 probed successfully\n");
-         ctx->fns = &V2(ff_v4l2_req_hevc, 2);
-     }
-diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
-index f14f594564..ed48d62e2d 100644
---- a/libavcodec/v4l2_request_hevc.h
-+++ b/libavcodec/v4l2_request_hevc.h
-@@ -98,5 +98,6 @@ typedef struct v4l2_req_decode_fns {
- 
- extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
- extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
-+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
- 
- #endif
-
-From 92160173e701aa7e2f1011e63596e48d15e691a9 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 3 May 2022 12:44:42 +0000
-Subject: [PATCH 046/136] Remove V4l2 frame size check for meson-vdec
-
----
- libavcodec/v4l2_m2m.h     |  3 ++-
- libavcodec/v4l2_m2m_dec.c | 10 +++++++---
- 2 files changed, 9 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 9a20447030..6bd5e8eda7 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -116,7 +116,8 @@ typedef struct V4L2m2mContext {
-     /* Ext data sent */
-     int extdata_sent;
- 
--#define FF_V4L2_QUIRK_REINIT_ALWAYS     1
-+#define FF_V4L2_QUIRK_REINIT_ALWAYS             1
-+#define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN    2
-     /* Quirks */
-     unsigned int quirks;
- 
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 8dcadf461b..888ba67fea 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -604,6 +604,10 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
-         av_log(avctx, AV_LOG_TRACE, "%s: Size %dx%d or fcc %s empty\n", __func__, w, h, av_fourcc2str(fcc));
-         return 0;
-     }
-+    if ((s->quirks & FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN) != 0) {
-+        av_log(avctx, AV_LOG_TRACE, "%s: Skipped (quirk): Size %dx%d, fcc %s\n", __func__, w, h, av_fourcc2str(fcc));
-+        return 0;
-+    }
- 
-     for (i = 0;; ++i) {
-         struct v4l2_frmsizeenum fs = {
-@@ -623,8 +627,8 @@ check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
-                 av_log(avctx, AV_LOG_ERROR, "Failed to enum framesizes: %s", av_err2str(err));
-                 return err;
-             }
--            av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in frame size enums\n",
--                   w, h, av_fourcc2str(fcc));
-+            av_log(avctx, AV_LOG_WARNING, "Failed to find Size=%dx%d, fmt=%s in %u frame size enums\n",
-+                   w, h, av_fourcc2str(fcc), i);
-             return err;
-         }
- 
-@@ -684,7 +688,7 @@ get_quirks(AVCodecContext * const avctx, V4L2m2mContext * const s)
-     // capture to clear the event even if the capture buffers were the right
-     // size in the first place.
-     if (strcmp(cap.driver, "meson-vdec") == 0)
--        s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS;
-+        s->quirks |= FF_V4L2_QUIRK_REINIT_ALWAYS | FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN;
- 
-     av_log(avctx, AV_LOG_DEBUG, "Driver '%s': Quirks=%#x\n", cap.driver, s->quirks);
-     return 0;
-
-From 8ba5576e7fcd24c2f450f0295cc3b6d8e82e8649 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 23 May 2022 18:05:20 +0100
-Subject: [PATCH 047/136] v4l2m2m_dec: Make some error rturns a bit more robust
-
----
- libavcodec/v4l2_context.c |  5 ++---
- libavcodec/v4l2_m2m_dec.c | 23 ++++++++++++++---------
- 2 files changed, 16 insertions(+), 12 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 007a58c8f1..b3662aedaa 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -765,7 +765,7 @@ static int stuff_all_buffers(AVCodecContext * avctx, V4L2Context* ctx)
- int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
- {
-     int type = ctx->type;
--    int ret;
-+    int ret = 0;
-     AVCodecContext * const avctx = logger(ctx);
- 
-     // Avoid doing anything if there is nothing we can do
-@@ -777,8 +777,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
-     if (cmd == VIDIOC_STREAMON && !V4L2_TYPE_IS_OUTPUT(ctx->type))
-         stuff_all_buffers(avctx, ctx);
- 
--    ret = ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type);
--    if (ret < 0) {
-+    if (ioctl(ctx_to_m2mctx(ctx)->fd, cmd, &type) < 0) {
-         const int err = errno;
-         av_log(avctx, AV_LOG_ERROR, "%s set status %d (%s) failed: err=%d\n", ctx->name,
-                cmd, (cmd == VIDIOC_STREAMON) ? "ON" : "OFF", err);
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 888ba67fea..88a341aae2 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -110,16 +110,21 @@ static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *co
-         return 0;
- 
-     ret = ff_v4l2_context_set_status(&s->output, VIDIOC_STREAMON);
--    if (ret < 0)
--        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context\n");
--
--    ret = ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd);
--    if (ret < 0)
--        av_log(avctx, AV_LOG_ERROR, "VIDIOC_DECODER_CMD start error: %d\n", errno);
--    else
--        av_log(avctx, AV_LOG_DEBUG, "VIDIOC_DECODER_CMD start OK\n");
-+    if (ret != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "VIDIOC_STREAMON on output context: %s\n", av_err2str(ret));
-+        return ret;
-+    }
- 
--    return ret;
-+    // STREAMON should do implicit START so this just for those that don't.
-+    // It is optional so don't worry if it fails
-+    if (ioctl(s->fd, VIDIOC_DECODER_CMD, &cmd) < 0) {
-+        ret = AVERROR(errno);
-+        av_log(avctx, AV_LOG_WARNING, "VIDIOC_DECODER_CMD start error: %s\n", av_err2str(ret));
-+    }
-+    else {
-+        av_log(avctx, AV_LOG_TRACE, "VIDIOC_DECODER_CMD start OK\n");
-+    }
-+    return 0;
- }
- 
- static int v4l2_try_start(AVCodecContext *avctx)
-
-From aafa5968f8713319be35cf26069c98566d5bf59b Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 24 May 2022 17:02:58 +0000
-Subject: [PATCH 048/136] v4l2m2m_dec: Support in-pkt AV_PKT_DATA_NEW_EXTRADATA
-
-Support packet side-data containing AV_PKT_DATA_NEW_EXTRADATA.  Should
-also detect and complain about unexpected streams of empty packets.
-
-This functionality untested as I haven't yet found anything that creates
-NEW_EXTRADATA side data.
----
- libavcodec/v4l2_m2m.c     |  1 +
- libavcodec/v4l2_m2m.h     |  3 +++
- libavcodec/v4l2_m2m_dec.c | 49 ++++++++++++++++++++++++++++++++++++---
- 3 files changed, 50 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index e26bd74c3e..6dd01e2e00 100644
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -251,6 +251,7 @@ static void v4l2_m2m_destroy_context(void *opaque, uint8_t *context)
-     av_frame_unref(s->frame);
-     av_frame_free(&s->frame);
-     av_packet_unref(&s->buf_pkt);
-+    av_freep(&s->extdata_data);
- 
-     av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Context destroyed\n");
- 
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 6bd5e8eda7..19d618698d 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -115,6 +115,9 @@ typedef struct V4L2m2mContext {
- 
-     /* Ext data sent */
-     int extdata_sent;
-+    /* Ext data sent in packet - overrides ctx */
-+    uint8_t * extdata_data;
-+    size_t extdata_size;
- 
- #define FF_V4L2_QUIRK_REINIT_ALWAYS             1
- #define FF_V4L2_QUIRK_ENUM_FRAMESIZES_BROKEN    2
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 88a341aae2..392a68f0c7 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -343,7 +343,46 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-     // We will already have a coded pkt if the output Q was full last time we
-     // tried to Q it
-     if (!s->buf_pkt.size && !do_not_get) {
--        ret = ff_decode_get_packet(avctx, &s->buf_pkt);
-+        unsigned int i;
-+
-+        for (i = 0; i < 256; ++i) {
-+            uint8_t * side_data;
-+            size_t side_size;
-+
-+            ret = ff_decode_get_packet(avctx, &s->buf_pkt);
-+            if (ret != 0)
-+                break;
-+
-+            // New extradata is the only side-data we undertand
-+            side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
-+            if (side_data) {
-+                av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
-+                av_freep(&s->extdata_data);
-+                if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
-+                    av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size);
-+                    return AVERROR(ENOMEM);
-+                }
-+                memcpy(s->extdata_data, side_data, side_size);
-+                s->extdata_size = side_size;
-+                s->extdata_sent = 0;
-+            }
-+
-+            if (s->buf_pkt.size != 0)
-+                break;
-+
-+            if (s->buf_pkt.side_data_elems == 0) {
-+                av_log(avctx, AV_LOG_WARNING, "Empty pkt from ff_decode_get_packet - treating as EOF\n");
-+                ret = AVERROR_EOF;
-+                break;
-+            }
-+
-+            // Retry a side-data only pkt
-+        }
-+        // If i >= 256 something has gone wrong
-+        if (i >= 256) {
-+            av_log(avctx, AV_LOG_ERROR, "Too many side-data only packets\n");
-+            return AVERROR(EIO);
-+        }
- 
-         if (ret == AVERROR(EAGAIN)) {
-             if (!stream_started(s)) {
-@@ -398,8 +437,12 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-     if ((ret = check_output_streamon(avctx, s)) != 0)
-         return ret;
- 
--    ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt,
--                                         avctx->extradata, s->extdata_sent ? 0 : avctx->extradata_size);
-+    if (s->extdata_sent)
-+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
-+    else if (s->extdata_data)
-+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
-+    else
-+        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
- 
-     if (ret == AVERROR(EAGAIN)) {
-         // Out of input buffers - keep packet
-
-From e9bced67bdb40096d31067d41956276e9e1af11a Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 24 May 2022 20:02:48 +0000
-Subject: [PATCH 049/136] v4l2m2m_dec: Catch repeated Q fulls
-
----
- libavcodec/v4l2_m2m_dec.c | 8 +++++++-
- 1 file changed, 7 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 392a68f0c7..7e17044706 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -504,13 +504,14 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
- static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- {
-     V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    int src_rv;
-+    int src_rv = NQ_OK;
-     int dst_rv = 1;  // Non-zero (done), non-negative (error) number
-     unsigned int i = 0;
- 
-     do {
-         const int pending = xlat_pending(&s->xlat);
-         const int prefer_dq = (pending > s->pending_hw / 16);
-+        const int last_src_rv = src_rv;
- 
-         // Enqueue another pkt for decode if
-         // (a) We don't have a lot of stuff in the buffer already OR
-@@ -526,6 +527,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-         if ((i != 0 || s->req_pkt) && src_rv == NQ_SRC_EMPTY)
-             break;
- 
-+        if (src_rv == NQ_Q_FULL && last_src_rv == NQ_Q_FULL) {
-+            av_log(avctx, AV_LOG_WARNING, "Poll thinks src Q has space; none found\n");
-+            break;
-+        }
-+
-         // Try to get a new frame if
-         // (a) we haven't already got one AND
-         // (b) enqueue returned a status indicating that decode should be attempted
-
-From 0c974e4da2c0311836145f2fd42081d40eb15998 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 25 May 2022 15:22:12 +0000
-Subject: [PATCH 050/136] Remove requirement for epoxy & libudev config options
-
----
- configure              | 26 +++++++++++++++++---------
- pi-util/conf_native.sh |  2 --
- 2 files changed, 17 insertions(+), 11 deletions(-)
-
-diff --git a/configure b/configure
-index b41663c794..fdc95146bf 100755
---- a/configure
-+++ b/configure
-@@ -205,6 +205,7 @@ External library support:
-   --disable-bzlib          disable bzlib [autodetect]
-   --disable-coreimage      disable Apple CoreImage framework [autodetect]
-   --enable-chromaprint     enable audio fingerprinting with chromaprint [no]
-+  --disable-epoxy          disable epoxy [autodetect]
-   --enable-frei0r          enable frei0r video filtering [no]
-   --enable-gcrypt          enable gcrypt, needed for rtmp(t)e support
-                            if openssl, librtmp or gmp is not used [no]
-@@ -281,7 +282,7 @@ External library support:
-                            if openssl, gnutls or mbedtls is not used [no]
-   --enable-libtwolame      enable MP2 encoding via libtwolame [no]
-   --enable-libuavs3d       enable AVS3 decoding via libuavs3d [no]
--  --enable-libudev         enable libudev [no]
-+  --disable-libudev        disable libudev [autodetect]
-   --enable-libv4l2         enable libv4l2/v4l-utils [no]
-   --enable-libvidstab      enable video stabilization using vid.stab [no]
-   --enable-libvmaf         enable vmaf filter via libvmaf [no]
-@@ -1747,7 +1748,9 @@ EXTERNAL_AUTODETECT_LIBRARY_LIST="
-     avfoundation
-     bzlib
-     coreimage
-+    epoxy
-     iconv
-+    libudev
-     libxcb
-     libxcb_shm
-     libxcb_shape
-@@ -1819,7 +1822,6 @@ EXTERNAL_LIBRARY_LIST="
-     libdav1d
-     libdc1394
-     libdrm
--    epoxy
-     libflite
-     libfontconfig
-     libfreetype
-@@ -1863,7 +1865,6 @@ EXTERNAL_LIBRARY_LIST="
-     libtheora
-     libtwolame
-     libuavs3d
--    libudev
-     libv4l2
-     libvmaf
-     libvorbis
-@@ -3567,9 +3568,8 @@ v4l2_indev_suggest="libv4l2"
- v4l2_outdev_deps="libdrm"
- v4l2_outdev_deps_any="linux_videodev2_h sys_videoio_h"
- v4l2_outdev_suggest="libv4l2"
--vout_drm_outdev_deps="libdrm vout_drm"
--vout_egl_outdev_deps="xlib"
--vout_egl_outdev_select="epoxy"
-+vout_drm_outdev_deps="libdrm"
-+vout_egl_outdev_deps="xlib epoxy"
- vfwcap_indev_deps="vfw32 vfwcap_defines"
- xcbgrab_indev_deps="libxcb"
- xcbgrab_indev_suggest="libxcb_shm libxcb_shape libxcb_xfixes"
-@@ -6355,6 +6355,12 @@ if enabled xlib; then
-         disable xlib
- fi
- 
-+enabled libudev &&
-+    check_pkg_config libudev libudev libudev.h udev_new
-+
-+enabled epoxy &&
-+    check_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
-+
- check_headers direct.h
- check_headers dirent.h
- check_headers dxgidebug.h
-@@ -6601,7 +6607,6 @@ enabled libdav1d          && require_pkg_config libdav1d "dav1d >= 0.5.0" "dav1d
- enabled libdavs2          && require_pkg_config libdavs2 "davs2 >= 1.6.0" davs2.h davs2_decoder_open
- enabled libdc1394         && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new
- enabled libdrm            && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion
--enabled epoxy             && require_pkg_config epoxy epoxy epoxy/egl.h epoxy_egl_version
- enabled libfdk_aac        && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen ||
-                                { require libfdk_aac fdk-aac/aacenc_lib.h aacEncOpen -lfdk-aac &&
-                                  warn "using libfdk without pkg-config"; } }
-@@ -6713,7 +6718,6 @@ enabled libtwolame        && require libtwolame twolame.h twolame_init -ltwolame
-                              { check_lib libtwolame twolame.h twolame_encode_buffer_float32_interleaved -ltwolame ||
-                                die "ERROR: libtwolame must be installed and version must be >= 0.3.10"; }
- enabled libuavs3d         && require_pkg_config libuavs3d "uavs3d >= 1.1.41" uavs3d.h uavs3d_decode
--enabled libudev           && require_pkg_config libudev libudev libudev.h udev_new
- enabled libv4l2           && require_pkg_config libv4l2 libv4l2 libv4l2.h v4l2_ioctl
- enabled libvidstab        && require_pkg_config libvidstab "vidstab >= 0.98" vid.stab/libvidstab.h vsMotionDetectInit
- enabled libvmaf           && require_pkg_config libvmaf "libvmaf >= 2.0.0" libvmaf.h vmaf_init
-@@ -6819,9 +6823,13 @@ enabled rkmpp             && { require_pkg_config rkmpp rockchip_mpp  rockchip/r
- enabled v4l2_request      && { enabled libdrm ||
-                                die "ERROR: v4l2-request requires --enable-libdrm"; } &&
-                              { enabled libudev ||
--                               die "ERROR: v4l2-request requires --enable-libudev"; }
-+                               die "ERROR: v4l2-request requires libudev"; }
- enabled vapoursynth       && require_pkg_config vapoursynth "vapoursynth-script >= 42" VSScript.h vsscript_init
- 
-+enabled vout_drm && { enabled libdrm || die "ERROR: vout_drm requires --enable-libdrm"; }
-+
-+enabled vout_egl && { enabled epoxy || die "ERROR: vout_egl requires epoxy"; } &&
-+                    { enabled xlib  || die "ERROR: vout_egl requires xlib"; }
- 
- if enabled gcrypt; then
-     GCRYPT_CONFIG="${cross_prefix}libgcrypt-config"
-diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
-index 65576846e8..37cea71756 100755
---- a/pi-util/conf_native.sh
-+++ b/pi-util/conf_native.sh
-@@ -91,8 +91,6 @@ $FFSRC/configure \
-  --disable-thumb\
-  --enable-v4l2-request\
-  --enable-libdrm\
-- --enable-epoxy\
-- --enable-libudev\
-  --enable-vout-egl\
-  --enable-vout-drm\
-  $SHARED_LIBS\
-
-From 9f234d8cbde2829e6a70fd3cb6324998df8a31f3 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 27 May 2022 09:36:51 +0000
-Subject: [PATCH 051/136] hevc: If hwaccel avoid creation of s/w only vars
-
----
- libavcodec/hevc_refs.c | 35 +++++++++++++++++++++--------------
- libavcodec/hevcdec.c   | 42 +++++++++++++++++++++++++++++-------------
- 2 files changed, 50 insertions(+), 27 deletions(-)
-
-diff --git a/libavcodec/hevc_refs.c b/libavcodec/hevc_refs.c
-index 811e8feff8..f7cf14eabc 100644
---- a/libavcodec/hevc_refs.c
-+++ b/libavcodec/hevc_refs.c
-@@ -98,18 +98,22 @@ static HEVCFrame *alloc_frame(HEVCContext *s)
-         if (!frame->rpl_buf)
-             goto fail;
- 
--        frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
--        if (!frame->tab_mvf_buf)
--            goto fail;
--        frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
-+        if (s->tab_mvf_pool) {
-+            frame->tab_mvf_buf = av_buffer_pool_get(s->tab_mvf_pool);
-+            if (!frame->tab_mvf_buf)
-+                goto fail;
-+            frame->tab_mvf = (MvField *)frame->tab_mvf_buf->data;
-+        }
- 
--        frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
--        if (!frame->rpl_tab_buf)
--            goto fail;
--        frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
--        frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
--        for (j = 0; j < frame->ctb_count; j++)
--            frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
-+        if (s->rpl_tab_pool) {
-+            frame->rpl_tab_buf = av_buffer_pool_get(s->rpl_tab_pool);
-+            if (!frame->rpl_tab_buf)
-+                goto fail;
-+            frame->rpl_tab   = (RefPicListTab **)frame->rpl_tab_buf->data;
-+            frame->ctb_count = s->ps.sps->ctb_width * s->ps.sps->ctb_height;
-+            for (j = 0; j < frame->ctb_count; j++)
-+                frame->rpl_tab[j] = (RefPicListTab *)frame->rpl_buf->data;
-+        }
- 
-         frame->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
-         frame->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) || (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
-@@ -297,14 +301,17 @@ static int init_slice_rpl(HEVCContext *s)
-     int ctb_count    = frame->ctb_count;
-     int ctb_addr_ts  = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-     int i;
-+    RefPicListTab * const tab = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
- 
-     if (s->slice_idx >= frame->rpl_buf->size / sizeof(RefPicListTab))
-         return AVERROR_INVALIDDATA;
- 
--    for (i = ctb_addr_ts; i < ctb_count; i++)
--        frame->rpl_tab[i] = (RefPicListTab *)frame->rpl_buf->data + s->slice_idx;
-+    if (frame->rpl_tab) {
-+        for (i = ctb_addr_ts; i < ctb_count; i++)
-+            frame->rpl_tab[i] = tab;
-+    }
- 
--    frame->refPicList = (RefPicList *)frame->rpl_tab[ctb_addr_ts];
-+    frame->refPicList = tab->refPicList;
- 
-     return 0;
- }
-diff --git a/libavcodec/hevcdec.c b/libavcodec/hevcdec.c
-index 2867cb2e16..17f53322fb 100644
---- a/libavcodec/hevcdec.c
-+++ b/libavcodec/hevcdec.c
-@@ -536,6 +536,16 @@ static int set_sps(HEVCContext *s, const HEVCSPS *sps,
-     if (!sps)
-         return 0;
- 
-+    // If hwaccel then we don't need all the s/w decode helper arrays
-+    if (s->avctx->hwaccel) {
-+        export_stream_params(s, sps);
-+
-+        s->avctx->pix_fmt = pix_fmt;
-+        s->ps.sps = sps;
-+        s->ps.vps = (HEVCVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
-+        return 0;
-+    }
-+
-     ret = pic_arrays_init(s, sps);
-     if (ret < 0)
-         goto fail;
-@@ -2890,11 +2900,13 @@ static int hevc_frame_start(HEVCContext *s)
-                            ((s->ps.sps->height >> s->ps.sps->log2_min_cb_size) + 1);
-     int ret;
- 
--    memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
--    memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
--    memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
--    memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
--    memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
-+    if (s->horizontal_bs) {
-+        memset(s->horizontal_bs, 0, s->bs_width * s->bs_height);
-+        memset(s->vertical_bs,   0, s->bs_width * s->bs_height);
-+        memset(s->cbf_luma,      0, s->ps.sps->min_tb_width * s->ps.sps->min_tb_height);
-+        memset(s->is_pcm,        0, (s->ps.sps->min_pu_width + 1) * (s->ps.sps->min_pu_height + 1));
-+        memset(s->tab_slice_address, -1, pic_size_in_ctb * sizeof(*s->tab_slice_address));
-+    }
- 
-     s->is_decoded        = 0;
-     s->first_nal_type    = s->nal_unit_type;
-@@ -3438,15 +3450,19 @@ static int hevc_ref_frame(HEVCContext *s, HEVCFrame *dst, HEVCFrame *src)
-         dst->needs_fg = 1;
-     }
- 
--    dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
--    if (!dst->tab_mvf_buf)
--        goto fail;
--    dst->tab_mvf = src->tab_mvf;
-+    if (src->tab_mvf_buf) {
-+        dst->tab_mvf_buf = av_buffer_ref(src->tab_mvf_buf);
-+        if (!dst->tab_mvf_buf)
-+            goto fail;
-+        dst->tab_mvf = src->tab_mvf;
-+    }
- 
--    dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
--    if (!dst->rpl_tab_buf)
--        goto fail;
--    dst->rpl_tab = src->rpl_tab;
-+    if (src->rpl_tab_buf) {
-+        dst->rpl_tab_buf = av_buffer_ref(src->rpl_tab_buf);
-+        if (!dst->rpl_tab_buf)
-+            goto fail;
-+        dst->rpl_tab = src->rpl_tab;
-+    }
- 
-     dst->rpl_buf = av_buffer_ref(src->rpl_buf);
-     if (!dst->rpl_buf)
-
-From bb2ddc480634141bed9afd3f66e7f63f5091bb2f Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 30 May 2022 17:51:44 +0100
-Subject: [PATCH 052/136] rpi_sand: Add SAND30->NV12 conversion
-
-C code only. Reworks the hwcontext_drm conversion to use the
-rpi_sand_fns generic frame convert fn rather than calling the
-individual conversion functions directly. This keeps all teh stride and
-size logic in a single place.
----
- libavutil/hwcontext_drm.c | 46 ++++++++------------
- libavutil/rpi_sand_fns.c  | 89 +++++++++++++++++++++++++++++++++++++++
- libavutil/rpi_sand_fns.h  |  5 +++
- 3 files changed, 111 insertions(+), 29 deletions(-)
-
-diff --git a/libavutil/hwcontext_drm.c b/libavutil/hwcontext_drm.c
-index baf18920fa..137a952d2c 100644
---- a/libavutil/hwcontext_drm.c
-+++ b/libavutil/hwcontext_drm.c
-@@ -234,14 +234,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
-                                     enum AVHWFrameTransferDirection dir,
-                                     enum AVPixelFormat **formats)
- {
--    enum AVPixelFormat *pix_fmts;
-+    enum AVPixelFormat *p;
- 
--    pix_fmts = av_malloc_array(2, sizeof(*pix_fmts));
--    if (!pix_fmts)
-+    p = *formats = av_malloc_array(3, sizeof(*p));
-+    if (!p)
-         return AVERROR(ENOMEM);
- 
-     // **** Offer native sand too ????
--    pix_fmts[0] =
-+    *p++ =
- #if CONFIG_SAND
-         ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128 ?
-             AV_PIX_FMT_YUV420P :
-@@ -249,9 +249,14 @@ static int drm_transfer_get_formats(AVHWFramesContext *ctx,
-             AV_PIX_FMT_YUV420P10LE :
- #endif
-             ctx->sw_format;
--    pix_fmts[1] = AV_PIX_FMT_NONE;
- 
--    *formats = pix_fmts;
-+#if CONFIG_SAND
-+    if (ctx->sw_format == AV_PIX_FMT_RPI4_10 ||
-+        ctx->sw_format == AV_PIX_FMT_RPI4_8 || ctx->sw_format == AV_PIX_FMT_SAND128)
-+        *p++ = AV_PIX_FMT_NV12;
-+#endif
-+
-+    *p = AV_PIX_FMT_NONE;
-     return 0;
- }
- 
-@@ -294,29 +299,12 @@ static int drm_transfer_data_from(AVHWFramesContext *hwfc,
-         const unsigned int w = FFMIN(dst->width, map->width);
-         const unsigned int h = FFMIN(dst->height, map->height);
- 
--        if (map->format == AV_PIX_FMT_RPI4_8 && dst->format == AV_PIX_FMT_YUV420P) {
--            av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
--                                     map->data[0],
--                                     128, stride2,
--                                     0, 0, w, h);
--            av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
--                                     dst->data[2], dst->linesize[2],
--                                     map->data[1],
--                                     128, stride2,
--                                     0, 0, w / 2, h / 2);
--        }
--        else if (map->format == AV_PIX_FMT_RPI4_10 && dst->format == AV_PIX_FMT_YUV420P10LE) {
--            av_rpi_sand30_to_planar_y16(dst->data[0], dst->linesize[0],
--                                     map->data[0],
--                                     128, stride2,
--                                     0, 0, w, h);
--            av_rpi_sand30_to_planar_c16(dst->data[1], dst->linesize[1],
--                                     dst->data[2], dst->linesize[2],
--                                     map->data[1],
--                                     128, stride2,
--                                     0, 0, w / 2, h / 2);
--        }
--        else
-+        map->crop_top = 0;
-+        map->crop_bottom = 0;
-+        map->crop_left = 0;
-+        map->crop_right = 0;
-+
-+        if (av_rpi_sand_to_planar_frame(dst, map) != 0)
-         {
-             av_log(hwfc, AV_LOG_ERROR, "%s: Incompatible output pixfmt for sand\n", __func__);
-             err = AVERROR(EINVAL);
-diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
-index 1f543e9357..256c3d532f 100644
---- a/libavutil/rpi_sand_fns.c
-+++ b/libavutil/rpi_sand_fns.c
-@@ -229,6 +229,75 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_
-     }
- }
- 
-+// Fetches a single patch - offscreen fixup not done here
-+// w <= stride1
-+// single lose bottom 2 bits truncation
-+// _x & _w in pixels, strides in bytes
-+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h)
-+{
-+    const unsigned int x0 = (_x / 3) * 4; // Byte offset of the word
-+    const unsigned int xskip0 = _x - (x0 >> 2) * 3;
-+    const unsigned int x1 = ((_x + _w) / 3) * 4;
-+    const unsigned int xrem1 = _x + _w - (x1 >> 2) * 3;
-+    const unsigned int mask = stride1 - 1;
-+    const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
-+    const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
-+
-+#if HAVE_SAND_ASM && 0
-+    if (_x == 0) {
-+        ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
-+        return;
-+    }
-+#endif
-+
-+    if (x0 == x1) {
-+        // *******************
-+        // Partial single word xfer
-+        return;
-+    }
-+
-+    for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p0 += stride1)
-+    {
-+        unsigned int x = x0;
-+        const uint32_t * p = (const uint32_t *)p0;
-+        uint8_t * d = dst;
-+
-+        if (xskip0 != 0) {
-+            const uint32_t p3 = *p++;
-+
-+            if (xskip0 == 1)
-+                *d++ = (p3 >> 12) & 0xff;
-+            *d++ = (p3 >> 22) & 0xff;
-+
-+            if (((x += 4) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        while (x != x1) {
-+            const uint32_t p3 = *p++;
-+            *d++ = (p3 >> 2) & 0xff;
-+            *d++ = (p3 >> 12) & 0xff;
-+            *d++ = (p3 >> 22) & 0xff;
-+
-+            if (((x += 4) & mask) == 0)
-+                p += slice_inc;
-+        }
-+
-+        if (xrem1 != 0) {
-+            const uint32_t p3 = *p;
-+
-+            *d++ = (p3 >> 2) & 0xff;
-+            if (xrem1 == 2)
-+                *d++ = (p3 >> 12) & 0xff;
-+        }
-+    }
-+}
-+
-+
- 
- // w/h in pixels
- void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-@@ -310,6 +379,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
-                                              av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-                                              x/2, y/2,  w/2, h/2);
-                     break;
-+                case AV_PIX_FMT_NV12:
-+                    av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                             src->data[0],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x, y, w, h);
-+                    av_rpi_sand_to_planar_y8(dst->data[1], dst->linesize[1],
-+                                             src->data[1],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x/2, y/2, w, h/2);
-+                    break;
-                 default:
-                     return -1;
-             }
-@@ -344,6 +423,16 @@ int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
-                                              av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-                                              x/2, y/2, w/2, h/2);
-                     break;
-+                case AV_PIX_FMT_NV12:
-+                    av_rpi_sand30_to_planar_y8(dst->data[0], dst->linesize[0],
-+                                             src->data[0],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x, y, w, h);
-+                    av_rpi_sand30_to_planar_y8(dst->data[1], dst->linesize[1],
-+                                             src->data[1],
-+                                             av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+                                             x/2, y/2, w, h/2);
-+                    break;
-                 default:
-                     return -1;
-             }
-diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
-index 634b55e800..462ccb8abd 100644
---- a/libavutil/rpi_sand_fns.h
-+++ b/libavutil/rpi_sand_fns.h
-@@ -85,6 +85,11 @@ void av_rpi_sand30_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_
-                              unsigned int _x, unsigned int y,
-                              unsigned int _w, unsigned int h);
- 
-+void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-+                             const uint8_t * src,
-+                             unsigned int stride1, unsigned int stride2,
-+                             unsigned int _x, unsigned int y,
-+                             unsigned int _w, unsigned int h);
- 
- // w/h in pixels
- void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-
-From b55c351e6954c800229d97dc6c982ca8f998c848 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 1 Jun 2022 17:49:26 +0000
-Subject: [PATCH 053/136] rpi_sand: Add SAND30->NV12 asm for Armv7 & Armv8
-
-Also reworks the previous Armv8 SAND30->Y16 function in a slightly more
-efficient way that makes it look more like the Armv7 version.
----
- libavutil/aarch64/rpi_sand_neon.S | 549 ++++++++++++++++++------------
- libavutil/aarch64/rpi_sand_neon.h |   4 +
- libavutil/arm/rpi_sand_neon.S     | 239 ++++++++++---
- libavutil/arm/rpi_sand_neon.h     |  11 +
- libavutil/rpi_sand_fns.c          |   2 +-
- 5 files changed, 541 insertions(+), 264 deletions(-)
-
-diff --git a/libavutil/aarch64/rpi_sand_neon.S b/libavutil/aarch64/rpi_sand_neon.S
-index cdcf71ee67..2f07d9674c 100644
---- a/libavutil/aarch64/rpi_sand_neon.S
-+++ b/libavutil/aarch64/rpi_sand_neon.S
-@@ -248,228 +248,6 @@ incomplete_block_loop_end_c8:
-     ret
- endfunc
- 
--//void ff_rpi_sand30_lines_to_planar_y16(
--//  uint8_t * dest,             // [x0]
--//  unsigned int dst_stride,    // [w1] -> assumed to be equal to _w
--//  const uint8_t * src,        // [x2]
--//  unsigned int src_stride1,   // [w3] -> 128
--//  unsigned int src_stride2,   // [w4]
--//  unsigned int _x,            // [w5]
--//  unsigned int y,             // [w6]
--//  unsigned int _w,            // [w7]
--//  unsigned int h);            // [sp, #0]
--
--function ff_rpi_sand30_lines_to_planar_y16, export=1
--    stp x19, x20, [sp, #-48]!
--    stp x21, x22, [sp, #16]
--    stp x23, x24, [sp, #32]
--
--    // w6 = argument h
--    ldr w6, [sp, #48]
--
--    // slice_inc = ((stride2 - 1) * stride1)
--    mov w5, w4
--    sub w5, w5, #1
--    lsl w5, w5, #7
--
--    // total number of bytes per row = (width / 3) * 4
--    mov w8, w7
--    mov w9, #3
--    udiv w8, w8, w9
--    lsl w8, w8, #2
--
--    // number of full 128 byte blocks to be processed
--    mov w9, #96
--    udiv w9, w7, w9 // = (width * 4) / (3*128) = width/96
--
--    // w10 = number of full integers to process (4 bytes)
--    // w11 = remaning zero to two 10bit values still to copy over
--    mov w12, #96
--    mul w12, w9, w12
--    sub w12, w7, w12  // width - blocks*96 = remaining points per row
--    mov w11, #3
--    udiv w10, w12, w11 // full integers to process = w12 / 3 
--    mul w11, w10, w11  // #integers *3
--    sub w11, w12, w11  // remaining 0-2 points = remaining points - integers*3
--
--    // increase w9 by one if w10+w11 is not zero, and decrease the row count by one
--    // this is to efficiently copy incomplete blocks at the end of the rows
--    // the last row is handled explicitly to avoid writing out of bounds
--    add w22, w10, w11
--    cmp w22, #0
--    cset w22, ne // 1 iff w10+w11 not zero, 0 otherwise
--    add w9, w9, w22
--    sub w6, w6, #1
--
--    // store the number of bytes in w20 which we copy too much for every row
--    // when the width of the frame is not a multiple of 96 (128bytes storing 96 10bit values)
--    mov w20, #96*2
--    mul w20, w20, w9
--    sub w20, w1, w20
--
--    mov w23, #0 // flag to check whether the last line had already been processed
--    
--    // bitmask to clear the uppper 6bits of the result values
--    mov x19, #0x03ff03ff03ff03ff
--    dup v22.2d, x19
--
--    // row counter = 0
--    eor w12, w12, w12
--row_loop_y16:
--    cmp w12, w6               // jump to row_loop_y16_fin if we processed all rows
--    bge row_loop_y16_fin
--
--    mov x13, x2               // row src
--    eor w14, w14, w14         // full block counter
--block_loop_y16:
--    cmp w14, w9
--    bge block_loop_y16_fin
--
--    // load 64 bytes
--    ld1 { v0.4s,  v1.4s, v2.4s, v3.4s }, [x13], #64
--   
--    // process v0 and v1
--    xtn v16.4h, v0.4s
--    ushr v0.4s, v0.4s, #10
--    xtn v17.4h, v0.4s
--    ushr v0.4s, v0.4s, #10
--    xtn v18.4h, v0.4s
--   
--    xtn2 v16.8h, v1.4s
--    and v16.16b, v16.16b, v22.16b
--    ushr v1.4s, v1.4s, #10
--    xtn2 v17.8h, v1.4s
--    and v17.16b, v17.16b, v22.16b
--    ushr v1.4s, v1.4s, #10
--    xtn2 v18.8h, v1.4s
--    and v18.16b, v18.16b, v22.16b
--
--    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
--
--    // process v2 and v3
--    xtn v23.4h, v2.4s
--    ushr v2.4s, v2.4s, #10
--    xtn v24.4h, v2.4s
--    ushr v2.4s, v2.4s, #10
--    xtn v25.4h, v2.4s
--    
--    xtn2 v23.8h, v3.4s
--    and v23.16b, v23.16b, v22.16b
--    ushr v3.4s, v3.4s, #10
--    xtn2 v24.8h, v3.4s
--    and v24.16b, v24.16b, v22.16b
--    ushr v3.4s, v3.4s, #10
--    xtn2 v25.8h, v3.4s
--    and v25.16b, v25.16b, v22.16b
--
--    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
--
--    // load the second half of the block -> 64 bytes into registers v4-v7
--    ld1 { v4.4s,  v5.4s,  v6.4s,  v7.4s }, [x13], #64
--    
--    // process v4 and v5
--    xtn v16.4h, v4.4s
--    ushr v4.4s, v4.4s, #10
--    xtn v17.4h, v4.4s
--    ushr v4.4s, v4.4s, #10
--    xtn v18.4h, v4.4s
--   
--    xtn2 v16.8h, v5.4s 
--    and v16.16b, v16.16b, v22.16b
--    ushr v5.4s, v5.4s, #10
--    xtn2 v17.8h, v5.4s
--    and v17.16b, v17.16b, v22.16b
--    ushr v5.4s, v5.4s, #10
--    xtn2 v18.8h, v5.4s
--    and v18.16b, v18.16b, v22.16b
--
--    st3 { v16.8h, v17.8h, v18.8h }, [x0], #48
--
--    // v6 and v7
--    xtn v23.4h, v6.4s
--    ushr v6.4s, v6.4s, #10
--    xtn v24.4h, v6.4s
--    ushr v6.4s, v6.4s, #10
--    xtn v25.4h, v6.4s
--   
--    xtn2 v23.8h, v7.4s 
--    and v23.16b, v23.16b, v22.16b
--    ushr v7.4s, v7.4s, #10
--    xtn2 v24.8h, v7.4s
--    and v24.16b, v24.16b, v22.16b
--    ushr v7.4s, v7.4s, #10
--    xtn2 v25.8h, v7.4s
--    and v25.16b, v25.16b, v22.16b
--
--    st3 { v23.8h, v24.8h, v25.8h }, [x0], #48
-- 
--    add x13, x13, x5          // row src += slice_inc
--    add w14, w14, #1
--    b block_loop_y16
--block_loop_y16_fin:
--
--    
--
--
--    add x2, x2, #128          // src += stride1 (start of the next row)
--    add x0, x0, w20, sxtw     // subtract the bytes we copied too much from dst
--    add w12, w12, #1
--    b row_loop_y16
--row_loop_y16_fin:
--
--    // check whether we have incomplete blocks at the end of every row
--    // in that case decrease row block count by one
--    // change height back to it's original value (meaning increase it by 1)
--    // and jump back to another iteration of row_loop_y16
--
--    cmp w23, #1
--    beq row_loop_y16_fin2 // don't continue here if we already processed the last row
--    add w6, w6, #1    // increase height to the original value
--    sub w9, w9, w22   // block count - 1 or 0, depending on the remaining bytes count
--    mov w23, #1
--    b row_loop_y16
--row_loop_y16_fin2:
--
--    sub x0, x0, w20, sxtw // with the last row we didn't actually move the dst ptr to far ahead, therefore readd the diference
--
--    // now we've got to handle the last block in the last row
--    eor w12, w12, w12 // w12 = 0 = counter
--integer_loop_y16:
--    cmp w12, w10
--    bge integer_loop_y16_fin
--    ldr w14, [x13], #4
--    and w15, w14, #0x3ff
--    strh w15, [x0], #2
--    lsr w14, w14, #10
--    and w15, w14, #0x3ff
--    strh w15, [x0], #2
--    lsr w14, w14, #10
--    and w15, w14, #0x3ff
--    strh w15, [x0], #2
--    add w12, w12, #1
--    b integer_loop_y16
--integer_loop_y16_fin:
--
--final_values_y16:
--    // remaining point count = w11
--    ldr w14, [x13], #4
--    cmp w11, #0
--    beq final_values_y16_fin
--    and w15, w14, #0x3ff
--    strh w15, [x0], #2
--    cmp w11, #1
--    beq final_values_y16_fin
--    lsr w14, w14, #10
--    and w15, w14, #0x3ff
--    strh w15, [x0], #2
--final_values_y16_fin:
--
--    ldp x23, x24, [sp, #32]
--    ldp x21, x22, [sp, #16]
--    ldp x19, x20, [sp], #48
--    ret
--endfunc
--
- //void ff_rpi_sand30_lines_to_planar_c16(
- //  uint8_t * dst_u,            // [x0]
- //  unsigned int dst_stride_u,  // [w1] == _w*2
-@@ -674,3 +452,330 @@ endfunc
- //  unsigned int _w,
- //  unsigned int h);
- 
-+// void ff_rpi_sand30_lines_to_planar_y16(
-+//   uint8_t * dest,            : x0
-+//   unsigned int dst_stride,   : w1, in bytes
-+//   const uint8_t * src,       : x2
-+//   unsigned int src_stride1,  : w3, always 128 (register never read)
-+//   unsigned int src_stride2,  : w4, scaled by 128 to step between stripes
-+//   unsigned int _x,           : w5, ignored (w5 is reused as the sample counter)
-+//   unsigned int y,            : w6
-+//   unsigned int _w,           : w7
-+//   unsigned int h);           : [sp, #0]
-+// Unpacks the three 10-bit fields of every 32-bit source word to 16-bit samples.
-+// Assumes that we are starting on a stripe boundary and that overreading
-+// within the stripe is OK. However it does respect the dest size for writes.
-+
-+function ff_rpi_sand30_lines_to_planar_y16, export=1
-+                lsl             w4,  w4,  #7                    // w4 = src_stride2 * 128
-+                sub             w4,  w4,  #64                   // minus the 64 bytes the first ld1 already steps
-+                sub             w1,  w1,  w7, lsl #1            // w1 = dst_stride - 2 * _w (dest row tail, bytes)
-+                uxtw            x6,  w6
-+                add             x8,  x2,  x6, lsl #7            // x8 = src + y * 128 (first source row)
-+                ldr             w6,  [sp, #0]                   // w6 = h, counted down per row
-+
-+10:                                                             // row loop
-+                mov             x2,  x8                         // x2 = read pointer for this row
-+                mov             w5,  w7                         // w5 = samples left in this row
-+1:                                                              // 128-byte (96-sample) block loop
-+                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
-+                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4   // net step per block: src_stride2 * 128
-+
-+                subs            w5,  w5,  #96                   // flags are consumed by the blt/bne after the unpack
-+
-+                // v0, v1 -> v16 (bits 0-9), v17 (bits 10-19), v18 (bits 20-29)
-+
-+                shrn            v18.4h,  v0.4s,   #14           // bits 14-29, finished by the ushr #6 below
-+                xtn             v16.4h,  v0.4s                  // bits 0-15, top 6 cleared by bic below
-+                shrn            v17.4h,  v0.4s,   #10           // bits 10-25, top 6 cleared by bic below
-+
-+                shrn2           v18.8h,  v1.4s,   #14
-+                xtn2            v16.8h,  v1.4s
-+                shrn2           v17.8h,  v1.4s,   #10
-+
-+                ushr            v18.8h,  v18.8h,  #6            // leaves bits 20-29 of each word
-+                bic             v16.8h,  #0xfc,   lsl #8        // clear the upper 6 bits -> 10-bit value
-+                bic             v17.8h,  #0xfc,   lsl #8
-+
-+                // v2, v3 -> v19, v20, v21 (same unpack)
-+
-+                shrn            v21.4h,  v2.4s,   #14
-+                xtn             v19.4h,  v2.4s
-+                shrn            v20.4h,  v2.4s,   #10
-+
-+                shrn2           v21.8h,  v3.4s,   #14
-+                xtn2            v19.8h,  v3.4s
-+                shrn2           v20.8h,  v3.4s,   #10
-+
-+                ushr            v21.8h,  v21.8h,  #6
-+                bic             v19.8h,  #0xfc,   lsl #8
-+                bic             v20.8h,  #0xfc,   lsl #8
-+
-+                // v4, v5 -> v22, v23, v24 (same unpack)
-+
-+                shrn            v24.4h,  v4.4s,   #14
-+                xtn             v22.4h,  v4.4s
-+                shrn            v23.4h,  v4.4s,   #10
-+
-+                shrn2           v24.8h,  v5.4s,   #14
-+                xtn2            v22.8h,  v5.4s
-+                shrn2           v23.8h,  v5.4s,   #10
-+
-+                ushr            v24.8h,  v24.8h,  #6
-+                bic             v22.8h,  #0xfc,   lsl #8
-+                bic             v23.8h,  #0xfc,   lsl #8
-+
-+                // v6, v7 -> v25, v26, v27 (same unpack)
-+
-+                shrn            v27.4h,  v6.4s,   #14
-+                xtn             v25.4h,  v6.4s
-+                shrn            v26.4h,  v6.4s,   #10
-+
-+                shrn2           v27.8h,  v7.4s,   #14
-+                xtn2            v25.8h,  v7.4s
-+                shrn2           v26.8h,  v7.4s,   #10
-+
-+                ushr            v27.8h,  v27.8h,  #6
-+                bic             v25.8h,  #0xfc,   lsl #8
-+                bic             v26.8h,  #0xfc,   lsl #8
-+
-+                blt             2f                              // fewer than 96 samples were left: partial write
-+
-+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48   // st3 re-interleaves the three fields, 24 samples each
-+                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
-+                st3             {v22.8h, v23.8h, v24.8h}, [x0], #48
-+                st3             {v25.8h, v26.8h, v27.8h}, [x0], #48
-+
-+                bne             1b                              // more samples left in this row
-+
-+11:                                                             // end of row
-+                subs            w6,  w6,  #1
-+                add             x0,  x0,  w1,  uxtw             // skip the dest row tail
-+                add             x8,  x8,  #128                  // next source row (128 bytes per row in a stripe)
-+                bne             10b
-+
-+                ret
-+
-+// Partial final write (w5 = samples left - 96, so it is negative here)
-+2:
-+                cmp             w5,  #48-96                     // at least 48 samples left?
-+                blt             1f
-+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
-+                st3             {v19.8h, v20.8h, v21.8h}, [x0], #48
-+                beq             11b                             // exactly 48: row finished
-+                mov             v16.16b, v22.16b                // move the unstored second half down
-+                mov             v17.16b, v23.16b
-+                sub             w5,  w5,  #48
-+                mov             v18.16b, v24.16b
-+                mov             v19.16b, v25.16b
-+                mov             v20.16b, v26.16b
-+                mov             v21.16b, v27.16b
-+1:
-+                cmp             w5,  #24-96                     // at least 24 samples left?
-+                blt             1f
-+                st3             {v16.8h, v17.8h, v18.8h}, [x0], #48
-+                beq             11b
-+                mov             v16.16b, v19.16b
-+                mov             v17.16b, v20.16b
-+                sub             w5,  w5,  #24
-+                mov             v18.16b, v21.16b
-+1:
-+                cmp             w5,  #12-96                     // at least 12 samples left?
-+                blt             1f
-+                st3             {v16.4h, v17.4h, v18.4h}, [x0], #24
-+                beq             11b
-+                mov             v16.2d[0], v16.2d[1]            // upper 4 lanes -> lower 4 lanes
-+                sub             w5,  w5,  #12
-+                mov             v17.2d[0], v17.2d[1]
-+                mov             v18.2d[0], v18.2d[1]
-+1:
-+                cmp             w5,  #6-96                      // at least 6 samples left?
-+                blt             1f
-+                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
-+                st3             {v16.h, v17.h, v18.h}[1], [x0], #6
-+                beq             11b
-+                mov             v16.2s[0], v16.2s[1]            // lanes 2,3 -> lanes 0,1
-+                sub             w5,  w5,  #6
-+                mov             v17.2s[0], v17.2s[1]
-+                mov             v18.2s[0], v18.2s[1]
-+1:
-+                cmp             w5,  #3-96                      // at least 3 samples left?
-+                blt             1f
-+                st3             {v16.h, v17.h, v18.h}[0], [x0], #6
-+                beq             11b
-+                mov             v16.4h[0], v16.4h[1]            // only v16/v17 are needed for the last 1-2 samples
-+                sub             w5,  w5,  #3
-+                mov             v17.4h[0], v17.4h[1]
-+1:
-+                cmp             w5,  #2-96                      // two samples left, else one
-+                blt             1f
-+                st2             {v16.h, v17.h}[0], [x0], #4
-+                b               11b
-+1:
-+                st1             {v16.h}[0], [x0], #2
-+                b               11b
-+
-+endfunc
-+
-+// void ff_rpi_sand30_lines_to_planar_y8(
-+//   uint8_t * dest,            : x0
-+//   unsigned int dst_stride,   : w1, in bytes
-+//   const uint8_t * src,       : x2
-+//   unsigned int src_stride1,  : w3, always 128 (register never read)
-+//   unsigned int src_stride2,  : w4, scaled by 128 to step between stripes
-+//   unsigned int _x,           : w5, ignored (w5 is reused as the sample counter)
-+//   unsigned int y,            : w6
-+//   unsigned int _w,           : w7
-+//   unsigned int h);           : [sp, #0]
-+// Unpacks the three 10-bit fields of every 32-bit word, keeping the top 8 bits of each.
-+// Assumes that we are starting on a stripe boundary and that overreading
-+// within the stripe is OK. However it does respect the dest size for writes.
-+
-+function ff_rpi_sand30_lines_to_planar_y8, export=1
-+                lsl             w4,  w4,  #7                    // w4 = src_stride2 * 128
-+                sub             w4,  w4,  #64                   // minus the 64 bytes the first ld1 already steps
-+                sub             w1,  w1,  w7                    // w1 = dst_stride - _w (dest row tail, bytes)
-+                uxtw            x6,  w6
-+                add             x8,  x2,  x6, lsl #7            // x8 = src + y * 128 (first source row)
-+                ldr             w6,  [sp, #0]                   // w6 = h, counted down per row
-+
-+10:                                                             // row loop
-+                mov             x2,  x8                         // x2 = read pointer for this row
-+                mov             w5,  w7                         // w5 = samples left in this row
-+1:                                                              // 128-byte (96-sample) block loop
-+                ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x2], #64
-+                ld1             {v4.4s, v5.4s, v6.4s, v7.4s}, [x2], x4   // net step per block: src_stride2 * 128
-+
-+                subs            w5,  w5,  #96                   // flags are consumed by the blt/bne after the unpack
-+
-+                // v0, v1 -> low halves of v16 (bits 2-9), v17 (bits 12-19), v18 (bits 22-29)
-+
-+                shrn            v18.4h,  v0.4s,   #16           // bits 16-31, finished by the shrn #6 below
-+                xtn             v16.4h,  v0.4s                  // bits 0-15, finished by the shrn #2 below
-+                shrn            v17.4h,  v0.4s,   #12           // bits 12-27, low byte kept by xtn below
-+
-+                shrn2           v18.8h,  v1.4s,   #16
-+                xtn2            v16.8h,  v1.4s
-+                shrn2           v17.8h,  v1.4s,   #12
-+
-+                shrn            v18.8b,  v18.8h,  #6            // bits 22-29: top 8 bits of the third field
-+                shrn            v16.8b,  v16.8h,  #2            // bits 2-9:   top 8 bits of the first field
-+                xtn             v17.8b,  v17.8h                 // bits 12-19: top 8 bits of the second field
-+
-+                // v2, v3 -> high halves of v16, v17, v18 (v19-v21 are scratch here)
-+
-+                shrn            v21.4h,  v2.4s,   #16
-+                xtn             v19.4h,  v2.4s
-+                shrn            v20.4h,  v2.4s,   #12
-+
-+                shrn2           v21.8h,  v3.4s,   #16
-+                xtn2            v19.8h,  v3.4s
-+                shrn2           v20.8h,  v3.4s,   #12
-+
-+                shrn2           v18.16b, v21.8h,  #6
-+                shrn2           v16.16b, v19.8h,  #2
-+                xtn2            v17.16b, v20.8h
-+
-+                // v4, v5 -> low halves of v19, v20, v21 (v22-v24 are scratch)
-+
-+                shrn            v24.4h,  v4.4s,   #16
-+                xtn             v22.4h,  v4.4s
-+                shrn            v23.4h,  v4.4s,   #12
-+
-+                shrn2           v24.8h,  v5.4s,   #16
-+                xtn2            v22.8h,  v5.4s
-+                shrn2           v23.8h,  v5.4s,   #12
-+
-+                shrn            v21.8b,  v24.8h,  #6
-+                shrn            v19.8b,  v22.8h,  #2
-+                xtn             v20.8b,  v23.8h
-+
-+                // v6, v7 -> high halves of v19, v20, v21 (v25-v27 are scratch)
-+
-+                shrn            v27.4h,  v6.4s,   #16
-+                xtn             v25.4h,  v6.4s
-+                shrn            v26.4h,  v6.4s,   #12
-+
-+                shrn2           v27.8h,  v7.4s,   #16
-+                xtn2            v25.8h,  v7.4s
-+                shrn2           v26.8h,  v7.4s,   #12
-+
-+                shrn2           v21.16b, v27.8h,  #6
-+                shrn2           v19.16b, v25.8h,  #2
-+                xtn2            v20.16b, v26.8h
-+
-+                blt             2f                              // fewer than 96 samples were left: partial write
-+
-+                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48   // first 48 samples, fields re-interleaved
-+                st3             {v19.16b, v20.16b, v21.16b}, [x0], #48   // second 48 samples
-+
-+                bne             1b                              // more samples left in this row
-+
-+11:                                                             // end of row
-+                subs            w6,  w6,  #1
-+                add             x0,  x0,  w1,  uxtw             // skip the dest row tail
-+                add             x8,  x8,  #128                  // next source row (128 bytes per row in a stripe)
-+                bne             10b
-+
-+                ret
-+
-+// Partial final write (w5 = samples left - 96, so it is negative here)
-+2:
-+                cmp             w5,  #48-96                     // at least 48 samples left?
-+                blt             1f
-+                st3             {v16.16b, v17.16b, v18.16b}, [x0], #48
-+                beq             11b                             // exactly 48: row finished
-+                mov             v16.16b, v19.16b                // second 48 packed samples live in v19-v21
-+                mov             v17.16b, v20.16b                // (v22-v24 only hold 16-bit intermediates)
-+                sub             w5,  w5,  #48
-+                mov             v18.16b, v21.16b
-+1:
-+                cmp             w5,  #24-96                     // at least 24 samples left?
-+                blt             1f
-+                st3             {v16.8b, v17.8b, v18.8b}, [x0], #24
-+                beq             11b
-+                mov             v16.2d[0], v16.2d[1]            // upper 8 lanes -> lower 8 lanes
-+                sub             w5,  w5,  #24
-+                mov             v17.2d[0], v17.2d[1]
-+                mov             v18.2d[0], v18.2d[1]
-+1:
-+                cmp             w5,  #12-96                     // at least 12 samples left?
-+                blt             1f
-+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[2], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[3], [x0], #3
-+                beq             11b
-+                mov             v16.2s[0], v16.2s[1]            // lanes 4-7 -> lanes 0-3
-+                sub             w5,  w5,  #12
-+                mov             v17.2s[0], v17.2s[1]
-+                mov             v18.2s[0], v18.2s[1]
-+1:
-+                cmp             w5,  #6-96                      // at least 6 samples left?
-+                blt             1f
-+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
-+                st3             {v16.b, v17.b, v18.b}[1], [x0], #3
-+                beq             11b
-+                mov             v16.4h[0], v16.4h[1]            // lanes 2,3 -> lanes 0,1
-+                sub             w5,  w5,  #6
-+                mov             v17.4h[0], v17.4h[1]
-+                mov             v18.4h[0], v18.4h[1]
-+1:
-+                cmp             w5,  #3-96                      // at least 3 samples left?
-+                blt             1f
-+                st3             {v16.b, v17.b, v18.b}[0], [x0], #3
-+                beq             11b
-+                mov             v16.8b[0], v16.8b[1]            // only v16/v17 are needed for the last 1-2 samples
-+                sub             w5,  w5,  #3
-+                mov             v17.8b[0], v17.8b[1]
-+1:
-+                cmp             w5,  #2-96                      // two samples left, else one
-+                blt             1f
-+                st2             {v16.b, v17.b}[0], [x0], #2
-+                b               11b
-+1:
-+                st1             {v16.b}[0], [x0], #1
-+                b               11b
-+
-+endfunc
-+
-diff --git a/libavutil/aarch64/rpi_sand_neon.h b/libavutil/aarch64/rpi_sand_neon.h
-index b3aa481ea4..2a56135bc3 100644
---- a/libavutil/aarch64/rpi_sand_neon.h
-+++ b/libavutil/aarch64/rpi_sand_neon.h
-@@ -49,6 +49,10 @@ void ff_rpi_sand30_lines_to_planar_c16(uint8_t * dst_u, unsigned int dst_stride_
-   uint8_t * dst_v, unsigned int dst_stride_v, const uint8_t * src, unsigned int stride1,
-   unsigned int stride2, unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
- 
-+void ff_rpi_sand30_lines_to_planar_y8(uint8_t * dest, unsigned int dst_stride,
-+  const uint8_t * src, unsigned int src_stride1, unsigned int src_stride2,
-+  unsigned int _x, unsigned int y, unsigned int _w, unsigned int h);
-+
- #ifdef __cplusplus
- }
- #endif
-diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
-index 80890fe985..60e697f681 100644
---- a/libavutil/arm/rpi_sand_neon.S
-+++ b/libavutil/arm/rpi_sand_neon.S
-@@ -360,7 +360,6 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1
-                 ldr             r6,  [sp, #36]
-                 ldr             r7,  [sp, #32]  @ y
-                 mov             r12, #48
--                vmov.u16        q15, #0x3ff
-                 sub             r3,  #1
-                 lsl             r3,  #7
-                 sub             r1,  r1,  r6,  lsl #1
-@@ -376,37 +375,33 @@ function ff_rpi_sand30_lines_to_planar_y16, export=1
-                 vldm            r2!, {q10-q13}
-                 add             lr,  #64
- 
--                vshr.u32        q14, q10, #20    @ Cannot vshrn.u32 #20!
-+                vshrn.u32       d4 , q10, #14    @ Cannot vshrn.u32 #20!
-                 ands            lr,  #127
-                 vshrn.u32       d2,  q10, #10
-                 vmovn.u32       d0,  q10
--                vmovn.u32       d4,  q14
- 
--                vshr.u32        q14, q11, #20
-+                vshrn.u32       d5,  q11, #14
-                 it              eq
-                 addeq           r2,  r3
-                 vshrn.u32       d3,  q11, #10
-                 vmovn.u32       d1,  q11
--                vmovn.u32       d5,  q14
- 
-                 subs            r5,  #48
--                vand            q0,  q15
--                vand            q1,  q15
--                vand            q2,  q15
-+                vshr.u16        q2,  #6
-+                vbic.u16        q0,  #0xfc00
-+                vbic.u16        q1,  #0xfc00
- 
--                vshr.u32        q14, q12, #20
-+                vshrn.u32       d20, q12, #14
-                 vshrn.u32       d18, q12, #10
-                 vmovn.u32       d16, q12
--                vmovn.u32       d20, q14
- 
--                vshr.u32        q14, q13, #20
-+                vshrn.u32       d21, q13, #14
-                 vshrn.u32       d19, q13, #10
-                 vmovn.u32       d17, q13
--                vmovn.u32       d21, q14
- 
--                vand            q8,  q15
--                vand            q9,  q15
--                vand            q10, q15
-+                vshr.u16        q10, #6
-+                vbic.u16        q8,  #0xfc00
-+                vbic.u16        q9 , #0xfc00
-                 blt             2f
- 
-                 vst3.16         {d0,  d2,  d4},  [r0], r12
-@@ -499,7 +494,6 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
-                 ldr             r7,  [sp, #48]
-                 ldr             r9,  [sp, #52]
-                 mov             r12, #48
--                vmov.u16        q15, #0x3ff
-                 sub             r8,  #1
-                 lsl             r8,  #7
-                 add             r5,  r5,  r7,  lsl #7
-@@ -515,48 +509,44 @@ function ff_rpi_sand30_lines_to_planar_c16, export=1
-                 add             lr,  #64
- 
-                 @ N.B. unpack [0,1,2] -> (reg order) 1, 0, 2
--                vshr.u32        q14, q0,  #20
--                vshrn.u32       d16, q0,  #10
-+                vshrn.u32       d20, q0,  #14
-                 vmovn.u32       d18, q0
-+                vshrn.u32       d0,  q0,  #10
-                 ands            lr,  #127
--                vmovn.u32       d20, q14
- 
--                vshr.u32        q14, q1,  #20
--                vshrn.u32       d17, q1,  #10
-+                vshrn.u32       d21, q1,  #14
-                 vmovn.u32       d19, q1
--                vmovn.u32       d21, q14
-+                vshrn.u32       d1,  q1,  #10
- 
--                vshr.u32        q14, q2,  #20
-                 vshrn.u32       d22, q2,  #10
--                vmovn.u32       d24, q2
--                vmovn.u32       d26, q14
-+                vmovn.u32       d2,  q2
-+                vshrn.u32       d4,  q2,  #14
- 
--                vshr.u32        q14, q3,  #20
--                vshrn.u32       d23, q3,  #10
--                vmovn.u32       d25, q3
-                 add             r10, r0,  #24
--                vmovn.u32       d27, q14
-+                vshrn.u32       d23, q3,  #10
-+                vmovn.u32       d3,  q3
-+                vshrn.u32       d5,  q3,  #14
- 
-                 it              eq
-                 addeq           r4,  r8
--                vuzp.16         q8,  q11
--                vuzp.16         q9,  q12
--                vuzp.16         q10, q13
-+                vuzp.16         q0,  q11
-+                vuzp.16         q9,  q1
-+                vuzp.16         q10, q2
- 
--                @ q8   V0, V3,.. -> q0
-+                @ q0   V0, V3,..
-                 @ q9   U0, U3...
-                 @ q10  U1, U4...
-                 @ q11  U2, U5,..
--                @ q12  V1, V4,.. -> q1
--                @ q13  V2, V5,.. -> q2
-+                @ q1   V1, V4,
-+                @ q2   V2, V5,..
- 
-                 subs            r6,  #24
--                vand            q11, q15
--                vand            q9,  q15
--                vand            q10, q15
--                vand            q0,  q8,  q15
--                vand            q1,  q12, q15
--                vand            q2,  q13, q15
-+                vbic.u16        q11, #0xfc00
-+                vbic.u16        q9,  #0xfc00
-+                vshr.u16        q10, #6
-+                vshr.u16        q2,  #6
-+                vbic.u16        q0,  #0xfc00
-+                vbic.u16        q1,  #0xfc00
- 
-                 blt             2f
- 
-@@ -765,4 +755,171 @@ function ff_rpi_sand30_lines_to_planar_p010, export=1
- endfunc
- 
- 
-+@ void ff_rpi_sand30_lines_to_planar_y8(
-+@   uint8_t * dest,             // [r0]
-+@   unsigned int dst_stride,    // [r1]
-+@   const uint8_t * src,        // [r2]
-+@   unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+@   unsigned int src_stride2,   // [sp, #0]  -> r3
-+@   unsigned int _x,            // [sp, #4]  Ignored - 0
-+@   unsigned int y,             // [sp, #8]  (r7 in prefix)
-+@   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+@   unsigned int h);            // [sp, #16] -> r7
-+@ Unpacks the three 10-bit fields of every 32-bit word, keeping the top 8 bits of each.
-+@ Assumes that we are starting on a stripe boundary and that overreading
-+@ within the stripe is OK. However it does respect the dest size for writes.
-+
-+function ff_rpi_sand30_lines_to_planar_y8, export=1
-+                push            {r4-r8, lr}     @ +24
-+                ldr             r3,  [sp, #24]  @ r3 = src_stride2
-+                ldr             r6,  [sp, #36]  @ r6 = _w
-+                ldr             r7,  [sp, #32]  @ y
-+                mov             r12, #48        @ post-increment for the paired r0/r4 stores
-+                lsl             r3,  #7         @ r3 = src_stride2 * 128
-+                sub             r1,  r1,  r6    @ r1 = dst_stride - _w (dest row tail, bytes)
-+                add             r8,  r2,  r7,  lsl #7   @ r8 = src + y * 128 (first source row)
-+                ldr             r7,  [sp, #40]  @ r7 = h, counted down per row
-+
-+10:                                             @ row loop
-+                mov             r2,  r8         @ r2 = read pointer for this row
-+                add             r4,  r0,  #24   @ r4 = second write pointer, 24 bytes ahead of r0
-+                mov             r5,  r6         @ r5 = samples left in this row
-+1:                                              @ 128-byte (96-sample) block loop
-+                vldm            r2,  {q8-q15}   @ 128 bytes, no writeback
-+
-+                subs            r5,  #96        @ flags are consumed by the blt/bne after the unpack
-+
-+                vmovn.u32       d0,  q8         @ bits 0-15
-+                vshrn.u32       d2,  q8,  #12   @ bits 12-27
-+                vshrn.u32       d4,  q8,  #16    @ bits 16-31 (narrowing shift tops out at #16; >>6 follows)
-+
-+                add             r2,  r3         @ step to the same row in the next stripe
-+
-+                vmovn.u32       d1,  q9
-+                vshrn.u32       d3,  q9,  #12
-+                vshrn.u32       d5,  q9,  #16
-+
-+                pld             [r2, #0]
-+
-+                vshrn.u16       d0,  q0,  #2    @ bits 2-9:   top 8 bits of the first field
-+                vmovn.u16       d1,  q1         @ bits 12-19: top 8 bits of the second field
-+                vshrn.u16       d2,  q2,  #6    @ bits 22-29: top 8 bits of the third field
-+
-+                vmovn.u32       d16, q10        @ q10/q11 -> d4, d5, d6 (q8-q10 reused as scratch)
-+                vshrn.u32       d18, q10, #12
-+                vshrn.u32       d20, q10, #16
-+
-+                vmovn.u32       d17, q11
-+                vshrn.u32       d19, q11, #12
-+                vshrn.u32       d21, q11, #16
-+
-+                pld             [r2, #64]
-+
-+                vshrn.u16       d4,  q8,  #2
-+                vmovn.u16       d5,  q9
-+                vshrn.u16       d6,  q10, #6
-+
-+                vmovn.u32       d16, q12        @ q12/q13 -> d16, d17, d18
-+                vshrn.u32       d18, q12, #12
-+                vshrn.u32       d20, q12, #16
-+
-+                vmovn.u32       d17, q13
-+                vshrn.u32       d19, q13, #12
-+                vshrn.u32       d21, q13, #16
-+
-+                vshrn.u16       d16, q8,  #2
-+                vmovn.u16       d17, q9
-+                vshrn.u16       d18, q10, #6
-+
-+                vmovn.u32       d20, q14        @ q14/q15 -> d20, d21, d22
-+                vshrn.u32       d22, q14, #12
-+                vshrn.u32       d24, q14, #16
-+
-+                vmovn.u32       d21, q15
-+                vshrn.u32       d23, q15, #12
-+                vshrn.u32       d25, q15, #16
-+
-+                vshrn.u16       d20, q10, #2
-+                vmovn.u16       d21, q11
-+                vshrn.u16       d22, q12, #6
-+
-+                blt             2f              @ fewer than 96 samples were left: partial write
-+
-+                vst3.8          {d0,  d1,  d2},  [r0], r12   @ samples 0-23
-+                vst3.8          {d4,  d5,  d6},  [r4], r12   @ samples 24-47
-+                vst3.8          {d16, d17, d18}, [r0], r12   @ samples 48-71
-+                vst3.8          {d20, d21, d22}, [r4], r12   @ samples 72-95
-+
-+                bne             1b              @ more samples left in this row
-+
-+11:                                             @ end of row
-+                subs            r7,  #1
-+                add             r0,  r1         @ skip the dest row tail
-+                add             r8,  #128       @ next source row (128 bytes per row in a stripe)
-+                bne             10b
-+
-+                pop             {r4-r8, pc}
-+
-+@ Partial final write (r5 = samples left - 96, so it is negative here)
-+2:
-+                cmp             r5,  #48-96     @ at least 48 samples left?
-+                blt             1f
-+                vst3.8          {d0,  d1,  d2},  [r0], r12
-+                vst3.8          {d4,  d5,  d6},  [r4], r12
-+                beq             11b             @ exactly 48: row finished
-+                vmov            q0,  q8         @ d0,d1 <- d16,d17
-+                vmov            q2,  q10        @ d4,d5 <- d20,d21
-+                sub             r5,  #48
-+                vmov            d2,  d18
-+                vmov            d6,  d22
-+1:
-+                cmp             r5,  #24-96     @ at least 24 samples left?
-+                blt             1f
-+                vst3.8          {d0,  d1,  d2},  [r0]!
-+                beq             11b
-+                vmov            q0,  q2         @ d0,d1 <- d4,d5
-+                sub             r5,  #24
-+                vmov            d2,  d6
-+1:
-+                cmp             r5,  #12-96     @ at least 12 samples left?
-+                blt             1f
-+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
-+                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
-+                vst3.8          {d0[2], d1[2], d2[2]}, [r0]!
-+                vst3.8          {d0[3], d1[3], d2[3]}, [r0]!
-+                beq             11b
-+                vmov            s0,  s1         @ lanes 4-7 -> lanes 0-3 of d0
-+                sub             r5,  #12
-+                vmov            s2,  s3         @ same for d1
-+                vmov            s4,  s5         @ same for d2
-+1:
-+                cmp             r5,  #6-96      @ at least 6 samples left?
-+                blt             1f
-+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
-+                vst3.8          {d0[1], d1[1], d2[1]}, [r0]!
-+                add             r0,  #12        @ NOTE(review): both stores above already write back r0; this extra +12 looks unintended - confirm
-+                beq             11b
-+                vshr.u32        d0,  #16        @ lanes 2,3 -> lanes 0,1
-+                sub             r5,  #6
-+                vshr.u32        d1,  #16
-+                vshr.u32        d2,  #16
-+1:
-+                cmp             r5, #3-96       @ at least 3 samples left?
-+                blt             1f
-+                vst3.8          {d0[0], d1[0], d2[0]}, [r0]!
-+                beq             11b
-+                sub             r5, #3
-+                vshr.u32        d0, #8          @ lane 1 -> lane 0; only d0/d1 are needed for the last 1-2 samples
-+                vshr.u32        d1, #8
-+1:
-+                cmp             r5, #2-96       @ two samples left, else one
-+                blt             1f
-+                vst2.8          {d0[0], d1[0]}, [r0]!
-+                b               11b
-+1:
-+                vst1.8          {d0[0]}, [r0]!
-+                b               11b
-+
-+endfunc
-+
- 
-diff --git a/libavutil/arm/rpi_sand_neon.h b/libavutil/arm/rpi_sand_neon.h
-index 447f367bea..d457c10870 100644
---- a/libavutil/arm/rpi_sand_neon.h
-+++ b/libavutil/arm/rpi_sand_neon.h
-@@ -95,5 +95,16 @@ void ff_rpi_sand30_lines_to_planar_p010(
-   unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-   unsigned int h);            // [sp, #16] -> r7
- 
-+void ff_rpi_sand30_lines_to_planar_y8(
-+  uint8_t * dest,             // [r0]
-+  unsigned int dst_stride,    // [r1]
-+  const uint8_t * src,        // [r2]
-+  unsigned int src_stride1,   // [r3]      Ignored - assumed 128
-+  unsigned int src_stride2,   // [sp, #0]  -> r3
-+  unsigned int _x,            // [sp, #4]  Ignored - 0
-+  unsigned int y,             // [sp, #8]  (r7 in prefix)
-+  unsigned int _w,            // [sp, #12] -> r6 (cur r5)
-+  unsigned int h);            // [sp, #16] -> r7
-+
- #endif // AVUTIL_ARM_SAND_NEON_H
- 
-diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
-index 256c3d532f..b6071e2928 100644
---- a/libavutil/rpi_sand_fns.c
-+++ b/libavutil/rpi_sand_fns.c
-@@ -247,7 +247,7 @@ void av_rpi_sand30_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-     const uint8_t * p0 = src + (x0 & mask) + y * stride1 + (x0 & ~mask) * stride2;
-     const unsigned int slice_inc = ((stride2 - 1) * stride1) >> 2;  // RHS of a stripe to LHS of next in words
- 
--#if HAVE_SAND_ASM && 0
-+#if HAVE_SAND_ASM
-     if (_x == 0) {
-         ff_rpi_sand30_lines_to_planar_y8(dst, dst_stride, src, stride1, stride2, _x, y, _w, h);
-         return;
-
-From 24c3eef4487a36d5189ecd934b65a7c6a0b53d03 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 7 Jun 2022 14:46:12 +0000
-Subject: [PATCH 054/136] v4l2_m2m_enc: Add the ability to encode DRM_PRIME
- frames
-
----
- libavcodec/v4l2_buffers.c | 100 +++++++++++---
- libavcodec/v4l2_buffers.h |  20 ++-
- libavcodec/v4l2_context.c | 212 +++++++++++++++++++++++++---
- libavcodec/v4l2_context.h |  15 +-
- libavcodec/v4l2_m2m.c     |  37 +++--
- libavcodec/v4l2_m2m.h     |   3 +
- libavcodec/v4l2_m2m_dec.c | 171 ++++++-----------------
- libavcodec/v4l2_m2m_enc.c | 283 +++++++++++++++++++++++++++++++++++++-
- 8 files changed, 643 insertions(+), 198 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 8c4f18dbed..9ef2f40e39 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -29,6 +29,8 @@
- #include <fcntl.h>
- #include <poll.h>
- #include "libavcodec/avcodec.h"
-+#include "libavcodec/internal.h"
-+#include "libavutil/avassert.h"
- #include "libavutil/pixdesc.h"
- #include "libavutil/hwcontext.h"
- #include "v4l2_context.h"
-@@ -60,27 +62,39 @@ static inline AVRational v4l2_get_timebase(const V4L2Buffer * const avbuf)
-     return tb.num && tb.den ? tb : v4l2_timebase;
- }
- 
-+static inline struct timeval tv_from_int(const int64_t t)
-+{
-+    return (struct timeval){
-+        .tv_usec = t % USEC_PER_SEC,
-+        .tv_sec  = t / USEC_PER_SEC
-+    };
-+}
-+
-+static inline int64_t int_from_tv(const struct timeval t)
-+{
-+    return (int64_t)t.tv_sec * USEC_PER_SEC + t.tv_usec;
-+}
-+
- static inline void v4l2_set_pts(V4L2Buffer * const out, const int64_t pts)
- {
-     /* convert pts to v4l2 timebase */
-     const int64_t v4l2_pts =
--        out->context->no_pts_rescale ? pts :
-         pts == AV_NOPTS_VALUE ? 0 :
-             av_rescale_q(pts, v4l2_get_timebase(out), v4l2_timebase);
--    out->buf.timestamp.tv_usec = v4l2_pts % USEC_PER_SEC;
--    out->buf.timestamp.tv_sec = v4l2_pts / USEC_PER_SEC;
-+    out->buf.timestamp = tv_from_int(v4l2_pts);
- }
- 
- static inline int64_t v4l2_get_pts(const V4L2Buffer * const avbuf)
- {
-+    const int64_t v4l2_pts = int_from_tv(avbuf->buf.timestamp);
-+    return v4l2_pts != 0 ? v4l2_pts : AV_NOPTS_VALUE;
-+#if 0
-     /* convert pts back to encoder timebase */
--    const int64_t v4l2_pts = (int64_t)avbuf->buf.timestamp.tv_sec * USEC_PER_SEC +
--                        avbuf->buf.timestamp.tv_usec;
--
-     return
-         avbuf->context->no_pts_rescale ? v4l2_pts :
-         v4l2_pts == 0 ? AV_NOPTS_VALUE :
-             av_rescale_q(v4l2_pts, v4l2_timebase, v4l2_get_timebase(avbuf));
-+#endif
- }
- 
- static void set_buf_length(V4L2Buffer *out, unsigned int plane, uint32_t bytesused, uint32_t length)
-@@ -435,7 +449,7 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data)
- 
-         ff_mutex_lock(&ctx->lock);
- 
--        avbuf->status = V4L2BUF_AVAILABLE;
-+        ff_v4l2_buffer_set_avail(avbuf);
- 
-         if (s->draining && V4L2_TYPE_IS_OUTPUT(ctx->type)) {
-             av_log(logger(avbuf), AV_LOG_DEBUG, "%s: Buffer avail\n", ctx->name);
-@@ -599,6 +613,38 @@ static int is_chroma(const AVPixFmtDescriptor *desc, int i, int num_planes)
-     return i != 0  && !(i == num_planes - 1 && (desc->flags & AV_PIX_FMT_FLAG_ALPHA));
- }
- 
-+static int v4l2_buffer_primeframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-+{
-+    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
-+
-+    if (frame->format != AV_PIX_FMT_DRM_PRIME || !src)
-+        return AVERROR(EINVAL);
-+
-+    av_assert0(out->buf.memory == V4L2_MEMORY_DMABUF);
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(out->buf.type)) {
-+        // Only currently cope with single buffer types
-+        if (out->buf.length != 1)
-+            return AVERROR_PATCHWELCOME;
-+        if (src->nb_objects != 1)
-+            return AVERROR(EINVAL);
-+
-+        out->planes[0].m.fd = src->objects[0].fd;
-+    }
-+    else {
-+        if (src->nb_objects != 1)
-+            return AVERROR(EINVAL);
-+
-+        out->buf.m.fd      = src->objects[0].fd;
-+    }
-+
-+    // No need to copy src AVDescriptor and if we did then we may confuse
-+    // fd close on free
-+    out->ref_buf = av_buffer_ref(frame->buf[0]);
-+
-+    return 0;
-+}
-+
- static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
- {
-     int i;
-@@ -678,7 +724,7 @@ static int v4l2_buffer_swframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-  *
-  ******************************************************************************/
- 
--int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts)
- {
-     out->buf.flags = frame->key_frame ?
-         (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
-@@ -688,10 +734,15 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out)
-     v4l2_set_color(out, frame->color_primaries, frame->colorspace, frame->color_trc);
-     v4l2_set_color_range(out, frame->color_range);
-     // PTS & interlace are buffer vars
--    v4l2_set_pts(out, frame->pts);
-+    if (track_ts)
-+        out->buf.timestamp = tv_from_int(track_ts);
-+    else
-+        v4l2_set_pts(out, frame->pts);
-     v4l2_set_interlace(out, frame->interlaced_frame, frame->top_field_first);
- 
--    return v4l2_buffer_swframe_to_buf(frame, out);
-+    return frame->format == AV_PIX_FMT_DRM_PRIME ?
-+        v4l2_buffer_primeframe_to_buf(frame, out) :
-+        v4l2_buffer_swframe_to_buf(frame, out);
- }
- 
- int ff_v4l2_buffer_buf_to_avframe(AVFrame *frame, V4L2Buffer *avbuf)
-@@ -754,6 +805,7 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
- 
-     pkt->size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type) ? avbuf->buf.m.planes[0].bytesused : avbuf->buf.bytesused;
-     pkt->data = (uint8_t*)avbuf->plane_info[0].mm_addr + avbuf->planes[0].data_offset;
-+    pkt->flags = 0;
- 
-     if (avbuf->buf.flags & V4L2_BUF_FLAG_KEYFRAME)
-         pkt->flags |= AV_PKT_FLAG_KEY;
-@@ -768,8 +820,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *avbuf)
-     return 0;
- }
- 
--int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
--                                    const void *extdata, size_t extlen)
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
-+                                    const void *extdata, size_t extlen,
-+                                    const int64_t timestamp)
- {
-     int ret;
- 
-@@ -783,7 +836,10 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-     if (ret && ret != AVERROR(ENOMEM))
-         return ret;
- 
--    v4l2_set_pts(out, pkt->pts);
-+    if (timestamp)
-+        out->buf.timestamp = tv_from_int(timestamp);
-+    else
-+        v4l2_set_pts(out, pkt->pts);
- 
-     out->buf.flags = (pkt->flags & AV_PKT_FLAG_KEY) != 0 ?
-         (out->buf.flags | V4L2_BUF_FLAG_KEYFRAME) :
-@@ -794,7 +850,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
- 
- int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out)
- {
--    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0);
-+    return ff_v4l2_buffer_avpkt_to_buf_ext(pkt, out, NULL, 0, 0);
- }
- 
- 
-@@ -814,13 +870,15 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
-             close(avbuf->drm_frame.objects[i].fd);
-     }
- 
-+    av_buffer_unref(&avbuf->ref_buf);
-+
-     ff_weak_link_unref(&avbuf->context_wl);
- 
-     av_free(avbuf);
- }
- 
- 
--int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx)
-+int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ctx, enum v4l2_memory mem)
- {
-     int ret, i;
-     V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
-@@ -837,7 +895,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
-     }
- 
-     avbuf->context = ctx;
--    avbuf->buf.memory = V4L2_MEMORY_MMAP;
-+    avbuf->buf.memory = mem;
-     avbuf->buf.type = ctx->type;
-     avbuf->buf.index = index;
- 
-@@ -867,6 +925,8 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
-         avbuf->num_planes = 1;
- 
-     for (i = 0; i < avbuf->num_planes; i++) {
-+        const int want_mmap = avbuf->buf.memory == V4L2_MEMORY_MMAP &&
-+            (V4L2_TYPE_IS_OUTPUT(ctx->type) || !buf_to_m2mctx(avbuf)->output_drm);
- 
-         avbuf->plane_info[i].bytesperline = V4L2_TYPE_IS_MULTIPLANAR(ctx->type) ?
-             ctx->format.fmt.pix_mp.plane_fmt[i].bytesperline :
-@@ -875,21 +935,17 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
-         if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-             avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
- 
--            if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
--                !buf_to_m2mctx(avbuf)->output_drm) {
-+            if (want_mmap)
-                 avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
-                                                PROT_READ | PROT_WRITE, MAP_SHARED,
-                                                buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
--            }
-         } else {
-             avbuf->plane_info[i].length = avbuf->buf.length;
- 
--            if ((V4L2_TYPE_IS_OUTPUT(ctx->type) && buf_to_m2mctx(avbuf)->output_drm) ||
--                !buf_to_m2mctx(avbuf)->output_drm) {
-+            if (want_mmap)
-                 avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
-                                                PROT_READ | PROT_WRITE, MAP_SHARED,
-                                                buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.offset);
--            }
-         }
- 
-         if (avbuf->plane_info[i].mm_addr == MAP_FAILED) {
-diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
-index 3b7ca4d99e..1ac32c5989 100644
---- a/libavcodec/v4l2_buffers.h
-+++ b/libavcodec/v4l2_buffers.h
-@@ -59,6 +59,10 @@ typedef struct V4L2Buffer {
- 
-     /* DRM descriptor */
-     AVDRMFrameDescriptor drm_frame;
-+    /* For DRM_PRIME encode - need to keep a ref to the source buffer till we
-+     * are done
-+     */
-+    AVBufferRef * ref_buf;
- 
-     /* keep track of the mmap address and mmap length */
-     struct V4L2Plane_info {
-@@ -110,8 +114,9 @@ int ff_v4l2_buffer_buf_to_avpkt(AVPacket *pkt, V4L2Buffer *buf);
-  */
- int ff_v4l2_buffer_avpkt_to_buf(const AVPacket *pkt, V4L2Buffer *out);
- 
--int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
--                                    const void *extdata, size_t extlen);
-+int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket * const pkt, V4L2Buffer * const out,
-+                                    const void *extdata, size_t extlen,
-+                                    const int64_t timestamp);
- 
- /**
-  * Extracts the data from an AVFrame to a V4L2Buffer
-@@ -121,7 +126,7 @@ int ff_v4l2_buffer_avpkt_to_buf_ext(const AVPacket *pkt, V4L2Buffer *out,
-  *
-  * @returns 0 in case of success, a negative AVERROR code otherwise
-  */
--int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
-+int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out, const int64_t track_ts);
- 
- /**
-  * Initializes a V4L2Buffer
-@@ -131,7 +136,7 @@ int ff_v4l2_buffer_avframe_to_buf(const AVFrame *frame, V4L2Buffer *out);
-  *
-  * @returns 0 in case of success, a negative AVERROR code otherwise
-  */
--int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx);
-+int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context *ctx, enum v4l2_memory mem);
- 
- /**
-  * Enqueues a V4L2Buffer
-@@ -142,5 +147,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef **avbuf, int index, struct V4L2Context
-  */
- int ff_v4l2_buffer_enqueue(V4L2Buffer* avbuf);
- 
-+static inline void
-+ff_v4l2_buffer_set_avail(V4L2Buffer* const avbuf)
-+{
-+    avbuf->status = V4L2BUF_AVAILABLE;
-+    av_buffer_unref(&avbuf->ref_buf);
-+}
-+
- 
- #endif // AVCODEC_V4L2_BUFFERS_H
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index b3662aedaa..7a707d21fc 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -43,6 +43,160 @@ struct v4l2_format_update {
-     int update_avfmt;
- };
- 
-+
-+static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
-+{
-+    return (int64_t)n;
-+}
-+
-+static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
-+{
-+    return (unsigned int)pts;
-+}
-+
-+// FFmpeg requires us to propagate a number of vars from the coded pkt into
-+// the decoded frame. The only thing that tracks like that in V4L2 stateful
-+// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
-+// guarantees about PTS being unique or specified for every frame so replace
-+// the supplied PTS with a simple incrementing number and keep a circular
-+// buffer of all the things we want preserved (including the original PTS)
-+// indexed by the tracking no.
-+static int64_t
-+xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPacket *const avpkt)
-+{
-+    int64_t track_pts;
-+
-+    // Avoid 0
-+    if (++x->track_no == 0)
-+        x->track_no = 1;
-+
-+    track_pts = track_to_pts(avctx, x->track_no);
-+
-+    av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
-+    x->last_pkt_dts = avpkt->dts;
-+    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-+        .discard          = 0,
-+        .pending          = 1,
-+        .pkt_size         = avpkt->size,
-+        .pts              = avpkt->pts,
-+        .dts              = avpkt->dts,
-+        .reordered_opaque = avctx->reordered_opaque,
-+        .pkt_pos          = avpkt->pos,
-+        .pkt_duration     = avpkt->duration,
-+        .track_pts        = track_pts
-+    };
-+    return track_pts;
-+}
-+
-+static int64_t
-+xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFrame *const frame)
-+{
-+    int64_t track_pts;
-+
-+    // Avoid 0
-+    if (++x->track_no == 0)
-+        x->track_no = 1;
-+
-+    track_pts = track_to_pts(avctx, x->track_no);
-+
-+    av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
-+    x->last_pkt_dts = frame->pkt_dts;
-+    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-+        .discard          = 0,
-+        .pending          = 1,
-+        .pkt_size         = 0,
-+        .pts              = frame->pts,
-+        .dts              = AV_NOPTS_VALUE,
-+        .reordered_opaque = frame->reordered_opaque,
-+        .pkt_pos          = frame->pkt_pos,
-+        .pkt_duration     = frame->pkt_duration,
-+        .track_pts        = track_pts
-+    };
-+    return track_pts;
-+}
-+
-+
-+// Returns -1 if we should discard the frame
-+static int
-+xlat_pts_frame_out(AVCodecContext *const avctx,
-+             xlat_track_t * const x,
-+             AVFrame *const frame)
-+{
-+    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
-+    V4L2m2mTrackEl *const t = x->track_els + n;
-+    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
-+    {
-+        av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
-+               "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        frame->pts              = AV_NOPTS_VALUE;
-+        frame->pkt_dts          = x->last_pkt_dts;
-+        frame->reordered_opaque = x->last_opaque;
-+        frame->pkt_pos          = -1;
-+        frame->pkt_duration     = 0;
-+        frame->pkt_size         = -1;
-+    }
-+    else if (!t->discard)
-+    {
-+        frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
-+        frame->pkt_dts          = x->last_pkt_dts;
-+        frame->reordered_opaque = t->reordered_opaque;
-+        frame->pkt_pos          = t->pkt_pos;
-+        frame->pkt_duration     = t->pkt_duration;
-+        frame->pkt_size         = t->pkt_size;
-+
-+        x->last_opaque = x->track_els[n].reordered_opaque;
-+        if (frame->pts != AV_NOPTS_VALUE)
-+            x->last_pts = frame->pts;
-+        t->pending = 0;
-+    }
-+    else
-+    {
-+        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-+        return -1;
-+    }
-+
-+    av_log(avctx, AV_LOG_TRACE, "Out frame PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
-+           frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
-+    return 0;
-+}
-+
-+// Returns -1 if we should discard the frame
-+static int
-+xlat_pts_pkt_out(AVCodecContext *const avctx,
-+             xlat_track_t * const x,
-+             AVPacket *const pkt)
-+{
-+    unsigned int n = pts_to_track(avctx, pkt->pts) % FF_V4L2_M2M_TRACK_SIZE;
-+    V4L2m2mTrackEl *const t = x->track_els + n;
-+    if (pkt->pts == AV_NOPTS_VALUE || pkt->pts != t->track_pts)
-+    {
-+        av_log(avctx, pkt->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
-+               "Pkt tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
-+        pkt->pts                = AV_NOPTS_VALUE;
-+    }
-+    else if (!t->discard)
-+    {
-+        pkt->pts                = t->pending ? t->pts : AV_NOPTS_VALUE;
-+
-+        x->last_opaque = x->track_els[n].reordered_opaque;
-+        if (pkt->pts != AV_NOPTS_VALUE)
-+            x->last_pts = pkt->pts;
-+        t->pending = 0;
-+    }
-+    else
-+    {
-+        av_log(avctx, AV_LOG_DEBUG, "Discard packet (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", pkt->pts, n, t->track_pts);
-+        return -1;
-+    }
-+
-+    // * Would like something much better than this...xlat(offset + out_count)?
-+    pkt->dts = pkt->pts;
-+    av_log(avctx, AV_LOG_TRACE, "Out pkt PTS=%" PRId64 ", track=%"PRId64", n=%d\n",
-+           pkt->pts, t->track_pts, n);
-+    return 0;
-+}
-+
-+
- static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
- {
-     return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
-@@ -353,12 +507,14 @@ dq_buf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf)
-     atomic_fetch_sub(&ctx->q_count, 1);
- 
-     avbuf = (V4L2Buffer *)ctx->bufrefs[buf.index]->data;
--    avbuf->status = V4L2BUF_AVAILABLE;
-+    ff_v4l2_buffer_set_avail(avbuf);
-     avbuf->buf = buf;
-     if (is_mp) {
-         memcpy(avbuf->planes, planes, sizeof(planes));
-         avbuf->buf.m.planes = avbuf->planes;
-     }
-+    // Done with any attached buffer
-+    av_buffer_unref(&avbuf->ref_buf);
- 
-     if (V4L2_TYPE_IS_CAPTURE(ctx->type)) {
-         // Zero length cap buffer return == EOS
-@@ -733,7 +889,7 @@ static void flush_all_buffers_status(V4L2Context* const ctx)
-     for (i = 0; i < ctx->num_buffers; ++i) {
-         struct V4L2Buffer * const buf = (struct V4L2Buffer *)ctx->bufrefs[i]->data;
-         if (buf->status == V4L2BUF_IN_DRIVER)
--            buf->status = V4L2BUF_AVAILABLE;
-+            ff_v4l2_buffer_set_avail(buf);
-     }
-     atomic_store(&ctx->q_count, 0);
- }
-@@ -787,6 +943,8 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
-     {
-         if (cmd == VIDIOC_STREAMOFF)
-             flush_all_buffers_status(ctx);
-+        else
-+            ctx->first_buf = 1;
- 
-         ctx->streamon = (cmd == VIDIOC_STREAMON);
-         av_log(avctx, AV_LOG_DEBUG, "%s set status %d (%s) OK\n", ctx->name,
-@@ -803,14 +961,16 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd)
- 
- int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
- {
--    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    V4L2m2mContext *const s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-+    int64_t track_ts;
-     V4L2Buffer* avbuf;
-     int ret;
- 
-     if (!frame) {
-         ret = v4l2_stop_encode(ctx);
-         if (ret)
--            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
-+            av_log(avctx, AV_LOG_ERROR, "%s stop_encode\n", ctx->name);
-         s->draining= 1;
-         return 0;
-     }
-@@ -819,7 +979,9 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* frame)
-     if (!avbuf)
-         return AVERROR(EAGAIN);
- 
--    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf);
-+    track_ts = xlat_pts_frame_in(avctx, &s->xlat, frame);
-+
-+    ret = ff_v4l2_buffer_avframe_to_buf(frame, avbuf, track_ts);
-     if (ret)
-         return ret;
- 
-@@ -830,14 +992,16 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-                                    const void * extdata, size_t extlen)
- {
-     V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-     V4L2Buffer* avbuf;
-     int ret;
-+    int64_t track_ts;
- 
-     if (!pkt->size) {
-         ret = v4l2_stop_decode(ctx);
-         // Log but otherwise ignore stop failure
-         if (ret)
--            av_log(logger(ctx), AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
-+            av_log(avctx, AV_LOG_ERROR, "%s stop_decode failed: err=%d\n", ctx->name, ret);
-         s->draining = 1;
-         return 0;
-     }
-@@ -846,7 +1010,9 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
-     if (!avbuf)
-         return AVERROR(EAGAIN);
- 
--    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen);
-+    track_ts = xlat_pts_pkt_in(avctx, &s->xlat, pkt);
-+
-+    ret = ff_v4l2_buffer_avpkt_to_buf_ext(pkt, avbuf, extdata, extlen, track_ts);
-     if (ret == AVERROR(ENOMEM))
-         av_log(logger(ctx), AV_LOG_ERROR, "Buffer overflow in %s: pkt->size=%d > buf->length=%d\n",
-                __func__, pkt->size, avbuf->planes[0].length);
-@@ -858,24 +1024,36 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt,
- 
- int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
- {
-+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-     V4L2Buffer *avbuf;
-     int rv;
- 
--    if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
--        return rv;
-+    do {
-+        if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
-+            return rv;
-+        if ((rv = ff_v4l2_buffer_buf_to_avframe(frame, avbuf)) != 0)
-+            return rv;
-+    } while (xlat_pts_frame_out(avctx, &s->xlat, frame) != 0);
- 
--    return ff_v4l2_buffer_buf_to_avframe(frame, avbuf);
-+   return 0;
- }
- 
- int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
- {
-+    V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-+    AVCodecContext *const avctx = s->avctx;
-     V4L2Buffer *avbuf;
-     int rv;
- 
--    if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
--        return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
-+    do {
-+        if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
-+            return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
-+        if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
-+            return rv;
-+    } while (xlat_pts_pkt_out(avctx, &s->xlat, pkt) != 0);
- 
--    return ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf);
-+    return 0;
- }
- 
- int ff_v4l2_context_get_format(V4L2Context* ctx, int probe)
-@@ -951,7 +1129,7 @@ void ff_v4l2_context_release(V4L2Context* ctx)
- }
- 
- 
--static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers)
-+static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers, const enum v4l2_memory mem)
- {
-     V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-     struct v4l2_requestbuffers req;
-@@ -962,7 +1140,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
- 
-     memset(&req, 0, sizeof(req));
-     req.count = req_buffers;
--    req.memory = V4L2_MEMORY_MMAP;
-+    req.memory = mem;
-     req.type = ctx->type;
-     while ((ret = ioctl(s->fd, VIDIOC_REQBUFS, &req)) == -1) {
-         if (errno != EINTR) {
-@@ -986,7 +1164,7 @@ static int create_buffers(V4L2Context* const ctx, const unsigned int req_buffers
-     }
- 
-     for (i = 0; i < ctx->num_buffers; i++) {
--        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx);
-+        ret = ff_v4l2_buffer_initialize(&ctx->bufrefs[i], i, ctx, mem);
-         if (ret) {
-             av_log(logger(ctx), AV_LOG_ERROR, "%s buffer[%d] initialization (%s)\n", ctx->name, i, av_err2str(ret));
-             goto fail_release;
-@@ -1052,7 +1230,7 @@ int ff_v4l2_context_init(V4L2Context* ctx)
-         goto fail_unref_hwframes;
-     }
- 
--    ret = create_buffers(ctx, ctx->num_buffers);
-+    ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
-     if (ret < 0)
-         goto fail_unref_hwframes;
- 
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 0efff58f18..21265f1bd7 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -91,11 +91,19 @@ typedef struct V4L2Context {
-      */
-     int num_buffers;
- 
-+    /**
-+     * Buffer memory type V4L2_MEMORY_MMAP or V4L2_MEMORY_DMABUF
-+     */
-+    enum v4l2_memory buf_mem;
-+
-     /**
-      * Whether the stream has been started (VIDIOC_STREAMON has been sent).
-      */
-     int streamon;
- 
-+    /* 1st buffer after stream on */
-+    int first_buf;
-+
-     /**
-      *  Either no more buffers available or an unrecoverable error was notified
-      *  by the V4L2 kernel driver: once set the context has to be exited.
-@@ -105,11 +113,10 @@ typedef struct V4L2Context {
-     int flag_last;
- 
-     /**
--     * PTS rescale not wanted
--     * If the PTS is just a dummy frame count then rescale is
--     * actively harmful
-+     * If NZ then when Qing frame/pkt use this rather than the
-+     * "real" PTS
-      */
--    int no_pts_rescale;
-+    uint64_t track_ts;
- 
-     AVBufferRef *frames_ref;
-     atomic_int q_count;
-diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index 6dd01e2e00..1e30d15fd8 100644
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -35,6 +35,14 @@
- #include "v4l2_fmt.h"
- #include "v4l2_m2m.h"
- 
-+static void
-+xlat_init(xlat_track_t * const x)
-+{
-+    memset(x, 0, sizeof(*x));
-+    x->last_pts = AV_NOPTS_VALUE;
-+}
-+
-+
- static inline int v4l2_splane_video(struct v4l2_capability *cap)
- {
-     if (cap->capabilities & (V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT) &&
-@@ -67,7 +75,9 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
- 
-     s->capture.done = s->output.done = 0;
-     s->capture.name = "capture";
-+    s->capture.buf_mem = V4L2_MEMORY_MMAP;
-     s->output.name = "output";
-+    s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
-     atomic_init(&s->refcount, 0);
-     sem_init(&s->refsync, 0, 0);
- 
-@@ -334,35 +344,38 @@ int ff_v4l2_m2m_codec_init(V4L2m2mPriv *priv)
-     return v4l2_configure_contexts(s);
- }
- 
--int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **s)
-+int ff_v4l2_m2m_create_context(V4L2m2mPriv *priv, V4L2m2mContext **pps)
- {
--    *s = av_mallocz(sizeof(V4L2m2mContext));
--    if (!*s)
-+    V4L2m2mContext * const s = av_mallocz(sizeof(V4L2m2mContext));
-+
-+    *pps = NULL;
-+    if (!s)
-         return AVERROR(ENOMEM);
- 
--    priv->context_ref = av_buffer_create((uint8_t *) *s, sizeof(V4L2m2mContext),
-+    priv->context_ref = av_buffer_create((uint8_t *)s, sizeof(*s),
-                                          &v4l2_m2m_destroy_context, NULL, 0);
-     if (!priv->context_ref) {
--        av_freep(s);
-+        av_free(s);
-         return AVERROR(ENOMEM);
-     }
- 
-     /* assign the context */
--    priv->context = *s;
--    (*s)->priv = priv;
-+    priv->context = s;
-+    s->priv = priv;
- 
-     /* populate it */
--    priv->context->capture.num_buffers = priv->num_capture_buffers;
--    priv->context->output.num_buffers  = priv->num_output_buffers;
--    priv->context->self_ref = priv->context_ref;
--    priv->context->fd = -1;
-+    s->capture.num_buffers = priv->num_capture_buffers;
-+    s->output.num_buffers  = priv->num_output_buffers;
-+    s->self_ref = priv->context_ref;
-+    s->fd = -1;
-+    xlat_init(&s->xlat);
- 
-     priv->context->frame = av_frame_alloc();
-     if (!priv->context->frame) {
-         av_buffer_unref(&priv->context_ref);
--        *s = NULL; /* freed when unreferencing context_ref */
-         return AVERROR(ENOMEM);
-     }
- 
-+    *pps = s;
-     return 0;
- }
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 19d618698d..d6cdaf65e1 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -103,6 +103,9 @@ typedef struct V4L2m2mContext {
-     /* generate DRM frames */
-     int output_drm;
- 
-+    /* input frames are drmprime */
-+    int input_drm;
-+
-     /* Frame tracking */
-     xlat_track_t xlat;
-     int pending_hw;
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 7e17044706..fbbfc81342 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -169,96 +169,17 @@ static int v4l2_prepare_decoder(V4L2m2mContext *s)
-     return 0;
- }
- 
--static inline int64_t track_to_pts(AVCodecContext *avctx, unsigned int n)
--{
--    return (int64_t)n;
--}
--
--static inline unsigned int pts_to_track(AVCodecContext *avctx, const int64_t pts)
--{
--    return (unsigned int)pts;
--}
--
--// FFmpeg requires us to propagate a number of vars from the coded pkt into
--// the decoded frame. The only thing that tracks like that in V4L2 stateful
--// is timestamp. PTS maps to timestamp for this decode. FFmpeg makes no
--// guarantees about PTS being unique or specified for every frame so replace
--// the supplied PTS with a simple incrementing number and keep a circular
--// buffer of all the things we want preserved (including the original PTS)
--// indexed by the tracking no.
- static void
--xlat_pts_in(AVCodecContext *const avctx, xlat_track_t *const x, AVPacket *const avpkt)
--{
--    int64_t track_pts;
--
--    // Avoid 0
--    if (++x->track_no == 0)
--        x->track_no = 1;
--
--    track_pts = track_to_pts(avctx, x->track_no);
--
--    av_log(avctx, AV_LOG_TRACE, "In PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
--    x->last_pkt_dts = avpkt->dts;
--    x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
--        .discard          = 0,
--        .pending          = 1,
--        .pkt_size         = avpkt->size,
--        .pts              = avpkt->pts,
--        .dts              = avpkt->dts,
--        .reordered_opaque = avctx->reordered_opaque,
--        .pkt_pos          = avpkt->pos,
--        .pkt_duration     = avpkt->duration,
--        .track_pts        = track_pts
--    };
--    avpkt->pts = track_pts;
--}
--
--// Returns -1 if we should discard the frame
--static int
--xlat_pts_out(AVCodecContext *const avctx,
--             xlat_track_t * const x,
-+set_best_effort_pts(AVCodecContext *const avctx,
-              pts_stats_t * const ps,
-              AVFrame *const frame)
- {
--    unsigned int n = pts_to_track(avctx, frame->pts) % FF_V4L2_M2M_TRACK_SIZE;
--    V4L2m2mTrackEl *const t = x->track_els + n;
--    if (frame->pts == AV_NOPTS_VALUE || frame->pts != t->track_pts)
--    {
--        av_log(avctx, AV_LOG_INFO, "Tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
--        frame->pts              = AV_NOPTS_VALUE;
--        frame->pkt_dts          = x->last_pkt_dts;
--        frame->reordered_opaque = x->last_opaque;
--        frame->pkt_pos          = -1;
--        frame->pkt_duration     = 0;
--        frame->pkt_size         = -1;
--    }
--    else if (!t->discard)
--    {
--        frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
--        frame->pkt_dts          = x->last_pkt_dts;
--        frame->reordered_opaque = t->reordered_opaque;
--        frame->pkt_pos          = t->pkt_pos;
--        frame->pkt_duration     = t->pkt_duration;
--        frame->pkt_size         = t->pkt_size;
--
--        x->last_opaque = x->track_els[n].reordered_opaque;
--        if (frame->pts != AV_NOPTS_VALUE)
--            x->last_pts = frame->pts;
--        t->pending = 0;
--    }
--    else
--    {
--        av_log(avctx, AV_LOG_DEBUG, "Discard frame (flushed): pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
--        return -1;
--    }
--
-     pts_stats_add(ps, frame->pts);
- 
-     frame->best_effort_timestamp = pts_stats_guess(ps);
-     frame->pkt_dts               = frame->pts;  // We can't emulate what s/w does in a useful manner?
--    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 ", track=%"PRId64", n=%d\n",
--           frame->pts, frame->best_effort_timestamp, frame->pkt_dts, t->track_pts, n);
--    return 0;
-+    av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
-+           frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
- }
- 
- static void
-@@ -272,13 +193,6 @@ xlat_flush(xlat_track_t * const x)
-     x->last_pts = AV_NOPTS_VALUE;
- }
- 
--static void
--xlat_init(xlat_track_t * const x)
--{
--    memset(x, 0, sizeof(*x));
--    x->last_pts = AV_NOPTS_VALUE;
--}
--
- static int
- xlat_pending(const xlat_track_t * const x)
- {
-@@ -419,8 +333,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-             av_log(avctx, AV_LOG_ERROR, "Failed to get coded packet: err=%d\n", ret);
-             return ret;
-         }
--
--        xlat_pts_in(avctx, &s->xlat, &s->buf_pkt);
-     }
- 
-     if (s->draining) {
-@@ -542,49 +454,47 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                 prefer_dq ? 5 :
-                 src_rv == NQ_Q_FULL ? -1 : 0;
- 
--            do {
--                // Dequeue frame will unref any previous contents of frame
--                // if it returns success so we don't need an explicit unref
--                // when discarding
--                // This returns AVERROR(EAGAIN) on timeout or if
--                // there is room in the input Q and timeout == -1
--                dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
--
--                // Failure due to no buffer in Q?
--                if (dst_rv == AVERROR(ENOSPC)) {
--                    // Wait & retry
--                    if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
--                        dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
--                    }
-+            // Dequeue frame will unref any previous contents of frame
-+            // if it returns success so we don't need an explicit unref
-+            // when discarding
-+            // This returns AVERROR(EAGAIN) on timeout or if
-+            // there is room in the input Q and timeout == -1
-+            dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
-+
-+            // Failure due to no buffer in Q?
-+            if (dst_rv == AVERROR(ENOSPC)) {
-+                // Wait & retry
-+                if ((dst_rv = qbuf_wait(avctx, &s->capture)) == 0) {
-+                    dst_rv = ff_v4l2_context_dequeue_frame(&s->capture, frame, t);
-                 }
-+            }
-+
-+            // Adjust dynamic pending threshold
-+            if (dst_rv == 0) {
-+                if (--s->pending_hw < PENDING_HW_MIN)
-+                    s->pending_hw = PENDING_HW_MIN;
-+                s->pending_n = 0;
- 
--                // Adjust dynamic pending threshold
--                if (dst_rv == 0) {
--                    if (--s->pending_hw < PENDING_HW_MIN)
--                        s->pending_hw = PENDING_HW_MIN;
-+                set_best_effort_pts(avctx, &s->pts_stat, frame);
-+            }
-+            else if (dst_rv == AVERROR(EAGAIN)) {
-+                if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
-+                    s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
-                     s->pending_n = 0;
-                 }
--                else if (dst_rv == AVERROR(EAGAIN)) {
--                    if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
--                        s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
--                        s->pending_n = 0;
--                    }
--                }
-+            }
- 
--                if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
--                    av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
--                    dst_rv = AVERROR_EOF;
--                    s->capture.done = 1;
--                }
--                else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
--                    av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
--                           s->draining, s->capture.done);
--                else if (dst_rv && dst_rv != AVERROR(EAGAIN))
--                    av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
--                           s->draining, s->capture.done, dst_rv);
--
--                // Go again if we got a frame that we need to discard
--            } while (dst_rv == 0 && xlat_pts_out(avctx, &s->xlat, &s->pts_stat, frame));
-+            if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
-+                av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
-+                dst_rv = AVERROR_EOF;
-+                s->capture.done = 1;
-+            }
-+            else if (dst_rv == AVERROR_EOF && (s->draining || s->capture.done))
-+                av_log(avctx, AV_LOG_DEBUG, "Dequeue EOF: draining=%d, cap.done=%d\n",
-+                       s->draining, s->capture.done);
-+            else if (dst_rv && dst_rv != AVERROR(EAGAIN))
-+                av_log(avctx, AV_LOG_ERROR, "Packet dequeue failure: draining=%d, cap.done=%d, err=%d\n",
-+                       s->draining, s->capture.done, dst_rv);
-         }
- 
-         ++i;
-@@ -791,7 +701,6 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     if (ret < 0)
-         return ret;
- 
--    xlat_init(&s->xlat);
-     pts_stats_init(&s->pts_stat, avctx, "decoder");
-     s->pending_hw = PENDING_HW_MIN;
- 
-@@ -810,12 +719,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     output->av_codec_id = avctx->codec_id;
-     output->av_pix_fmt  = AV_PIX_FMT_NONE;
-     output->min_buf_size = max_coded_size(avctx);
--    output->no_pts_rescale = 1;
- 
-     capture->av_codec_id = AV_CODEC_ID_RAWVIDEO;
-     capture->av_pix_fmt = avctx->pix_fmt;
-     capture->min_buf_size = 0;
--    capture->no_pts_rescale = 1;
- 
-     /* the client requests the codec to generate DRM frames:
-      *   - data[0] will therefore point to the returned AVDRMFrameDescriptor
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index 9a0837ecf3..05ff6ba726 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -24,6 +24,8 @@
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
- #include <search.h>
-+#include <drm_fourcc.h>
-+
- #include "encode.h"
- #include "libavcodec/avcodec.h"
- #include "libavutil/pixdesc.h"
-@@ -38,6 +40,34 @@
- #define MPEG_CID(x) V4L2_CID_MPEG_VIDEO_##x
- #define MPEG_VIDEO(x) V4L2_MPEG_VIDEO_##x
- 
-+// P030 should be defined in drm_fourcc.h and hopefully will be sometime
-+// in the future but until then...
-+#ifndef DRM_FORMAT_P030
-+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV15
-+#define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5')
-+#endif
-+
-+#ifndef DRM_FORMAT_NV20
-+#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0')
-+#endif
-+
-+#ifndef V4L2_CID_CODEC_BASE
-+#define V4L2_CID_CODEC_BASE V4L2_CID_MPEG_BASE
-+#endif
-+
-+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
-+// in videodev2.h hopefully will be sometime in the future but until then...
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128
-+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
-+#endif
-+
-+#ifndef V4L2_PIX_FMT_NV12_COL128
-+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
-+#endif
-+
- static inline void v4l2_set_timeperframe(V4L2m2mContext *s, unsigned int num, unsigned int den)
- {
-     struct v4l2_streamparm parm = { 0 };
-@@ -148,15 +178,14 @@ static inline int v4l2_mpeg4_profile_from_ff(int p)
- static int v4l2_check_b_frame_support(V4L2m2mContext *s)
- {
-     if (s->avctx->max_b_frames)
--        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support b-frames yet\n");
-+        av_log(s->avctx, AV_LOG_WARNING, "Encoder does not support %d b-frames yet\n", s->avctx->max_b_frames);
- 
--    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), 0, "number of B-frames", 0);
-+    v4l2_set_ext_ctrl(s, MPEG_CID(B_FRAMES), s->avctx->max_b_frames, "number of B-frames", 1);
-     v4l2_get_ext_ctrl(s, MPEG_CID(B_FRAMES), &s->avctx->max_b_frames, "number of B-frames", 0);
-     if (s->avctx->max_b_frames == 0)
-         return 0;
- 
-     avpriv_report_missing_feature(s->avctx, "DTS/PTS calculation for V4L2 encoding");
--
-     return AVERROR_PATCHWELCOME;
- }
- 
-@@ -271,13 +300,184 @@ static int v4l2_prepare_encoder(V4L2m2mContext *s)
-     return 0;
- }
- 
-+static int avdrm_to_v4l2(struct v4l2_format * const format, const AVFrame * const frame)
-+{
-+    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
-+
-+    const uint32_t drm_fmt = src->layers[0].format;
-+    // Treat INVALID as LINEAR
-+    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
-+        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
-+    uint32_t pix_fmt = 0;
-+    uint32_t w = 0;
-+    uint32_t h = 0;
-+    uint32_t bpl = src->layers[0].planes[0].pitch;
-+
-+    // We really don't expect multiple layers
-+    // All formats that we currently cope with are single object
-+
-+    if (src->nb_layers != 1 || src->nb_objects != 1)
-+        return AVERROR(EINVAL);
-+
-+    switch (drm_fmt) {
-+        case DRM_FORMAT_YUV420:
-+            if (mod == DRM_FORMAT_MOD_LINEAR) {
-+                if (src->layers[0].nb_planes != 3)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_YUV420;
-+                h = src->layers[0].planes[1].offset / bpl;
-+                w = bpl;
-+            }
-+            break;
-+
-+        case DRM_FORMAT_NV12:
-+            if (mod == DRM_FORMAT_MOD_LINEAR) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_NV12;
-+                h = src->layers[0].planes[1].offset / bpl;
-+                w = bpl;
-+            }
-+            else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_NV12_COL128;
-+                w = bpl;
-+                h = src->layers[0].planes[1].offset / 128;
-+                bpl = fourcc_mod_broadcom_param(mod);
-+            }
-+            break;
-+
-+        case DRM_FORMAT_P030:
-+            if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt =  V4L2_PIX_FMT_NV12_10_COL128;
-+                w = bpl / 2;  // Matching lie to how we construct this
-+                h = src->layers[0].planes[1].offset / 128;
-+                bpl = fourcc_mod_broadcom_param(mod);
-+            }
-+            break;
-+
-+        default:
-+            break;
-+    }
-+
-+    if (!pix_fmt)
-+        return AVERROR(EINVAL);
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
-+        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
-+
-+        pix->width = w;
-+        pix->height = h;
-+        pix->pixelformat = pix_fmt;
-+        pix->plane_fmt[0].bytesperline = bpl;
-+        pix->num_planes = 1;
-+    }
-+    else {
-+        struct v4l2_pix_format *const pix = &format->fmt.pix;
-+
-+        pix->width = w;
-+        pix->height = h;
-+        pix->pixelformat = pix_fmt;
-+        pix->bytesperline = bpl;
-+    }
-+
-+    return 0;
-+}
-+
-+// Do we have similar enough formats to be usable?
-+static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format * const b)
-+{
-+    if (a->type != b->type)
-+        return 0;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(a->type)) {
-+        const struct v4l2_pix_format_mplane *const pa = &a->fmt.pix_mp;
-+        const struct v4l2_pix_format_mplane *const pb = &b->fmt.pix_mp;
-+        unsigned int i;
-+        if (pa->pixelformat != pb->pixelformat ||
-+            pa->num_planes != pb->num_planes)
-+            return 0;
-+        for (i = 0; i != pa->num_planes; ++i) {
-+            if (pa->plane_fmt[i].bytesperline != pb->plane_fmt[i].bytesperline)
-+                return 0;
-+        }
-+    }
-+    else {
-+        const struct v4l2_pix_format *const pa = &a->fmt.pix;
-+        const struct v4l2_pix_format *const pb = &b->fmt.pix;
-+        if (pa->pixelformat != pb->pixelformat ||
-+            pa->bytesperline != pb->bytesperline)
-+            return 0;
-+    }
-+    return 1;
-+}
-+
-+
- static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
- {
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     V4L2Context *const output = &s->output;
- 
-+    // Signal EOF if needed
-+    if (!frame) {
-+        return ff_v4l2_context_enqueue_frame(output, frame);
-+    }
-+
-+    if (s->input_drm && !output->streamon) {
-+        int rv;
-+        struct v4l2_format req_format = {.type = output->format.type};
-+
-+        // Set format when we first get a buffer
-+        if ((rv = avdrm_to_v4l2(&req_format, frame)) != 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to get V4L2 format from DRM_PRIME frame\n");
-+            return rv;
-+        }
-+
-+        ff_v4l2_context_release(output);
-+
-+        output->format = req_format;
-+
-+        if ((rv = ff_v4l2_context_set_format(output)) != 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to set V4L2 format\n");
-+            return rv;
-+        }
-+
-+        if (!fmt_eq(&req_format, &output->format)) {
-+            av_log(avctx, AV_LOG_ERROR, "Format mismatch after setup\n");
-+            return AVERROR(EINVAL);
-+        }
-+
-+        output->selection.top = frame->crop_top;
-+        output->selection.left = frame->crop_left;
-+        output->selection.width = av_frame_cropped_width(frame);
-+        output->selection.height = av_frame_cropped_height(frame);
-+
-+        if ((rv = ff_v4l2_context_init(output)) != 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to (re)init context\n");
-+            return rv;
-+        }
-+
-+        {
-+            struct v4l2_selection selection = {
-+                .type = V4L2_BUF_TYPE_VIDEO_OUTPUT,
-+                .target = V4L2_SEL_TGT_CROP,
-+                .r = output->selection
-+            };
-+            if (ioctl(s->fd, VIDIOC_S_SELECTION, &selection) != 0) {
-+                av_log(avctx, AV_LOG_WARNING, "S_SELECTION (CROP) %dx%d @ %d,%d failed: %s\n",
-+                       selection.r.width, selection.r.height, selection.r.left, selection.r.top,
-+                       av_err2str(AVERROR(errno)));
-+            }
-+            av_log(avctx, AV_LOG_TRACE, "S_SELECTION (CROP) %dx%d @ %d,%d OK\n",
-+                   selection.r.width, selection.r.height, selection.r.left, selection.r.top);
-+        }
-+    }
-+
- #ifdef V4L2_CID_MPEG_VIDEO_FORCE_KEY_FRAME
--    if (frame && frame->pict_type == AV_PICTURE_TYPE_I)
-+    if (frame->pict_type == AV_PICTURE_TYPE_I)
-         v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
- #endif
- 
-@@ -328,7 +528,70 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-     }
- 
- dequeue:
--    return ff_v4l2_context_dequeue_packet(capture, avpkt);
-+    if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
-+        return ret;
-+
-+    if (capture->first_buf == 1) {
-+        uint8_t * data;
-+        const int len = avpkt->size;
-+
-+        // 1st buffer after streamon should be SPS/PPS
-+        capture->first_buf = 2;
-+
-+        // Clear both possible stores so there is no chance of confusion
-+        av_freep(&s->extdata_data);
-+        s->extdata_size = 0;
-+        av_freep(&avctx->extradata);
-+        avctx->extradata_size = 0;
-+
-+        if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
-+            memcpy(data, avpkt->data, len);
-+
-+        av_packet_unref(avpkt);
-+
-+        if (data == NULL)
-+            return AVERROR(ENOMEM);
-+
-+        // We need to copy the header, but keep local if not global
-+        if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
-+            avctx->extradata = data;
-+            avctx->extradata_size = len;
-+        }
-+        else {
-+            s->extdata_data = data;
-+            s->extdata_size = len;
-+        }
-+
-+        if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
-+            return ret;
-+    }
-+
-+    // First frame must be key so mark as such even if encoder forgot
-+    if (capture->first_buf == 2)
-+        avpkt->flags |= AV_PKT_FLAG_KEY;
-+
-+    // Add SPS/PPS to the start of every key frame if non-global headers
-+    if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
-+        const size_t newlen = s->extdata_size + avpkt->size;
-+        AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
-+
-+        if (buf == NULL) {
-+            av_packet_unref(avpkt);
-+            return AVERROR(ENOMEM);
-+        }
-+
-+        memcpy(buf->data, s->extdata_data, s->extdata_size);
-+        memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
-+
-+        av_buffer_unref(&avpkt->buf);
-+        avpkt->buf = buf;
-+        avpkt->data = buf->data;
-+        avpkt->size = newlen;
-+    }
-+
-+//    av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
-+    capture->first_buf = 0;
-+    return 0;
- }
- 
- static av_cold int v4l2_encode_init(AVCodecContext *avctx)
-@@ -340,6 +603,8 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
-     uint32_t v4l2_fmt_output;
-     int ret;
- 
-+    av_log(avctx, AV_LOG_INFO, " <<< %s: fmt=%d/%d\n", __func__, avctx->pix_fmt, avctx->sw_pix_fmt);
-+
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-         return ret;
-@@ -347,13 +612,17 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
-     capture = &s->capture;
-     output  = &s->output;
- 
-+    s->input_drm = (avctx->pix_fmt == AV_PIX_FMT_DRM_PRIME);
-+
-     /* common settings output/capture */
-     output->height = capture->height = avctx->height;
-     output->width = capture->width = avctx->width;
- 
-     /* output context */
-     output->av_codec_id = AV_CODEC_ID_RAWVIDEO;
--    output->av_pix_fmt = avctx->pix_fmt;
-+    output->av_pix_fmt = !s->input_drm ? avctx->pix_fmt :
-+            avctx->sw_pix_fmt != AV_PIX_FMT_NONE ? avctx->sw_pix_fmt :
-+            AV_PIX_FMT_YUV420P;
- 
-     /* capture context */
-     capture->av_codec_id = avctx->codec_id;
-@@ -372,7 +641,7 @@ static av_cold int v4l2_encode_init(AVCodecContext *avctx)
-         v4l2_fmt_output = output->format.fmt.pix.pixelformat;
- 
-     pix_fmt_output = ff_v4l2_format_v4l2_to_avfmt(v4l2_fmt_output, AV_CODEC_ID_RAWVIDEO);
--    if (pix_fmt_output != avctx->pix_fmt) {
-+    if (!s->input_drm && pix_fmt_output != avctx->pix_fmt) {
-         const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt_output);
-         av_log(avctx, AV_LOG_ERROR, "Encoder requires %s pixel format.\n", desc->name);
-         return AVERROR(EINVAL);
-
-From 6b437ce70582c67971aa81871a6694a08b709784 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 8 Jun 2022 16:13:31 +0000
-Subject: [PATCH 055/136] v4l2_m2m_dec: Use DTS for best effort PTS if PTS is
- always NO_PTS
-
-If we do have DTS but don't have PTS then assume PTS=DTS.
-Also get rid of last_dts from tracking as its info wasn't actually
-useful in any way.
----
- libavcodec/v4l2_context.c | 6 ++----
- libavcodec/v4l2_m2m.h     | 1 -
- libavcodec/v4l2_m2m_dec.c | 8 +++++++-
- 3 files changed, 9 insertions(+), 6 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 7a707d21fc..6b97eab41e 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -73,7 +73,6 @@ xlat_pts_pkt_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVPack
-     track_pts = track_to_pts(avctx, x->track_no);
- 
-     av_log(avctx, AV_LOG_TRACE, "In pkt PTS=%" PRId64 ", DTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", avpkt->pts, avpkt->dts, track_pts, x->track_no);
--    x->last_pkt_dts = avpkt->dts;
-     x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-         .discard          = 0,
-         .pending          = 1,
-@@ -100,7 +99,6 @@ xlat_pts_frame_in(AVCodecContext *const avctx, xlat_track_t *const x, const AVFr
-     track_pts = track_to_pts(avctx, x->track_no);
- 
-     av_log(avctx, AV_LOG_TRACE, "In frame PTS=%" PRId64 ", track=%" PRId64 ", n=%u\n", frame->pts, track_pts, x->track_no);
--    x->last_pkt_dts = frame->pkt_dts;
-     x->track_els[x->track_no  % FF_V4L2_M2M_TRACK_SIZE] = (V4L2m2mTrackEl){
-         .discard          = 0,
-         .pending          = 1,
-@@ -129,7 +127,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx,
-         av_log(avctx, frame->pts == AV_NOPTS_VALUE ? AV_LOG_DEBUG : AV_LOG_WARNING,
-                "Frame tracking failure: pts=%" PRId64 ", track[%d]=%" PRId64 "\n", frame->pts, n, t->track_pts);
-         frame->pts              = AV_NOPTS_VALUE;
--        frame->pkt_dts          = x->last_pkt_dts;
-+        frame->pkt_dts          = AV_NOPTS_VALUE;
-         frame->reordered_opaque = x->last_opaque;
-         frame->pkt_pos          = -1;
-         frame->pkt_duration     = 0;
-@@ -138,7 +136,7 @@ xlat_pts_frame_out(AVCodecContext *const avctx,
-     else if (!t->discard)
-     {
-         frame->pts              = t->pending ? t->pts : AV_NOPTS_VALUE;
--        frame->pkt_dts          = x->last_pkt_dts;
-+        frame->pkt_dts          = t->dts;
-         frame->reordered_opaque = t->reordered_opaque;
-         frame->pkt_pos          = t->pkt_pos;
-         frame->pkt_duration     = t->pkt_duration;
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index d6cdaf65e1..ee72beb052 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -67,7 +67,6 @@ typedef struct pts_stats_s
- typedef struct xlat_track_s {
-     unsigned int track_no;
-     int64_t last_pts;
--    int64_t last_pkt_dts;
-     int64_t last_opaque;
-     V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
- } xlat_track_t;
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index fbbfc81342..485a96f4b4 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -177,7 +177,13 @@ set_best_effort_pts(AVCodecContext *const avctx,
-     pts_stats_add(ps, frame->pts);
- 
-     frame->best_effort_timestamp = pts_stats_guess(ps);
--    frame->pkt_dts               = frame->pts;  // We can't emulate what s/w does in a useful manner?
-+    // If we can't guess from just PTS - try DTS
-+    if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
-+        frame->best_effort_timestamp = frame->pkt_dts;
-+
-+    // We can't emulate what s/w does in a useful manner and using the
-+    // "correct" answer seems to just confuse things.
-+    frame->pkt_dts               = frame->pts;
-     av_log(avctx, AV_LOG_TRACE, "Out PTS=%" PRId64 "/%"PRId64", DTS=%" PRId64 "\n",
-            frame->pts, frame->best_effort_timestamp, frame->pkt_dts);
- }
-
-From ec8d1c2c0b6bd3544e5e30500a167fc31abde17a Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 30 Jun 2022 15:59:23 +0000
-Subject: [PATCH 056/136] v4l2: Update H265 request for current API
-
-This works with v9 of the H265 patch set which hopefully will be the
-last one. Hevc controls extracted from patched v4l2-controls into
-hevc-ctrls-v4 - if HEVC controls found in the system v4l2-controls then
-those will be used instead.
----
- libavcodec/Makefile            |   2 +-
- libavcodec/hevc-ctrls-v4.h     | 515 +++++++++++++++++++++++++++++++++
- libavcodec/v4l2_req_hevc_v4.c  |   3 +
- libavcodec/v4l2_req_hevc_vx.c  |  81 ++++--
- libavcodec/v4l2_request_hevc.c |   6 +-
- libavcodec/v4l2_request_hevc.h |   1 +
- 6 files changed, 583 insertions(+), 25 deletions(-)
- create mode 100644 libavcodec/hevc-ctrls-v4.h
- create mode 100644 libavcodec/v4l2_req_hevc_v4.c
-
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 2b3c16185d..d433a71236 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -1000,7 +1000,7 @@ OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
- OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
- OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec.o
- OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o\
--                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
-+                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o  v4l2_req_hevc_v4.o
- OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
-diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
-new file mode 100644
-index 0000000000..7e05f6e7c3
---- /dev/null
-+++ b/libavcodec/hevc-ctrls-v4.h
-@@ -0,0 +1,515 @@
-+/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
-+/*
-+ *  Video for Linux Two controls header file
-+ *
-+ *  Copyright (C) 1999-2012 the contributors
-+ *
-+ *  This program is free software; you can redistribute it and/or modify
-+ *  it under the terms of the GNU General Public License as published by
-+ *  the Free Software Foundation; either version 2 of the License, or
-+ *  (at your option) any later version.
-+ *
-+ *  This program is distributed in the hope that it will be useful,
-+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-+ *  GNU General Public License for more details.
-+ *
-+ *  Alternatively you can redistribute this file under the terms of the
-+ *  BSD license as stated below:
-+ *
-+ *  Redistribution and use in source and binary forms, with or without
-+ *  modification, are permitted provided that the following conditions
-+ *  are met:
-+ *  1. Redistributions of source code must retain the above copyright
-+ *     notice, this list of conditions and the following disclaimer.
-+ *  2. Redistributions in binary form must reproduce the above copyright
-+ *     notice, this list of conditions and the following disclaimer in
-+ *     the documentation and/or other materials provided with the
-+ *     distribution.
-+ *  3. The names of its contributors may not be used to endorse or promote
-+ *     products derived from this software without specific prior written
-+ *     permission.
-+ *
-+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-+ *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-+ *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-+ *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-+ *  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-+ *  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
-+ *  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-+ *  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-+ *  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-+ *  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+ *  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+ *
-+ *  The contents of this header was split off from videodev2.h. All control
-+ *  definitions should be added to this header, which is included by
-+ *  videodev2.h.
-+ */
-+
-+#ifndef AVCODEC_HEVC_CTRLS_V4_H
-+#define AVCODEC_HEVC_CTRLS_V4_H
-+
-+#include <linux/const.h>
-+#include <linux/types.h>
-+
-+#define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_STATELESS_BASE + 400)
-+#define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_STATELESS_BASE + 401)
-+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 402)
-+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX	(V4L2_CID_CODEC_STATELESS_BASE + 403)
-+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 404)
-+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE	(V4L2_CID_CODEC_STATELESS_BASE + 405)
-+#define V4L2_CID_STATELESS_HEVC_START_CODE	(V4L2_CID_CODEC_STATELESS_BASE + 406)
-+#define V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS (V4L2_CID_CODEC_STATELESS_BASE + 407)
-+
-+enum v4l2_stateless_hevc_decode_mode {
-+	V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED,
-+	V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED,
-+};
-+
-+enum v4l2_stateless_hevc_start_code {
-+	V4L2_STATELESS_HEVC_START_CODE_NONE,
-+	V4L2_STATELESS_HEVC_START_CODE_ANNEX_B,
-+};
-+
-+#define V4L2_HEVC_SLICE_TYPE_B	0
-+#define V4L2_HEVC_SLICE_TYPE_P	1
-+#define V4L2_HEVC_SLICE_TYPE_I	2
-+
-+#define V4L2_HEVC_SPS_FLAG_SEPARATE_COLOUR_PLANE		(1ULL << 0)
-+#define V4L2_HEVC_SPS_FLAG_SCALING_LIST_ENABLED			(1ULL << 1)
-+#define V4L2_HEVC_SPS_FLAG_AMP_ENABLED				(1ULL << 2)
-+#define V4L2_HEVC_SPS_FLAG_SAMPLE_ADAPTIVE_OFFSET		(1ULL << 3)
-+#define V4L2_HEVC_SPS_FLAG_PCM_ENABLED				(1ULL << 4)
-+#define V4L2_HEVC_SPS_FLAG_PCM_LOOP_FILTER_DISABLED		(1ULL << 5)
-+#define V4L2_HEVC_SPS_FLAG_LONG_TERM_REF_PICS_PRESENT		(1ULL << 6)
-+#define V4L2_HEVC_SPS_FLAG_SPS_TEMPORAL_MVP_ENABLED		(1ULL << 7)
-+#define V4L2_HEVC_SPS_FLAG_STRONG_INTRA_SMOOTHING_ENABLED	(1ULL << 8)
-+
-+/**
-+ * struct v4l2_ctrl_hevc_sps - ITU-T Rec. H.265: Sequence parameter set
-+ *
-+ * @video_parameter_set_id: specifies the value of the
-+ *			vps_video_parameter_set_id of the active VPS
-+ * @seq_parameter_set_id: provides an identifier for the SPS for
-+ *			  reference by other syntax elements
-+ * @pic_width_in_luma_samples:	specifies the width of each decoded picture
-+ *				in units of luma samples
-+ * @pic_height_in_luma_samples: specifies the height of each decoded picture
-+ *				in units of luma samples
-+ * @bit_depth_luma_minus8: this value plus 8specifies the bit depth of the
-+ *                         samples of the luma array
-+ * @bit_depth_chroma_minus8: this value plus 8 specifies the bit depth of the
-+ *                           samples of the chroma arrays
-+ * @log2_max_pic_order_cnt_lsb_minus4: this value plus 4 specifies the value of
-+ *                                     the variable MaxPicOrderCntLsb
-+ * @sps_max_dec_pic_buffering_minus1: this value plus 1 specifies the maximum
-+ *                                    required size of the decoded picture
-+ *                                    buffer for the codec video sequence
-+ * @sps_max_num_reorder_pics: indicates the maximum allowed number of pictures
-+ * @sps_max_latency_increase_plus1: not equal to 0 is used to compute the
-+ *				    value of SpsMaxLatencyPictures array
-+ * @log2_min_luma_coding_block_size_minus3: plus 3 specifies the minimum
-+ *					    luma coding block size
-+ * @log2_diff_max_min_luma_coding_block_size: specifies the difference between
-+ *					      the maximum and minimum luma
-+ *					      coding block size
-+ * @log2_min_luma_transform_block_size_minus2: plus 2 specifies the minimum luma
-+ *					       transform block size
-+ * @log2_diff_max_min_luma_transform_block_size: specifies the difference between
-+ *						 the maximum and minimum luma
-+ *						 transform block size
-+ * @max_transform_hierarchy_depth_inter: specifies the maximum hierarchy
-+ *					 depth for transform units of
-+ *					 coding units coded in inter
-+ *					 prediction mode
-+ * @max_transform_hierarchy_depth_intra: specifies the maximum hierarchy
-+ *					 depth for transform units of
-+ *					 coding units coded in intra
-+ *					 prediction mode
-+ * @pcm_sample_bit_depth_luma_minus1: this value plus 1 specifies the number of
-+ *                                    bits used to represent each of PCM sample
-+ *                                    values of the luma component
-+ * @pcm_sample_bit_depth_chroma_minus1: this value plus 1 specifies the number
-+ *                                      of bits used to represent each of PCM
-+ *                                      sample values of the chroma components
-+ * @log2_min_pcm_luma_coding_block_size_minus3: this value plus 3 specifies the
-+ *                                              minimum size of coding blocks
-+ * @log2_diff_max_min_pcm_luma_coding_block_size: specifies the difference between
-+ *						  the maximum and minimum size of
-+ *						  coding blocks
-+ * @num_short_term_ref_pic_sets: specifies the number of st_ref_pic_set()
-+ *				 syntax structures included in the SPS
-+ * @num_long_term_ref_pics_sps: specifies the number of candidate long-term
-+ *				reference pictures that are specified in the SPS
-+ * @chroma_format_idc: specifies the chroma sampling
-+ * @sps_max_sub_layers_minus1: this value plus 1 specifies the maximum number
-+ *                             of temporal sub-layers
-+ * @reserved: padding field. Should be zeroed by applications.
-+ * @flags: see V4L2_HEVC_SPS_FLAG_{}
-+ */
-+struct v4l2_ctrl_hevc_sps {
-+	__u8	video_parameter_set_id;
-+	__u8	seq_parameter_set_id;
-+	__u16	pic_width_in_luma_samples;
-+	__u16	pic_height_in_luma_samples;
-+	__u8	bit_depth_luma_minus8;
-+	__u8	bit_depth_chroma_minus8;
-+	__u8	log2_max_pic_order_cnt_lsb_minus4;
-+	__u8	sps_max_dec_pic_buffering_minus1;
-+	__u8	sps_max_num_reorder_pics;
-+	__u8	sps_max_latency_increase_plus1;
-+	__u8	log2_min_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_luma_coding_block_size;
-+	__u8	log2_min_luma_transform_block_size_minus2;
-+	__u8	log2_diff_max_min_luma_transform_block_size;
-+	__u8	max_transform_hierarchy_depth_inter;
-+	__u8	max_transform_hierarchy_depth_intra;
-+	__u8	pcm_sample_bit_depth_luma_minus1;
-+	__u8	pcm_sample_bit_depth_chroma_minus1;
-+	__u8	log2_min_pcm_luma_coding_block_size_minus3;
-+	__u8	log2_diff_max_min_pcm_luma_coding_block_size;
-+	__u8	num_short_term_ref_pic_sets;
-+	__u8	num_long_term_ref_pics_sps;
-+	__u8	chroma_format_idc;
-+	__u8	sps_max_sub_layers_minus1;
-+
-+	__u8	reserved[6];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_PPS_FLAG_DEPENDENT_SLICE_SEGMENT_ENABLED	(1ULL << 0)
-+#define V4L2_HEVC_PPS_FLAG_OUTPUT_FLAG_PRESENT			(1ULL << 1)
-+#define V4L2_HEVC_PPS_FLAG_SIGN_DATA_HIDING_ENABLED		(1ULL << 2)
-+#define V4L2_HEVC_PPS_FLAG_CABAC_INIT_PRESENT			(1ULL << 3)
-+#define V4L2_HEVC_PPS_FLAG_CONSTRAINED_INTRA_PRED		(1ULL << 4)
-+#define V4L2_HEVC_PPS_FLAG_TRANSFORM_SKIP_ENABLED		(1ULL << 5)
-+#define V4L2_HEVC_PPS_FLAG_CU_QP_DELTA_ENABLED			(1ULL << 6)
-+#define V4L2_HEVC_PPS_FLAG_PPS_SLICE_CHROMA_QP_OFFSETS_PRESENT	(1ULL << 7)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_PRED			(1ULL << 8)
-+#define V4L2_HEVC_PPS_FLAG_WEIGHTED_BIPRED			(1ULL << 9)
-+#define V4L2_HEVC_PPS_FLAG_TRANSQUANT_BYPASS_ENABLED		(1ULL << 10)
-+#define V4L2_HEVC_PPS_FLAG_TILES_ENABLED			(1ULL << 11)
-+#define V4L2_HEVC_PPS_FLAG_ENTROPY_CODING_SYNC_ENABLED		(1ULL << 12)
-+#define V4L2_HEVC_PPS_FLAG_LOOP_FILTER_ACROSS_TILES_ENABLED	(1ULL << 13)
-+#define V4L2_HEVC_PPS_FLAG_PPS_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 14)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_OVERRIDE_ENABLED	(1ULL << 15)
-+#define V4L2_HEVC_PPS_FLAG_PPS_DISABLE_DEBLOCKING_FILTER	(1ULL << 16)
-+#define V4L2_HEVC_PPS_FLAG_LISTS_MODIFICATION_PRESENT		(1ULL << 17)
-+#define V4L2_HEVC_PPS_FLAG_SLICE_SEGMENT_HEADER_EXTENSION_PRESENT (1ULL << 18)
-+#define V4L2_HEVC_PPS_FLAG_DEBLOCKING_FILTER_CONTROL_PRESENT	(1ULL << 19)
-+#define V4L2_HEVC_PPS_FLAG_UNIFORM_SPACING			(1ULL << 20)
-+
-+/**
-+ * struct v4l2_ctrl_hevc_pps - ITU-T Rec. H.265: Picture parameter set
-+ *
-+ * @pic_parameter_set_id: identifies the PPS for reference by other
-+ *			  syntax elements
-+ * @num_extra_slice_header_bits: specifies the number of extra slice header
-+ *				 bits that are present in the slice header RBSP
-+ *				 for coded pictures referring to the PPS.
-+ * @num_ref_idx_l0_default_active_minus1: this value plus 1 specifies the
-+ *                                        inferred value of num_ref_idx_l0_active_minus1
-+ * @num_ref_idx_l1_default_active_minus1: this value plus 1 specifies the
-+ *                                        inferred value of num_ref_idx_l1_active_minus1
-+ * @init_qp_minus26: this value plus 26 specifies the initial value of SliceQp Y for
-+ *		     each slice referring to the PPS
-+ * @diff_cu_qp_delta_depth: specifies the difference between the luma coding
-+ *			    tree block size and the minimum luma coding block
-+ *			    size of coding units that convey cu_qp_delta_abs
-+ *			    and cu_qp_delta_sign_flag
-+ * @pps_cb_qp_offset: specify the offsets to the luma quantization parameter Cb
-+ * @pps_cr_qp_offset: specify the offsets to the luma quantization parameter Cr
-+ * @num_tile_columns_minus1: this value plus 1 specifies the number of tile columns
-+ *			     partitioning the picture
-+ * @num_tile_rows_minus1: this value plus 1 specifies the number of tile rows partitioning
-+ *			  the picture
-+ * @column_width_minus1: this value plus 1 specifies the width of the each tile column in
-+ *			 units of coding tree blocks
-+ * @row_height_minus1: this value plus 1 specifies the height of the each tile row in
-+ *		       units of coding tree blocks
-+ * @pps_beta_offset_div2: specify the default deblocking parameter offsets for
-+ *			  beta divided by 2
-+ * @pps_tc_offset_div2: specify the default deblocking parameter offsets for tC
-+ *			divided by 2
-+ * @log2_parallel_merge_level_minus2: this value plus 2 specifies the value of
-+ *                                    the variable Log2ParMrgLevel
-+ * @reserved: padding field. Should be zeroed by applications.
-+ * @flags: see V4L2_HEVC_PPS_FLAG_{}
-+ */
-+struct v4l2_ctrl_hevc_pps {
-+	__u8	pic_parameter_set_id;
-+	__u8	num_extra_slice_header_bits;
-+	__u8	num_ref_idx_l0_default_active_minus1;
-+	__u8	num_ref_idx_l1_default_active_minus1;
-+	__s8	init_qp_minus26;
-+	__u8	diff_cu_qp_delta_depth;
-+	__s8	pps_cb_qp_offset;
-+	__s8	pps_cr_qp_offset;
-+	__u8	num_tile_columns_minus1;
-+	__u8	num_tile_rows_minus1;
-+	__u8	column_width_minus1[20];
-+	__u8	row_height_minus1[22];
-+	__s8	pps_beta_offset_div2;
-+	__s8	pps_tc_offset_div2;
-+	__u8	log2_parallel_merge_level_minus2;
-+	__u8	reserved;
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DPB_ENTRY_LONG_TERM_REFERENCE	0x01
-+
-+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME				0
-+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_FIELD			1
-+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_FIELD			2
-+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM			3
-+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP			4
-+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_BOTTOM_TOP			5
-+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM		6
-+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_DOUBLING			7
-+#define V4L2_HEVC_SEI_PIC_STRUCT_FRAME_TRIPLING			8
-+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_PREVIOUS_BOTTOM	9
-+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_PREVIOUS_TOP	10
-+#define V4L2_HEVC_SEI_PIC_STRUCT_TOP_PAIRED_NEXT_BOTTOM		11
-+#define V4L2_HEVC_SEI_PIC_STRUCT_BOTTOM_PAIRED_NEXT_TOP		12
-+
-+#define V4L2_HEVC_DPB_ENTRIES_NUM_MAX		16
-+
-+/**
-+ * struct v4l2_hevc_dpb_entry - HEVC decoded picture buffer entry
-+ *
-+ * @timestamp: timestamp of the V4L2 capture buffer to use as reference.
-+ * @flags: long term flag for the reference frame
-+ * @field_pic: whether the reference is a field picture or a frame.
-+ * @reserved: padding field. Should be zeroed by applications.
-+ * @pic_order_cnt_val: the picture order count of the current picture.
-+ */
-+struct v4l2_hevc_dpb_entry {
-+	__u64	timestamp;
-+	__u8	flags;
-+	__u8	field_pic;
-+	__u16	reserved;
-+	__s32	pic_order_cnt_val;
-+};
-+
-+/**
-+ * struct v4l2_hevc_pred_weight_table - HEVC weighted prediction parameters
-+ *
-+ * @delta_luma_weight_l0: the difference of the weighting factor applied
-+ *			  to the luma prediction value for list 0
-+ * @luma_offset_l0: the additive offset applied to the luma prediction value
-+ *		    for list 0
-+ * @delta_chroma_weight_l0: the difference of the weighting factor applied
-+ *			    to the chroma prediction values for list 0
-+ * @chroma_offset_l0: the difference of the additive offset applied to
-+ *		      the chroma prediction values for list 0
-+ * @delta_luma_weight_l1: the difference of the weighting factor applied
-+ *			  to the luma prediction value for list 1
-+ * @luma_offset_l1: the additive offset applied to the luma prediction value
-+ *		    for list 1
-+ * @delta_chroma_weight_l1: the difference of the weighting factor applied
-+ *			    to the chroma prediction values for list 1
-+ * @chroma_offset_l1: the difference of the additive offset applied to
-+ *		      the chroma prediction values for list 1
-+ * @luma_log2_weight_denom: the base 2 logarithm of the denominator for
-+ *			    all luma weighting factors
-+ * @delta_chroma_log2_weight_denom: the difference of the base 2 logarithm
-+ *				    of the denominator for all chroma
-+ *				    weighting factors
-+ */
-+struct v4l2_hevc_pred_weight_table {
-+	__s8	delta_luma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__s8	delta_luma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	luma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__s8	delta_chroma_weight_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+	__s8	chroma_offset_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX][2];
-+
-+	__u8	luma_log2_weight_denom;
-+	__s8	delta_chroma_log2_weight_denom;
-+};
-+
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_LUMA		(1ULL << 0)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_SAO_CHROMA		(1ULL << 1)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_TEMPORAL_MVP_ENABLED	(1ULL << 2)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_MVD_L1_ZERO			(1ULL << 3)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_CABAC_INIT			(1ULL << 4)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_COLLOCATED_FROM_L0		(1ULL << 5)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_USE_INTEGER_MV		(1ULL << 6)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_DEBLOCKING_FILTER_DISABLED (1ULL << 7)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_SLICE_LOOP_FILTER_ACROSS_SLICES_ENABLED (1ULL << 8)
-+#define V4L2_HEVC_SLICE_PARAMS_FLAG_DEPENDENT_SLICE_SEGMENT	(1ULL << 9)
-+
-+/**
-+ * struct v4l2_ctrl_hevc_slice_params - HEVC slice parameters
-+ *
-+ * This control is a dynamically sized 1-dimensional array,
-+ * V4L2_CTRL_FLAG_DYNAMIC_ARRAY flag must be set when using it.
-+ *
-+ * @bit_size: size (in bits) of the current slice data
-+ * @data_byte_offset: offset (in bytes) to the video data in the current slice data
-+ * @num_entry_point_offsets: specifies the number of entry point offset syntax
-+ *			     elements in the slice header.
-+ * @nal_unit_type: specifies the coding type of the slice (B, P or I)
-+ * @nuh_temporal_id_plus1: minus 1 specifies a temporal identifier for the NAL unit
-+ * @slice_type: see V4L2_HEVC_SLICE_TYPE_{}
-+ * @colour_plane_id: specifies the colour plane associated with the current slice
-+ * @slice_pic_order_cnt: specifies the picture order count
-+ * @num_ref_idx_l0_active_minus1: this value plus 1 specifies the maximum
-+ *                                reference index for reference picture list 0
-+ *                                that may be used to decode the slice
-+ * @num_ref_idx_l1_active_minus1: this value plus 1 specifies the maximum
-+ *                                reference index for reference picture list 1
-+ *                                that may be used to decode the slice
-+ * @collocated_ref_idx: specifies the reference index of the collocated picture used
-+ *			for temporal motion vector prediction
-+ * @five_minus_max_num_merge_cand: specifies the maximum number of merging
-+ *				   motion vector prediction candidates supported in
-+ *				   the slice subtracted from 5
-+ * @slice_qp_delta: specifies the initial value of QpY to be used for the coding
-+ *		    blocks in the slice
-+ * @slice_cb_qp_offset: specifies a difference to be added to the value of pps_cb_qp_offset
-+ * @slice_cr_qp_offset: specifies a difference to be added to the value of pps_cr_qp_offset
-+ * @slice_act_y_qp_offset: screen content extension parameters
-+ * @slice_act_cb_qp_offset: screen content extension parameters
-+ * @slice_act_cr_qp_offset: screen content extension parameters
-+ * @slice_beta_offset_div2: specify the deblocking parameter offsets for beta divided by 2
-+ * @slice_tc_offset_div2: specify the deblocking parameter offsets for tC divided by 2
-+ * @pic_struct: indicates whether a picture should be displayed as a frame or as one or
-+ *		more fields
-+ * @reserved0: padding field. Should be zeroed by applications.
-+ * @slice_segment_addr: specifies the address of the first coding tree block in
-+ *			the slice segment
-+ * @ref_idx_l0: the list of L0 reference elements as indices in the DPB
-+ * @ref_idx_l1: the list of L1 reference elements as indices in the DPB
-+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
-+ *				 pictures set included in the SPS
-+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
-+ *				pictures set include in the SPS
-+ * @pred_weight_table: the prediction weight coefficients for inter-picture
-+ *		       prediction
-+ * @reserved1: padding field. Should be zeroed by applications.
-+ * @flags: see V4L2_HEVC_SLICE_PARAMS_FLAG_{}
-+ */
-+struct v4l2_ctrl_hevc_slice_params {
-+	__u32	bit_size;
-+	__u32	data_byte_offset;
-+	__u32	num_entry_point_offsets;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: NAL unit header */
-+	__u8	nal_unit_type;
-+	__u8	nuh_temporal_id_plus1;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u8	slice_type;
-+	__u8	colour_plane_id;
-+	__s32	slice_pic_order_cnt;
-+	__u8	num_ref_idx_l0_active_minus1;
-+	__u8	num_ref_idx_l1_active_minus1;
-+	__u8	collocated_ref_idx;
-+	__u8	five_minus_max_num_merge_cand;
-+	__s8	slice_qp_delta;
-+	__s8	slice_cb_qp_offset;
-+	__s8	slice_cr_qp_offset;
-+	__s8	slice_act_y_qp_offset;
-+	__s8	slice_act_cb_qp_offset;
-+	__s8	slice_act_cr_qp_offset;
-+	__s8	slice_beta_offset_div2;
-+	__s8	slice_tc_offset_div2;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Picture timing SEI message */
-+	__u8	pic_struct;
-+
-+	__u8	reserved0[3];
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-+	__u32	slice_segment_addr;
-+	__u8	ref_idx_l0[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	ref_idx_l1[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u16	short_term_ref_pic_set_size;
-+	__u16	long_term_ref_pic_set_size;
-+
-+	/* ISO/IEC 23008-2, ITU-T Rec. H.265: Weighted prediction parameter */
-+	struct v4l2_hevc_pred_weight_table pred_weight_table;
-+
-+	__u8	reserved1[2];
-+	__u64	flags;
-+};
-+
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IRAP_PIC		0x1
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_IDR_PIC		0x2
-+#define V4L2_HEVC_DECODE_PARAM_FLAG_NO_OUTPUT_OF_PRIOR  0x4
-+
-+/**
-+ * struct v4l2_ctrl_hevc_decode_params - HEVC decode parameters
-+ *
-+ * @pic_order_cnt_val: picture order count
-+ * @short_term_ref_pic_set_size: specifies the size of short-term reference
-+ *				 pictures set included in the SPS of the first slice
-+ * @long_term_ref_pic_set_size: specifies the size of long-term reference
-+ *				pictures set include in the SPS of the first slice
-+ * @num_active_dpb_entries: the number of entries in dpb
-+ * @num_poc_st_curr_before: the number of reference pictures in the short-term
-+ *			    set that come before the current frame
-+ * @num_poc_st_curr_after: the number of reference pictures in the short-term
-+ *			   set that come after the current frame
-+ * @num_poc_lt_curr: the number of reference pictures in the long-term set
-+ * @poc_st_curr_before: provides the index of the short term before references
-+ *			in DPB array
-+ * @poc_st_curr_after: provides the index of the short term after references
-+ *		       in DPB array
-+ * @poc_lt_curr: provides the index of the long term references in DPB array
-+ * @reserved: padding field. Should be zeroed by applications.
-+ * @dpb: the decoded picture buffer, for meta-data about reference frames
-+ * @flags: see V4L2_HEVC_DECODE_PARAM_FLAG_{}
-+ */
-+struct v4l2_ctrl_hevc_decode_params {
-+	__s32	pic_order_cnt_val;
-+	__u16	short_term_ref_pic_set_size;
-+	__u16	long_term_ref_pic_set_size;
-+	__u8	num_active_dpb_entries;
-+	__u8	num_poc_st_curr_before;
-+	__u8	num_poc_st_curr_after;
-+	__u8	num_poc_lt_curr;
-+	__u8	poc_st_curr_before[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_st_curr_after[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	poc_lt_curr[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u8	reserved[4];
-+	struct	v4l2_hevc_dpb_entry dpb[V4L2_HEVC_DPB_ENTRIES_NUM_MAX];
-+	__u64	flags;
-+};
-+
-+/**
-+ * struct v4l2_ctrl_hevc_scaling_matrix - HEVC scaling lists parameters
-+ *
-+ * @scaling_list_4x4: scaling list is used for the scaling process for
-+ *		      transform coefficients. The values on each scaling
-+ *		      list are expected in raster scan order
-+ * @scaling_list_8x8: scaling list is used for the scaling process for
-+ *		      transform coefficients. The values on each scaling
-+ *		      list are expected in raster scan order
-+ * @scaling_list_16x16:	scaling list is used for the scaling process for
-+ *			transform coefficients. The values on each scaling
-+ *			list are expected in raster scan order
-+ * @scaling_list_32x32:	scaling list is used for the scaling process for
-+ *			transform coefficients. The values on each scaling
-+ *			list are expected in raster scan order
-+ * @scaling_list_dc_coef_16x16:	scaling list is used for the scaling process
-+ *				for transform coefficients. The values on each
-+ *				scaling list are expected in raster scan order.
-+ * @scaling_list_dc_coef_32x32:	scaling list is used for the scaling process
-+ *				for transform coefficients. The values on each
-+ *				scaling list are expected in raster scan order.
-+ */
-+struct v4l2_ctrl_hevc_scaling_matrix {
-+	__u8	scaling_list_4x4[6][16];
-+	__u8	scaling_list_8x8[6][64];
-+	__u8	scaling_list_16x16[6][64];
-+	__u8	scaling_list_32x32[2][64];
-+	__u8	scaling_list_dc_coef_16x16[6];
-+	__u8	scaling_list_dc_coef_32x32[2];
-+};
-+
-+#endif
-diff --git a/libavcodec/v4l2_req_hevc_v4.c b/libavcodec/v4l2_req_hevc_v4.c
-new file mode 100644
-index 0000000000..c35579d8e0
---- /dev/null
-+++ b/libavcodec/v4l2_req_hevc_v4.c
-@@ -0,0 +1,3 @@
-+#define HEVC_CTRLS_VERSION 4
-+#include "v4l2_req_hevc_vx.c"
-+
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index 611fa21cc3..761c5b2dc7 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -6,8 +6,6 @@
- #include "internal.h"
- #include "thread.h"
- 
--#include "v4l2_request_hevc.h"
--
- #if HEVC_CTRLS_VERSION == 1
- #include "hevc-ctrls-v1.h"
- 
-@@ -18,10 +16,37 @@
- #include "hevc-ctrls-v2.h"
- #elif HEVC_CTRLS_VERSION == 3
- #include "hevc-ctrls-v3.h"
-+#elif HEVC_CTRLS_VERSION == 4
-+#include <linux/v4l2-controls.h>
-+#if !defined(V4L2_CID_STATELESS_HEVC_SPS)
-+#include "hevc-ctrls-v4.h"
-+#endif
- #else
- #error Unknown HEVC_CTRLS_VERSION
- #endif
- 
-+#ifndef V4L2_CID_STATELESS_HEVC_SPS
-+#define V4L2_CID_STATELESS_HEVC_SPS                     V4L2_CID_MPEG_VIDEO_HEVC_SPS
-+#define V4L2_CID_STATELESS_HEVC_PPS                     V4L2_CID_MPEG_VIDEO_HEVC_PPS
-+#define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS            V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS
-+#define V4L2_CID_STATELESS_HEVC_SCALING_MATRIX          V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX
-+#define V4L2_CID_STATELESS_HEVC_DECODE_PARAMS           V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS
-+#define V4L2_CID_STATELESS_HEVC_DECODE_MODE             V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE
-+#define V4L2_CID_STATELESS_HEVC_START_CODE              V4L2_CID_MPEG_VIDEO_HEVC_START_CODE
-+
-+#define V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED     V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED
-+#define V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED     V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED
-+#define V4L2_STATELESS_HEVC_START_CODE_NONE             V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE
-+#define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B          V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
-+#endif
-+
-+// Should be in videodev2 but we might not have a good enough one
-+#ifndef V4L2_PIX_FMT_HEVC_SLICE
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+#endif
-+
-+#include "v4l2_request_hevc.h"
-+
- #include "libavutil/hwcontext_drm.h"
- 
- #include <semaphore.h>
-@@ -259,9 +284,13 @@ fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const
- #endif
-             entry->field_pic = frame->frame->interlaced_frame;
- 
-+#if HEVC_CTRLS_VERSION <= 3
-             /* TODO: Interleaved: Get the POC for each field. */
-             entry->pic_order_cnt[0] = frame->poc;
-             entry->pic_order_cnt[1] = frame->poc;
-+#else
-+            entry->pic_order_cnt_val = frame->poc;
-+#endif
-         }
-     }
-     return n;
-@@ -287,8 +316,11 @@ static void fill_slice_params(const HEVCContext * const h,
- 
-     *slice_params = (struct v4l2_ctrl_hevc_slice_params) {
-         .bit_size = bit_size,
-+#if HEVC_CTRLS_VERSION <= 3
-         .data_bit_offset = bit_offset,
--
-+#else
-+        .data_byte_offset = bit_offset / 8 + 1,
-+#endif
-         /* ISO/IEC 23008-2, ITU-T Rec. H.265: General slice segment header */
-         .slice_segment_addr = sh->slice_segment_addr,
- 
-@@ -376,8 +408,10 @@ static void fill_slice_params(const HEVCContext * const h,
-         av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
-     }
- 
-+#if HEVC_CTRLS_VERSION <= 3
-     for (i = 0; i < slice_params->num_entry_point_offsets; i++)
-         slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
-+#endif
- }
- 
- #if HEVC_CTRLS_VERSION >= 2
-@@ -761,30 +795,30 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
- 
-     struct v4l2_ext_control control[] = {
-         {
--            .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS,
-+            .id = V4L2_CID_STATELESS_HEVC_SPS,
-             .ptr = &controls->sps,
-             .size = sizeof(controls->sps),
-         },
-         {
--            .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS,
-+            .id = V4L2_CID_STATELESS_HEVC_PPS,
-             .ptr = &controls->pps,
-             .size = sizeof(controls->pps),
-         },
- #if HEVC_CTRLS_VERSION >= 2
-         {
--            .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS,
-+            .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS,
-             .ptr = dec,
-             .size = sizeof(*dec),
-         },
- #endif
-         {
--            .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS,
-+            .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
-             .ptr = slices + slice_no,
-             .size = sizeof(*slices) * slice_count,
-         },
-         // Optional
-         {
--            .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX,
-+            .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
-             .ptr = &controls->scaling_matrix,
-             .size = sizeof(controls->scaling_matrix),
-         },
-@@ -1000,12 +1034,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
- 
-     // Check for var slice array
-     struct v4l2_query_ext_ctrl qc[] = {
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS },
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SPS },
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_PPS },
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SCALING_MATRIX },
-+        { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
-+        { .id = V4L2_CID_STATELESS_HEVC_SPS },
-+        { .id = V4L2_CID_STATELESS_HEVC_PPS },
-+        { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
- #if HEVC_CTRLS_VERSION >= 2
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_PARAMS },
-+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_PARAMS },
- #endif
-     };
-     // Order & size must match!
-@@ -1042,12 +1076,13 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
- 
-     fill_sps(&ctrl_sps, sps);
- 
--    if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_MPEG_VIDEO_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
-+    if (mediabufs_set_ext_ctrl(ctx->mbufs, NULL, V4L2_CID_STATELESS_HEVC_SPS, &ctrl_sps, sizeof(ctrl_sps))) {
-         av_log(avctx, AV_LOG_ERROR, "Failed to set initial SPS\n");
-         return AVERROR(EINVAL);
-     }
- 
-     ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0;
-+    av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? "Has" : "No");
-     return 0;
- }
- 
-@@ -1058,29 +1093,29 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-     int ret;
- 
-     struct v4l2_query_ext_ctrl querys[] = {
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_SLICE_PARAMS, },
-+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
-+        { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
-+        { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
-     };
- 
-     struct v4l2_ext_control ctrls[] = {
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_DECODE_MODE, },
--        { .id = V4L2_CID_MPEG_VIDEO_HEVC_START_CODE, },
-+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
-+        { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
-     };
- 
-     mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
- 
-     ctx->decode_mode = querys[0].default_value;
- 
--    if (ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_SLICE_BASED &&
--        ctx->decode_mode != V4L2_MPEG_VIDEO_HEVC_DECODE_MODE_FRAME_BASED) {
-+    if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
-+        ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
-         av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
-         return AVERROR(EINVAL);
-     }
- 
-     ctx->start_code = querys[1].default_value;
--    if (ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_NONE &&
--        ctx->start_code != V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
-+    if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE &&
-+        ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
-         av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
-         return AVERROR(EINVAL);
-     }
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-index 20e4e0ab15..cd79aad563 100644
---- a/libavcodec/v4l2_request_hevc.c
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -210,7 +210,11 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-         goto fail4;
-     }
- 
--    if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
-+    if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
-+        av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
-+        ctx->fns = &V2(ff_v4l2_req_hevc, 4);
-+    }
-+    else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
-         av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
-         ctx->fns = &V2(ff_v4l2_req_hevc, 3);
-     }
-diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
-index ed48d62e2d..d4adb3f812 100644
---- a/libavcodec/v4l2_request_hevc.h
-+++ b/libavcodec/v4l2_request_hevc.h
-@@ -99,5 +99,6 @@ typedef struct v4l2_req_decode_fns {
- extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 1);
- extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 2);
- extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 3);
-+extern const v4l2_req_decode_fns V2(ff_v4l2_req_hevc, 4);
- 
- #endif
-
-From 21a348ae3282318fa96d3a6e2c70f3d4b90a7d52 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Sun, 3 Jul 2022 13:40:41 +0000
-Subject: [PATCH 057/136] v4l2_req: Observe limit on size of slice_array
-
-This in fact provides some minor simplifications by combing the
-multi-slice and single-slice paths.
-
-(cherry picked from commit 7631e6d1a66fca9048605c214f3464c90d37932c)
----
- libavcodec/v4l2_req_hevc_vx.c  | 39 ++++++++++++++--------------------
- libavcodec/v4l2_request_hevc.h |  5 +----
- 2 files changed, 17 insertions(+), 27 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index 761c5b2dc7..9d08d13d9e 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -840,18 +840,21 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
-     int bcount = get_bits_count(&h->HEVClc->gb);
-     uint32_t boff = (ptr_from_index(buffer, bcount/8 + 1) - (buffer + bcount/8 + 1)) * 8 + bcount;
- 
-+    const unsigned int n = rd->num_slices;
-+    const unsigned int block_start = (n / ctx->max_slices) * ctx->max_slices;
-+
-     int rv;
-     struct slice_info * si;
- 
-     if ((rv = slice_add(rd)) != 0)
-         return rv;
- 
--    si = rd->slices + rd->num_slices - 1;
-+    si = rd->slices + n;
-     si->ptr = buffer;
-     si->len = size;
- 
--    if (ctx->multi_slice && rd->num_slices > 1) {
--        struct slice_info *const si0 = rd->slices;
-+    if (n != block_start) {
-+        struct slice_info *const si0 = rd->slices + block_start;
-         const size_t offset = (buffer - si0->ptr);
-         boff += offset * 8;
-         size += offset;
-@@ -859,11 +862,11 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
-     }
- 
- #if HEVC_CTRLS_VERSION >= 2
--    if (rd->num_slices == 1)
-+    if (n == 0)
-         fill_decode_params(h, &rd->dec);
--    fill_slice_params(h, &rd->dec, rd->slice_params + rd->num_slices - 1, size * 8, boff);
-+    fill_slice_params(h, &rd->dec, rd->slice_params + n, size * 8, boff);
- #else
--    fill_slice_params(h, rd->slice_params + rd->num_slices - 1, size * 8, boff);
-+    fill_slice_params(h, rd->slice_params + n, size * 8, boff);
- #endif
- 
-     return 0;
-@@ -997,18 +1000,11 @@ static int v4l2_request_hevc_end_frame(AVCodecContext *avctx)
-     }
- 
-     // Send as slices
--    if (ctx->multi_slice)
--    {
--        if ((rv = send_slice(avctx, rd, &rc, 0, rd->num_slices)) != 0)
-+    for (i = 0; i < rd->num_slices; i += ctx->max_slices) {
-+        const unsigned int e = FFMIN(rd->num_slices, i + ctx->max_slices);
-+        if ((rv = send_slice(avctx, rd, &rc, i, e)) != 0)
-             goto fail;
-     }
--    else
--    {
--        for (i = 0; i != rd->num_slices; ++i) {
--            if ((rv = send_slice(avctx, rd, &rc, i, i + 1)) != 0)
--                goto fail;
--        }
--    }
- 
-     // Set the drm_prime desriptor
-     drm_from_format(&rd->drm, mediabufs_dst_fmt(ctx->mbufs));
-@@ -1081,8 +1077,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-         return AVERROR(EINVAL);
-     }
- 
--    ctx->multi_slice = (qc[0].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) != 0;
--    av_log(avctx, AV_LOG_INFO, "%s SPS muti-slice\n", ctx->multi_slice ? "Has" : "No");
-     return 0;
- }
- 
-@@ -1120,11 +1114,10 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-         return AVERROR(EINVAL);
-     }
- 
--    ctx->max_slices = querys[2].elems;
--    if (ctx->max_slices > MAX_SLICES) {
--        av_log(avctx, AV_LOG_ERROR, "%s: unsupported max slices, %d\n", __func__, ctx->max_slices);
--        return AVERROR(EINVAL);
--    }
-+    ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
-+                       querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
-+        1 : querys[2].dims[0];
-+    av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
- 
-     ctrls[0].value = ctx->decode_mode;
-     ctrls[1].value = ctx->start_code;
-diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
-index d4adb3f812..0029e23309 100644
---- a/libavcodec/v4l2_request_hevc.h
-+++ b/libavcodec/v4l2_request_hevc.h
-@@ -46,8 +46,6 @@
- #define V4L2_CTRL_FLAG_DYNAMIC_ARRAY	0x0800
- #endif
- 
--#define MAX_SLICES 128
--
- #define VCAT(name, version) name##_v##version
- #define V2(n,v) VCAT(n, v)
- #define V(n) V2(n, HEVC_CTRLS_VERSION)
-@@ -64,10 +62,9 @@ typedef struct V4L2RequestContextHEVC {
- 
-     unsigned int timestamp;  // ?? maybe uint64_t
- 
--    int multi_slice;
-     int decode_mode;
-     int start_code;
--    int max_slices;
-+    unsigned int max_slices;
- 
-     req_decode_q decode_q;
- 
-
-From 4f1d74cc8eea6a1bd6f2317a10c0ecf620315dec Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 4 Jul 2022 14:43:20 +0100
-Subject: [PATCH 058/136] v4l2_req: Add entry point offsets array control
-
----
- libavcodec/v4l2_req_hevc_vx.c  | 88 +++++++++++++++++++++++++++-------
- libavcodec/v4l2_request_hevc.h |  3 +-
- 2 files changed, 72 insertions(+), 19 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index 9d08d13d9e..43ef6631ed 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -82,11 +82,16 @@ typedef struct V4L2MediaReqDescriptor {
-     struct v4l2_ctrl_hevc_slice_params * slice_params;
-     struct slice_info * slices;
- 
-+    size_t num_offsets;
-+    size_t alloced_offsets;
-+    uint32_t *offsets;
-+
- } V4L2MediaReqDescriptor;
- 
- struct slice_info {
-     const uint8_t * ptr;
-     size_t len; // bytes
-+    size_t n_offsets;
- };
- 
- // Handy container for accumulating controls before setting
-@@ -245,7 +250,7 @@ static int slice_add(V4L2MediaReqDescriptor * const rd)
-     if (rd->num_slices >= rd->alloced_slices) {
-         struct v4l2_ctrl_hevc_slice_params * p2;
-         struct slice_info * s2;
--        size_t n2 = rd->num_slices == 0 ? 8 : rd->num_slices * 2;
-+        size_t n2 = rd->alloced_slices == 0 ? 8 : rd->alloced_slices * 2;
- 
-         p2 = av_realloc_array(rd->slice_params, n2, sizeof(*p2));
-         if (p2 == NULL)
-@@ -263,6 +268,23 @@ static int slice_add(V4L2MediaReqDescriptor * const rd)
-     return 0;
- }
- 
-+static int offsets_add(V4L2MediaReqDescriptor *const rd, const size_t n, const unsigned * const offsets)
-+{
-+    if (rd->num_offsets + n > rd->alloced_offsets) {
-+        size_t n2 = rd->alloced_slices == 0 ? 128 : rd->alloced_slices * 2;
-+        void * p2;
-+        while (rd->num_offsets + n > n2)
-+            n2 *= 2;
-+        if ((p2 = av_realloc_array(rd->offsets, n2, sizeof(*rd->offsets))) == NULL)
-+            return AVERROR(ENOMEM);
-+        rd->offsets = p2;
-+        rd->alloced_offsets = n2;
-+    }
-+    for (size_t i = 0; i != n; ++i)
-+        rd->offsets[rd->num_offsets++] = offsets[i] - 1;
-+    return 0;
-+}
-+
- static unsigned int
- fill_dpb_entries(const HEVCContext * const h, struct v4l2_hevc_dpb_entry * const entries)
- {
-@@ -403,12 +425,12 @@ static void fill_slice_params(const HEVCContext * const h,
-     fill_pred_table(h, &slice_params->pred_weight_table);
- 
-     slice_params->num_entry_point_offsets = sh->num_entry_point_offsets;
-+#if HEVC_CTRLS_VERSION <= 3
-     if (slice_params->num_entry_point_offsets > 256) {
-         slice_params->num_entry_point_offsets = 256;
-         av_log(NULL, AV_LOG_ERROR, "%s: Currently only 256 entry points are supported, but slice has %d entry points.\n", __func__, sh->num_entry_point_offsets);
-     }
- 
--#if HEVC_CTRLS_VERSION <= 3
-     for (i = 0; i < slice_params->num_entry_point_offsets; i++)
-         slice_params->entry_point_offset_minus1[i] = sh->entry_point_offset[i] - 1;
- #endif
-@@ -787,13 +809,17 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
- #if HEVC_CTRLS_VERSION >= 2
-     struct v4l2_ctrl_hevc_decode_params * const dec,
- #endif
--    struct v4l2_ctrl_hevc_slice_params * const slices,
--    const unsigned int slice_no,
--    const unsigned int slice_count)
-+    struct v4l2_ctrl_hevc_slice_params * const slices, const unsigned int slice_count,
-+    void * const offsets, const size_t offset_count)
- {
-     int rv;
-+#if HEVC_CTRLS_VERSION >= 2
-+    unsigned int n = 4;
-+#else
-+    unsigned int n = 3;
-+#endif
- 
--    struct v4l2_ext_control control[] = {
-+    struct v4l2_ext_control control[6] = {
-         {
-             .id = V4L2_CID_STATELESS_HEVC_SPS,
-             .ptr = &controls->sps,
-@@ -813,21 +839,28 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
- #endif
-         {
-             .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
--            .ptr = slices + slice_no,
-+            .ptr = slices,
-             .size = sizeof(*slices) * slice_count,
-         },
--        // Optional
--        {
-+    };
-+
-+    if (controls->has_scaling)
-+        control[n++] = (struct v4l2_ext_control) {
-             .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX,
-             .ptr = &controls->scaling_matrix,
-             .size = sizeof(controls->scaling_matrix),
--        },
--    };
-+        };
-+
-+#if HEVC_CTRLS_VERSION >= 4
-+    if (offsets)
-+        control[n++] = (struct v4l2_ext_control) {
-+            .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS,
-+            .ptr = offsets,
-+            .size = sizeof(((struct V4L2MediaReqDescriptor *)0)->offsets[0]) * offset_count,
-+        };
-+#endif
- 
--    rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control,
--            controls->has_scaling ?
--                FF_ARRAY_ELEMS(control) :
--                FF_ARRAY_ELEMS(control) - 1);
-+    rv = mediabufs_ctl_set_ext_ctrls(ctx->mbufs, mreq, control, n);
- 
-     return rv;
- }
-@@ -852,6 +885,7 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
-     si = rd->slices + n;
-     si->ptr = buffer;
-     si->len = size;
-+    si->n_offsets = rd->num_offsets;
- 
-     if (n != block_start) {
-         struct slice_info *const si0 = rd->slices + block_start;
-@@ -868,6 +902,9 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
- #else
-     fill_slice_params(h, rd->slice_params + n, size * 8, boff);
- #endif
-+    if (ctx->max_offsets != 0 &&
-+        (rv = offsets_add(rd, h->sh.num_entry_point_offsets, h->sh.entry_point_offset)) != 0)
-+        return rv;
- 
-     return 0;
- }
-@@ -893,10 +930,13 @@ static int send_slice(AVCodecContext * const avctx,
- {
-     V4L2RequestContextHEVC * const ctx = avctx->internal->hwaccel_priv_data;
- 
-+    const int is_last = (j == rd->num_slices);
-     struct slice_info *const si = rd->slices + i;
-     struct media_request * req = NULL;
-     struct qent_src * src = NULL;
-     MediaBufsStatus stat;
-+    void * offsets = rd->offsets + rd->slices[i].n_offsets;
-+    size_t n_offsets = (is_last ? rd->num_offsets : rd->slices[j].n_offsets) - rd->slices[i].n_offsets;
- 
-     if ((req = media_request_get(ctx->mpool)) == NULL) {
-         av_log(avctx, AV_LOG_ERROR, "%s: Failed to alloc media request\n", __func__);
-@@ -908,8 +948,8 @@ static int send_slice(AVCodecContext * const avctx,
- #if HEVC_CTRLS_VERSION >= 2
-                      &rd->dec,
- #endif
--                     rd->slice_params,
--                     i, j - i)) {
-+                     rd->slice_params + i, j - i,
-+                     offsets, n_offsets)) {
-         av_log(avctx, AV_LOG_ERROR, "%s: Failed to set req ctls\n", __func__);
-         goto fail1;
-     }
-@@ -935,7 +975,7 @@ static int send_slice(AVCodecContext * const avctx,
- 
-     stat = mediabufs_start_request(ctx->mbufs, &req, &src,
-                                    i == 0 ? rd->qe_dst : NULL,
--                                   j == rd->num_slices);
-+                                   is_last);
- 
-     if (stat != MEDIABUFS_STATUS_SUCCESS) {
-         av_log(avctx, AV_LOG_ERROR, "%s: Failed to start request\n", __func__);
-@@ -1090,6 +1130,9 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-         { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
-         { .id = V4L2_CID_STATELESS_HEVC_START_CODE, },
-         { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS, },
-+#if HEVC_CTRLS_VERSION >= 4
-+        { .id = V4L2_CID_STATELESS_HEVC_ENTRY_POINT_OFFSETS, },
-+#endif
-     };
- 
-     struct v4l2_ext_control ctrls[] = {
-@@ -1119,6 +1162,14 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-         1 : querys[2].dims[0];
-     av_log(avctx, AV_LOG_DEBUG, "%s: Max slices %d\n", __func__, ctx->max_slices);
- 
-+#if HEVC_CTRLS_VERSION >= 4
-+    ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
-+        0 : querys[3].dims[0];
-+    av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
-+#else
-+    ctx->max_offsets = 0;
-+#endif
-+
-     ctrls[0].value = ctx->decode_mode;
-     ctrls[1].value = ctx->start_code;
- 
-@@ -1141,6 +1192,7 @@ static void v4l2_req_frame_free(void *opaque, uint8_t *data)
- 
-     av_freep(&rd->slices);
-     av_freep(&rd->slice_params);
-+    av_freep(&rd->offsets);
- 
-     av_free(rd);
- }
-diff --git a/libavcodec/v4l2_request_hevc.h b/libavcodec/v4l2_request_hevc.h
-index 0029e23309..99c90064ea 100644
---- a/libavcodec/v4l2_request_hevc.h
-+++ b/libavcodec/v4l2_request_hevc.h
-@@ -64,7 +64,8 @@ typedef struct V4L2RequestContextHEVC {
- 
-     int decode_mode;
-     int start_code;
--    unsigned int max_slices;
-+    unsigned int max_slices;    // 0 => not wanted (frame mode)
-+    unsigned int max_offsets;   // 0 => not wanted
- 
-     req_decode_q decode_q;
- 
-
-From d0e5ed2dff1b8f8909ceb968cb3afe2b20093fda Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 4 Jul 2022 16:22:54 +0100
-Subject: [PATCH 059/136] v4l2_req: Support Annex B
-
----
- libavcodec/v4l2_req_hevc_vx.c | 61 +++++++++++++++++++++++------------
- 1 file changed, 41 insertions(+), 20 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index 43ef6631ed..5e0db9850a 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -879,6 +879,18 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
-     int rv;
-     struct slice_info * si;
- 
-+    // This looks dodgy but we know that FFmpeg has parsed this from a buffer
-+    // that contains the entire frame including the start code
-+    if (ctx->start_code == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
-+        buffer -= 3;
-+        size += 3;
-+        boff += 24;
-+        if (buffer[0] != 0 || buffer[1] != 0 || buffer[2] != 1) {
-+            av_log(avctx, AV_LOG_ERROR, "Start code requested but missing %02x:%02x:%02x\n",
-+                   buffer[0], buffer[1], buffer[2]);
-+        }
-+    }
-+
-     if ((rv = slice_add(rd)) != 0)
-         return rv;
- 
-@@ -969,10 +981,6 @@ static int send_slice(AVCodecContext * const avctx,
-         goto fail2;
-     }
- 
--#warning ANNEX_B start code
--//        if (ctx->start_code == V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B) {
--//        }
--
-     stat = mediabufs_start_request(ctx->mbufs, &req, &src,
-                                    i == 0 ? rd->qe_dst : NULL,
-                                    is_last);
-@@ -1120,6 +1128,12 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-     return 0;
- }
- 
-+static inline int
-+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
-+{
-+    return v >= c->minimum && v <= c->maximum;
-+}
-+
- // Final init
- static int
- set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-@@ -1142,21 +1156,6 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
- 
-     mediabufs_ctl_query_ext_ctrls(ctx->mbufs, querys, FF_ARRAY_ELEMS(querys));
- 
--    ctx->decode_mode = querys[0].default_value;
--
--    if (ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
--        ctx->decode_mode != V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
--        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode, %d\n", __func__, ctx->decode_mode);
--        return AVERROR(EINVAL);
--    }
--
--    ctx->start_code = querys[1].default_value;
--    if (ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_NONE &&
--        ctx->start_code != V4L2_STATELESS_HEVC_START_CODE_ANNEX_B) {
--        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code, %d\n", __func__, ctx->start_code);
--        return AVERROR(EINVAL);
--    }
--
-     ctx->max_slices = (!(querys[2].flags & V4L2_CTRL_FLAG_DYNAMIC_ARRAY) ||
-                        querys[2].nr_of_dims != 1 || querys[2].dims[0] == 0) ?
-         1 : querys[2].dims[0];
-@@ -1165,11 +1164,33 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
- #if HEVC_CTRLS_VERSION >= 4
-     ctx->max_offsets = (querys[3].type == 0 || querys[3].nr_of_dims != 1) ?
-         0 : querys[3].dims[0];
--    av_log(avctx, AV_LOG_INFO, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
-+    av_log(avctx, AV_LOG_DEBUG, "%s: Entry point offsets %d\n", __func__, ctx->max_offsets);
- #else
-     ctx->max_offsets = 0;
- #endif
- 
-+    ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
-+
-+    if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
-+    {
-+        ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
-+
-+        // Prefer NONE as it doesn't require the slightly dodgy look
-+        // backwards in our raw buffer
-+        if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
-+            ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
-+        else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
-+            ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
-+        else {
-+            av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
-+            return AVERROR(EINVAL);
-+        }
-+    }
-+    else
-+    {
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
-+    }
-+
-     ctrls[0].value = ctx->decode_mode;
-     ctrls[1].value = ctx->start_code;
- 
-
-From a75506e18a964c9f50efa224a3fa4179c9ef2127 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 4 Jul 2022 18:24:03 +0100
-Subject: [PATCH 060/136] v4l2_req: Add frame mode decode
-
----
- libavcodec/v4l2_req_hevc_vx.c | 69 +++++++++++++++++++++++------------
- 1 file changed, 46 insertions(+), 23 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index 5e0db9850a..ada53d0d44 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -814,9 +814,9 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
- {
-     int rv;
- #if HEVC_CTRLS_VERSION >= 2
--    unsigned int n = 4;
--#else
-     unsigned int n = 3;
-+#else
-+    unsigned int n = 2;
- #endif
- 
-     struct v4l2_ext_control control[6] = {
-@@ -837,12 +837,14 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
-             .size = sizeof(*dec),
-         },
- #endif
--        {
-+    };
-+
-+    if (slices)
-+        control[n++] = (struct v4l2_ext_control) {
-             .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS,
-             .ptr = slices,
-             .size = sizeof(*slices) * slice_count,
--        },
--    };
-+        };
- 
-     if (controls->has_scaling)
-         control[n++] = (struct v4l2_ext_control) {
-@@ -865,6 +867,8 @@ set_req_ctls(V4L2RequestContextHEVC *ctx, struct media_request * const mreq,
-     return rv;
- }
- 
-+// This only works because we started out from a single coded frame buffer
-+// that will remain intact until after end_frame
- static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *buffer, uint32_t size)
- {
-     const HEVCContext * const h = avctx->priv_data;
-@@ -891,6 +895,17 @@ static int v4l2_request_hevc_decode_slice(AVCodecContext *avctx, const uint8_t *
-         }
-     }
- 
-+    if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED) {
-+        if (rd->slices == NULL) {
-+            if ((rd->slices = av_mallocz(sizeof(*rd->slices))) == NULL)
-+                return AVERROR(ENOMEM);
-+            rd->slices->ptr = buffer;
-+            rd->num_slices = 1;
-+        }
-+        rd->slices->len = buffer - rd->slices->ptr + size;
-+        return 0;
-+    }
-+
-     if ((rv = slice_add(rd)) != 0)
-         return rv;
- 
-@@ -1169,28 +1184,36 @@ set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-     ctx->max_offsets = 0;
- #endif
- 
--    ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
--
--    if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
--    {
-+    if (querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED ||
-+        querys[0].default_value == V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED)
-+        ctx->decode_mode = querys[0].default_value;
-+    else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED))
-+        ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_FRAME_BASED;
-+    else if (ctrl_valid(querys + 0, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
-         ctx->decode_mode = V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED;
--
--        // Prefer NONE as it doesn't require the slightly dodgy look
--        // backwards in our raw buffer
--        if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
--            ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
--        else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
--            ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
--        else {
--            av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
--            return AVERROR(EINVAL);
--        }
--    }
--    else
--    {
-+    else {
-         av_log(avctx, AV_LOG_ERROR, "%s: unsupported decode mode\n", __func__);
-+        return AVERROR(EINVAL);
-     }
- 
-+    if (querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_NONE ||
-+        querys[1].default_value == V4L2_STATELESS_HEVC_START_CODE_ANNEX_B)
-+        ctx->start_code = querys[1].default_value;
-+    else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_ANNEX_B))
-+        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_ANNEX_B;
-+    else if (ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
-+        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
-+    else {
-+        av_log(avctx, AV_LOG_ERROR, "%s: unsupported start code\n", __func__);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    // If we are in slice mode & START_CODE_NONE supported then pick that
-+    // as it doesn't require the slightly dodgy look backwards in our raw buffer
-+    if (ctx->decode_mode == V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED &&
-+        ctrl_valid(querys + 1, V4L2_STATELESS_HEVC_START_CODE_NONE))
-+        ctx->start_code = V4L2_STATELESS_HEVC_START_CODE_NONE;
-+
-     ctrls[0].value = ctx->decode_mode;
-     ctrls[1].value = ctx->start_code;
- 
-
-From 9cf01f1485dcf71bcad7981d45029425d9abf115 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 5 Jul 2022 12:54:22 +0000
-Subject: [PATCH 061/136] v4l2_req: Fix probe for frame based decode
-
----
- libavcodec/v4l2_req_hevc_vx.c | 33 +++++++++++++++++++++++----------
- 1 file changed, 23 insertions(+), 10 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index ada53d0d44..5d083016f8 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -1082,6 +1082,12 @@ fail:
-     return rv;
- }
- 
-+static inline int
-+ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
-+{
-+    return v >= c->minimum && v <= c->maximum;
-+}
-+
- // Initial check & init
- static int
- probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-@@ -1094,6 +1100,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-     // Check for var slice array
-     struct v4l2_query_ext_ctrl qc[] = {
-         { .id = V4L2_CID_STATELESS_HEVC_SLICE_PARAMS },
-+        { .id = V4L2_CID_STATELESS_HEVC_DECODE_MODE, },
-         { .id = V4L2_CID_STATELESS_HEVC_SPS },
-         { .id = V4L2_CID_STATELESS_HEVC_PPS },
-         { .id = V4L2_CID_STATELESS_HEVC_SCALING_MATRIX },
-@@ -1104,6 +1111,7 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-     // Order & size must match!
-     static const size_t ctrl_sizes[] = {
-         sizeof(struct v4l2_ctrl_hevc_slice_params),
-+        sizeof(int32_t),
-         sizeof(struct v4l2_ctrl_hevc_sps),
-         sizeof(struct v4l2_ctrl_hevc_pps),
-         sizeof(struct v4l2_ctrl_hevc_scaling_matrix),
-@@ -1121,11 +1129,22 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-         return AVERROR(EINVAL);
- #endif
- 
--    if (mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls)) {
--        av_log(avctx, AV_LOG_DEBUG, "Probed V%d control missing\n", HEVC_CTRLS_VERSION);
-+    mediabufs_ctl_query_ext_ctrls(ctx->mbufs, qc, noof_ctrls);
-+    i = 0;
-+#if HEVC_CTRLS_VERSION >= 4
-+    // Skip slice check if no slice mode
-+    if (qc[1].type != 0 && !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
-+        i = 1;
-+#else
-+    // Fail frame mode silently for anything prior to V4
-+    if (qc[1].type == 0 || !ctrl_valid(qc + 1, V4L2_STATELESS_HEVC_DECODE_MODE_SLICE_BASED))
-         return AVERROR(EINVAL);
--    }
--    for (i = 0; i != noof_ctrls; ++i) {
-+#endif
-+    for (; i != noof_ctrls; ++i) {
-+        if (qc[i].type == 0) {
-+            av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %#x missing\n", HEVC_CTRLS_VERSION, qc[i].id);
-+            return AVERROR(EINVAL);
-+        }
-         if (ctrl_sizes[i] != (size_t)qc[i].elem_size) {
-             av_log(avctx, AV_LOG_DEBUG, "Probed V%d control %d size mismatch %zu != %zu\n",
-                    HEVC_CTRLS_VERSION, i, ctrl_sizes[i], (size_t)qc[i].elem_size);
-@@ -1143,12 +1162,6 @@ probe(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-     return 0;
- }
- 
--static inline int
--ctrl_valid(const struct v4l2_query_ext_ctrl * const c, const int64_t v)
--{
--    return v >= c->minimum && v <= c->maximum;
--}
--
- // Final init
- static int
- set_controls(AVCodecContext * const avctx, V4L2RequestContextHEVC * const ctx)
-
-From e7a62226f26073149d35c89268f56e17c8f45d76 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 26 Jul 2022 15:46:14 +0000
-Subject: [PATCH 062/136] vf_deinterlace_v4l2m2m: Support NV12 through
- deinterlace
-
-Supports NV12 (though not yet NV12M) through deinterlace.
-Also improves error handling such that attempting to deinterlace an
-unsupported drm format causes an error.
-No longer leaks frame structures.
----
- libavfilter/vf_deinterlace_v4l2m2m.c | 160 ++++++++++++++++++---------
- 1 file changed, 107 insertions(+), 53 deletions(-)
-
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index 1a933b7e0a..1a3bef5bcb 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -373,14 +373,16 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue)
- 		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
- 
-     if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
--        if (fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 ||
-+        if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 &&
-+             fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) ||
-             fmt->fmt.pix_mp.field != field) {
-             av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
- 
-             return AVERROR(EINVAL);
-         }
-     } else {
--        if (fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 ||
-+        if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 &&
-+             fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) ||
-             fmt->fmt.pix.field != field) {
-             av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
- 
-@@ -391,7 +393,7 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue)
-     return 0;
- }
- 
--static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width, int height, int pitch, int ysize)
-+static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize)
- {
-     struct v4l2_format *fmt        = &queue->format;
-     DeintV4L2M2MContextShared *ctx = queue->ctx;
-@@ -402,13 +404,16 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width,
-         .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
-     };
- 
-+    // This works for most single object 4:2:0 types
-     if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.pixelformat = pixelformat;
-         fmt->fmt.pix_mp.field = field;
-         fmt->fmt.pix_mp.width = width;
-         fmt->fmt.pix_mp.height = ysize / pitch;
-         fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
-         fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
-     } else {
-+        fmt->fmt.pix.pixelformat = pixelformat;
-         fmt->fmt.pix.field = field;
-         fmt->fmt.pix.width = width;
-         fmt->fmt.pix.height = height;
-@@ -417,12 +422,22 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width,
-     }
- 
-     ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
--    if (ret)
-+    if (ret) {
-+        ret = AVERROR(errno);
-         av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
-+        return ret;
-+    }
-+
-+    if (pixelformat != fmt->fmt.pix.pixelformat) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat));
-+        return AVERROR(EINVAL);
-+    }
- 
-     ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
--    if (ret)
--        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_SELECTION failed: %d\n", ret);
-+    if (ret) {
-+        ret = AVERROR(errno);
-+        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret);
-+    }
- 
-     sel.r.width = width;
-     sel.r.height = height;
-@@ -432,10 +447,12 @@ static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t field, int width,
-     sel.flags = V4L2_SEL_FLAG_LE;
- 
-     ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
--    if (ret)
--        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_SELECTION failed: %d\n", ret);
-+    if (ret) {
-+        ret = AVERROR(errno);
-+        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret);
-+    }
- 
--    return ret;
-+    return 0;
- }
- 
- static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
-@@ -517,10 +534,25 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
-     return 0;
- }
- 
--static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
-+static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
- {
-     struct v4l2_exportbuffer expbuf;
-     int i, ret;
-+    uint64_t mod = DRM_FORMAT_MOD_LINEAR;
-+    uint32_t fmt = 0;
-+
-+    switch (pixelformat) {
-+    case V4L2_PIX_FMT_NV12:
-+        fmt = DRM_FORMAT_NV12;
-+        break;
-+    case V4L2_PIX_FMT_YUV420:
-+        fmt = DRM_FORMAT_YUV420;
-+        break;
-+    default:
-+        return AVERROR(EINVAL);
-+    }
-+
-+    avbuf->drm_frame.layers[0].format = fmt;
- 
-     for (i = 0; i < avbuf->num_planes; i++) {
-         memset(&expbuf, 0, sizeof(expbuf));
-@@ -539,12 +571,12 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
-             /* drm frame */
-             avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
-             avbuf->drm_frame.objects[i].fd = expbuf.fd;
--            avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+            avbuf->drm_frame.objects[i].format_modifier = mod;
-         } else {
-             /* drm frame */
-             avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
-             avbuf->drm_frame.objects[0].fd = expbuf.fd;
--            avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+            avbuf->drm_frame.objects[0].format_modifier = mod;
-         }
-     }
- 
-@@ -629,7 +661,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-             if (ret)
-                 goto fail;
- 
--            ret = v4l2_buffer_export_drm(buf);
-+            ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat);
-             if (ret)
-                 goto fail;
-         }
-@@ -878,7 +910,6 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused)
- 
- static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
- {
--    int av_pix_fmt = AV_PIX_FMT_YUV420P;
-     AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
-     AVDRMLayerDescriptor *layer;
- 
-@@ -895,20 +926,13 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
-         layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
-     }
- 
--    switch (av_pix_fmt) {
--    case AV_PIX_FMT_YUYV422:
--
--        layer->format = DRM_FORMAT_YUYV;
-+    switch (layer->format) {
-+    case DRM_FORMAT_YUYV:
-         layer->nb_planes = 1;
--
-         break;
- 
--    case AV_PIX_FMT_NV12:
--    case AV_PIX_FMT_NV21:
--
--        layer->format = av_pix_fmt == AV_PIX_FMT_NV12 ?
--            DRM_FORMAT_NV12 : DRM_FORMAT_NV21;
--
-+    case DRM_FORMAT_NV12:
-+    case DRM_FORMAT_NV21:
-         if (avbuf->num_planes > 1)
-             break;
- 
-@@ -920,10 +944,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
-         layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
-         break;
- 
--    case AV_PIX_FMT_YUV420P:
--
--        layer->format = DRM_FORMAT_YUV420;
--
-+    case DRM_FORMAT_YUV420:
-         if (avbuf->num_planes > 1)
-             break;
- 
-@@ -1032,6 +1053,26 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
-     return 0;
- }
- 
-+static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
-+{
-+    const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR ||
-+            drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID);
-+
-+    switch (drm_desc->layers[0].format) {
-+    case DRM_FORMAT_YUV420:
-+        if (is_linear)
-+            return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0;
-+        break;
-+    case DRM_FORMAT_NV12:
-+        if (is_linear)
-+            return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0;
-+        break;
-+    default:
-+        break;
-+    }
-+    return 0;
-+}
-+
- static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
- {
-     AVFilterContext *avctx         = link->dst;
-@@ -1047,23 +1088,27 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-            avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
- 
-     if (ctx->field_order == V4L2_FIELD_ANY) {
--        AVDRMFrameDescriptor *drm_desc = (AVDRMFrameDescriptor *)in->data[0];
-+        const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
-+        const uint32_t pixelformat = desc_pixelformat(drm_desc);
-+
-+        if (pixelformat == 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
-+                   av_fourcc2str(drm_desc->layers[0].format),
-+                   drm_desc->nb_objects, drm_desc->objects[0].format_modifier);
-+            return AVERROR(EINVAL);
-+        }
-+
-         ctx->orig_width = drm_desc->layers[0].planes[0].pitch;
-         ctx->orig_height = drm_desc->layers[0].planes[1].offset / ctx->orig_width;
- 
-         av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
-            drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
- 
--        if (in->top_field_first)
--            ctx->field_order = V4L2_FIELD_INTERLACED_TB;
--        else
--            ctx->field_order = V4L2_FIELD_INTERLACED_BT;
--
--        ret = deint_v4l2m2m_set_format(output, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-+        ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-         if (ret)
-             return ret;
- 
--        ret = deint_v4l2m2m_set_format(capture, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-+        ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
-         if (ret)
-             return ret;
- 
-@@ -1082,6 +1127,12 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-         ret = deint_v4l2m2m_streamon(output);
-         if (ret)
-             return ret;
-+
-+        if (in->top_field_first)
-+            ctx->field_order = V4L2_FIELD_INTERLACED_TB;
-+        else
-+            ctx->field_order = V4L2_FIELD_INTERLACED_BT;
-+
-     }
- 
-     ret = deint_v4l2m2m_enqueue_frame(output, in);
-@@ -1157,28 +1208,31 @@ again:
-         return 0;
-     }
- 
--    {
-+    recycle_q(&s->output);
-+    n = count_enqueued(&s->output);
-+
-+    while (n < 6) {
-         AVFrame * frame;
-         int rv;
- 
--        recycle_q(&s->output);
--        n = count_enqueued(&s->output);
-+        if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
-+            av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
-+            return rv;
-+        }
- 
--        while (n < 6) {
--            if ((rv = ff_inlink_consume_frame(inlink, &frame)) < 0) {
--                av_log(priv, AV_LOG_ERROR, "%s: consume in failed: %s\n", __func__, av_err2str(rv));
--                return rv;
--            }
-+        if (frame == NULL) {
-+            av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
-+            break;
-+        }
- 
--            if (frame == NULL) {
--                av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
--                break;
--            }
-+        rv = deint_v4l2m2m_filter_frame(inlink, frame);
-+        av_frame_free(&frame);
- 
--            deint_v4l2m2m_filter_frame(inlink, frame);
--            av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
--            ++n;
--        }
-+        if (rv != 0)
-+            return rv;
-+
-+        av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
-+        ++n;
-     }
- 
-     if (n < 6) {
-
-From 3d07826bcf588ad0384d00b210415664aa4489fb Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 19 Aug 2022 15:29:11 +0000
-Subject: [PATCH 063/136] v4l2_req: Enable use of MMAP for buffer alloc
-
-Use MMAP rather than DMABUF if either the dmabuf device can't be opened
-or create_buf doesn't set the capability.
----
- libavcodec/v4l2_req_dmabufs.c  |  22 +++
- libavcodec/v4l2_req_dmabufs.h  |   3 +
- libavcodec/v4l2_req_media.c    | 263 ++++++++++++++++++++++++++++-----
- libavcodec/v4l2_req_media.h    |  21 ++-
- libavcodec/v4l2_request_hevc.c |  42 +++++-
- 5 files changed, 307 insertions(+), 44 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
-index ae6c648369..c4bbed18c6 100644
---- a/libavcodec/v4l2_req_dmabufs.c
-+++ b/libavcodec/v4l2_req_dmabufs.c
-@@ -36,6 +36,26 @@ static unsigned int total_bufs = 0;
- static size_t total_size = 0;
- #endif
- 
-+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size)
-+{
-+    struct dmabuf_h *dh;
-+
-+    if (mapptr == MAP_FAILED)
-+        return NULL;
-+
-+    dh = malloc(sizeof(*dh));
-+    if (!dh)
-+        return NULL;
-+
-+    *dh = (struct dmabuf_h) {
-+        .fd = -1,
-+        .size = size,
-+        .mapptr = mapptr
-+    };
-+
-+    return dh;
-+}
-+
- struct dmabuf_h * dmabuf_import(int fd, size_t size)
- {
-     struct dmabuf_h *dh;
-@@ -122,6 +142,8 @@ int dmabuf_sync(struct dmabuf_h * const dh, unsigned int flags)
-     struct dma_buf_sync sync = {
-         .flags = flags
-     };
-+    if (dh->fd == -1)
-+        return 0;
-     while (ioctl(dh->fd, DMA_BUF_IOCTL_SYNC, &sync) == -1) {
-         const int err = errno;
-         if (errno == EINTR)
-diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
-index cfb17e801d..c1d3d8c8d7 100644
---- a/libavcodec/v4l2_req_dmabufs.h
-+++ b/libavcodec/v4l2_req_dmabufs.h
-@@ -18,6 +18,9 @@ static inline struct dmabuf_h * dmabuf_alloc(struct dmabufs_ctl * dbsc, size_t s
- }
- /* Create from existing fd - dups(fd) */
- struct dmabuf_h * dmabuf_import(int fd, size_t size);
-+/* Import an MMAP - return NULL if mapptr = MAP_FAIL */
-+struct dmabuf_h * dmabuf_import_mmap(void * mapptr, size_t size);
-+
- void * dmabuf_map(struct dmabuf_h * const dh);
- 
- /* flags from linux/dmabuf.h DMA_BUF_SYNC_xxx */
-diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
-index 980b306b8a..910ac77bb6 100644
---- a/libavcodec/v4l2_req_media.c
-+++ b/libavcodec/v4l2_req_media.c
-@@ -33,9 +33,11 @@
- #include <string.h>
- #include <unistd.h>
- #include <linux/media.h>
-+#include <linux/mman.h>
- #include <sys/ioctl.h>
- #include <sys/select.h>
- #include <sys/ioctl.h>
-+#include <sys/mman.h>
- 
- #include <linux/videodev2.h>
- 
-@@ -95,6 +97,32 @@ struct media_request {
-     struct polltask * pt;
- };
- 
-+static inline enum v4l2_memory
-+mediabufs_memory_to_v4l2(const enum mediabufs_memory m)
-+{
-+    return (enum v4l2_memory)m;
-+}
-+
-+const char *
-+mediabufs_memory_name(const enum mediabufs_memory m)
-+{
-+    switch (m) {
-+    case MEDIABUFS_MEMORY_UNSET:
-+        return "Unset";
-+    case MEDIABUFS_MEMORY_MMAP:
-+        return "MMap";
-+    case MEDIABUFS_MEMORY_USERPTR:
-+        return "UserPtr";
-+    case MEDIABUFS_MEMORY_OVERLAY:
-+        return "Overlay";
-+    case MEDIABUFS_MEMORY_DMABUF:
-+        return "DMABuf";
-+    default:
-+        break;
-+    }
-+    return "Unknown";
-+}
-+
- 
- static inline int do_trywait(sem_t *const sem)
- {
-@@ -115,14 +143,14 @@ static inline int do_wait(sem_t *const sem)
- }
- 
- static int request_buffers(int video_fd, unsigned int type,
--                           enum v4l2_memory memory, unsigned int buffers_count)
-+                           enum mediabufs_memory memory, unsigned int buffers_count)
- {
-     struct v4l2_requestbuffers buffers;
-     int rc;
- 
-     memset(&buffers, 0, sizeof(buffers));
-     buffers.type = type;
--    buffers.memory = memory;
-+    buffers.memory = mediabufs_memory_to_v4l2(memory);
-     buffers.count = buffers_count;
- 
-     rc = ioctl(video_fd, VIDIOC_REQBUFS, &buffers);
-@@ -324,6 +352,7 @@ struct qent_base {
-     struct qent_base *next;
-     struct qent_base *prev;
-     enum qent_status status;
-+    enum mediabufs_memory memtype;
-     uint32_t index;
-     struct dmabuf_h *dh[VIDEO_MAX_PLANES];
-     struct timeval timestamp;
-@@ -348,9 +377,9 @@ struct qe_list_head {
- };
- 
- struct buf_pool {
-+    enum mediabufs_memory memtype;
-     pthread_mutex_t lock;
-     sem_t free_sem;
--    enum v4l2_buf_type buf_type;
-     struct qe_list_head free;
-     struct qe_list_head inuse;
- };
-@@ -367,9 +396,10 @@ static inline struct qent_src *base_to_src(struct qent_base *be)
- }
- 
- 
--#define QENT_BASE_INITIALIZER {\
-+#define QENT_BASE_INITIALIZER(mtype) {\
-     .ref_count = ATOMIC_VAR_INIT(0),\
-     .status = QENT_NEW,\
-+    .memtype = (mtype),\
-     .index  = INDEX_UNSET\
- }
- 
-@@ -390,13 +420,13 @@ static void qe_src_free(struct qent_src *const be_src)
-     free(be_src);
- }
- 
--static struct qent_src * qe_src_new(void)
-+static struct qent_src * qe_src_new(enum mediabufs_memory mtype)
- {
-     struct qent_src *const be_src = malloc(sizeof(*be_src));
-     if (!be_src)
-         return NULL;
-     *be_src = (struct qent_src){
--        .base = QENT_BASE_INITIALIZER
-+        .base = QENT_BASE_INITIALIZER(mtype)
-     };
-     return be_src;
- }
-@@ -413,13 +443,13 @@ static void qe_dst_free(struct qent_dst *const be_dst)
-     free(be_dst);
- }
- 
--static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl)
-+static struct qent_dst* qe_dst_new(struct ff_weak_link_master * const wl, const enum mediabufs_memory memtype)
- {
-     struct qent_dst *const be_dst = malloc(sizeof(*be_dst));
-     if (!be_dst)
-         return NULL;
-     *be_dst = (struct qent_dst){
--        .base = QENT_BASE_INITIALIZER,
-+        .base = QENT_BASE_INITIALIZER(memtype),
-         .lock = PTHREAD_MUTEX_INITIALIZER,
-         .cond = PTHREAD_COND_INITIALIZER,
-         .mbc_wl = ff_weak_link_ref(wl)
-@@ -553,14 +583,14 @@ static struct qent_base *queue_tryget_free(struct buf_pool *const bp)
-     return buf;
- }
- 
--static struct qent_base * queue_find_extract_fd(struct buf_pool *const bp, const int fd)
-+static struct qent_base * queue_find_extract_index(struct buf_pool *const bp, const unsigned int index)
- {
-     struct qent_base *be;
- 
-     pthread_mutex_lock(&bp->lock);
-     /* Expect 1st in Q, but allow anywhere */
-     for (be = bp->inuse.head; be; be = be->next) {
--        if (dmabuf_fd(be->dh[0]) == fd) {
-+        if (be->index == index) {
-             bq_extract_inuse(bp, be);
-             break;
-         }
-@@ -602,6 +632,8 @@ struct mediabufs_ctl {
-     struct pollqueue * pq;
-     struct ff_weak_link_master * this_wlm;
- 
-+    enum mediabufs_memory src_memtype;
-+    enum mediabufs_memory dst_memtype;
-     struct v4l2_format src_fmt;
-     struct v4l2_format dst_fmt;
-     struct v4l2_capability capability;
-@@ -614,7 +646,7 @@ static int qe_v4l2_queue(struct qent_base *const be,
- {
-     struct v4l2_buffer buffer = {
-         .type = fmt->type,
--        .memory = V4L2_MEMORY_DMABUF,
-+        .memory = mediabufs_memory_to_v4l2(be->memtype),
-         .index = be->index
-     };
-     struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-@@ -628,7 +660,10 @@ static int qe_v4l2_queue(struct qent_base *const be,
-             /* *** Really need a pixdesc rather than a format so we can fill in data_offset */
-             planes[i].length = dmabuf_size(be->dh[i]);
-             planes[i].bytesused = dmabuf_len(be->dh[i]);
--            planes[i].m.fd = dmabuf_fd(be->dh[i]);
-+            if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
-+                planes[i].m.fd = dmabuf_fd(be->dh[i]);
-+            else
-+                planes[i].m.mem_offset = 0;
-         }
-         buffer.m.planes = planes;
-         buffer.length = i;
-@@ -639,7 +674,10 @@ static int qe_v4l2_queue(struct qent_base *const be,
- 
-         buffer.bytesused = dmabuf_len(be->dh[0]);
-         buffer.length = dmabuf_size(be->dh[0]);
--        buffer.m.fd = dmabuf_fd(be->dh[0]);
-+        if (be->memtype == MEDIABUFS_MEMORY_DMABUF)
-+            buffer.m.fd = dmabuf_fd(be->dh[0]);
-+        else
-+            buffer.m.offset = 0;
-     }
- 
-     if (!is_dst && mreq) {
-@@ -668,14 +706,13 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp,
-                      const int vfd,
-                      const struct v4l2_format * const f)
- {
--    int fd;
-     struct qent_base *be;
-     int rc;
-     const bool mp = V4L2_TYPE_IS_MULTIPLANAR(f->type);
-     struct v4l2_plane planes[VIDEO_MAX_PLANES] = {{0}};
-     struct v4l2_buffer buffer = {
-         .type =  f->type,
--        .memory = V4L2_MEMORY_DMABUF
-+        .memory = mediabufs_memory_to_v4l2(bp->memtype)
-     };
-     if (mp) {
-         buffer.length = f->fmt.pix_mp.num_planes;
-@@ -690,10 +727,9 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp,
-         return NULL;
-     }
- 
--    fd = mp ? planes[0].m.fd : buffer.m.fd;
--    be = queue_find_extract_fd(bp, fd);
-+    be = queue_find_extract_index(bp, buffer.index);
-     if (!be) {
--        request_log("Failed to find fd %d in Q\n", fd);
-+        request_log("Failed to find index %d in Q\n", buffer.index);
-         return NULL;
-     }
- 
-@@ -1104,7 +1140,7 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru
- 
-     struct v4l2_create_buffers cbuf = {
-         .count = n,
--        .memory = V4L2_MEMORY_DMABUF,
-+        .memory = mediabufs_memory_to_v4l2(mbc->dst->memtype),
-         .format = mbc->dst_fmt,
-     };
- 
-@@ -1125,12 +1161,97 @@ static int create_dst_bufs(struct mediabufs_ctl *const mbc, unsigned int n, stru
-     return cbuf.count;
- }
- 
-+static MediaBufsStatus
-+qe_import_from_buf(struct mediabufs_ctl *const mbc, struct qent_base * const be, const struct v4l2_format *const fmt,
-+                   const unsigned int n, const bool x_dmabuf)
-+{
-+    struct v4l2_buffer buf = {
-+        .index = n,
-+        .type = fmt->type,
-+    };
-+    struct v4l2_plane planes[VIDEO_MAX_PLANES];
-+    int ret;
-+
-+    if (be->dh[0])
-+        return 0;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        memset(planes, 0, sizeof(planes));
-+        buf.m.planes = planes;
-+        buf.length = VIDEO_MAX_PLANES;
-+    }
-+
-+    if ((ret = ioctl(mbc->vfd, VIDIOC_QUERYBUF, &buf)) != 0) {
-+        request_err(mbc->dc, "VIDIOC_QUERYBUF failed");
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+    }
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type))
-+    {
-+        unsigned int i;
-+        for (i = 0; i != buf.length; ++i) {
-+            if (x_dmabuf) {
-+                struct v4l2_exportbuffer xbuf = {
-+                    .type = buf.type,
-+                    .index = buf.index,
-+                    .plane = i,
-+                    .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
-+                };
-+                if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
-+                    be->dh[i] = dmabuf_import(xbuf.fd, planes[i].length);
-+            }
-+            else {
-+                be->dh[i] = dmabuf_import_mmap(
-+                    mmap(NULL, planes[i].length,
-+                        PROT_READ | PROT_WRITE,
-+                        MAP_SHARED | MAP_POPULATE,
-+                        mbc->vfd, planes[i].m.mem_offset),
-+                    planes[i].length);
-+            }
-+            /* On failure tidy up and die */
-+            if (!be->dh[i]) {
-+                while (i--) {
-+                    dmabuf_free(be->dh[i]);
-+                    be->dh[i] = NULL;
-+                }
-+                return MEDIABUFS_ERROR_OPERATION_FAILED;
-+            }
-+        }
-+    }
-+    else
-+    {
-+        if (x_dmabuf) {
-+            struct v4l2_exportbuffer xbuf = {
-+                .type = buf.type,
-+                .index = buf.index,
-+                .flags = O_RDWR, // *** Arguably O_RDONLY would be fine
-+            };
-+            if (ioctl(mbc->vfd, VIDIOC_EXPBUF, &xbuf) == 0)
-+                be->dh[0] = dmabuf_import(xbuf.fd, buf.length);
-+        }
-+        else {
-+            be->dh[0] = dmabuf_import_mmap(
-+                mmap(NULL, buf.length,
-+                    PROT_READ | PROT_WRITE,
-+                    MAP_SHARED | MAP_POPULATE,
-+                    mbc->vfd, buf.m.offset),
-+                buf.length);
-+        }
-+        /* On failure tidy up and die */
-+        if (!be->dh[0]) {
-+            return MEDIABUFS_ERROR_OPERATION_FAILED;
-+        }
-+    }
-+
-+    return 0;
-+}
-+
- struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struct dmabufs_ctl *const dbsc)
- {
-     struct qent_dst * be_dst;
- 
-     if (mbc == NULL) {
--        be_dst = qe_dst_new(NULL);
-+        be_dst = qe_dst_new(NULL, MEDIABUFS_MEMORY_DMABUF);
-         if (be_dst)
-             be_dst->base.status = QENT_IMPORT;
-         return be_dst;
-@@ -1144,7 +1265,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc
-     else {
-         be_dst = base_to_dst(queue_tryget_free(mbc->dst));
-         if (!be_dst) {
--            be_dst = qe_dst_new(mbc->this_wlm);
-+            be_dst = qe_dst_new(mbc->this_wlm, mbc->dst->memtype);
-             if (!be_dst)
-                 return NULL;
- 
-@@ -1155,12 +1276,21 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc, struc
-         }
-     }
- 
--    if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
--        /* Given  how create buf works we can't uncreate it on alloc failure
--         * all we can do is put it on the free Q
--        */
--        queue_put_free(mbc->dst, &be_dst->base);
--        return NULL;
-+    if (mbc->dst->memtype == MEDIABUFS_MEMORY_MMAP) {
-+        if (qe_import_from_buf(mbc, &be_dst->base, &mbc->dst_fmt, be_dst->base.index, true)) {
-+            request_err(mbc->dc, "Failed to export as dmabuf\n");
-+            queue_put_free(mbc->dst, &be_dst->base);
-+            return NULL;
-+        }
-+    }
-+    else {
-+        if (qe_alloc_from_fmt(&be_dst->base, dbsc, &mbc->dst_fmt)) {
-+            /* Given  how create buf works we can't uncreate it on alloc failure
-+             * all we can do is put it on the free Q
-+            */
-+            queue_put_free(mbc->dst, &be_dst->base);
-+            return NULL;
-+        }
-     }
- 
-     be_dst->base.status = QENT_PENDING;
-@@ -1208,7 +1338,7 @@ MediaBufsStatus mediabufs_dst_fmt_set(struct mediabufs_ctl *const mbc,
- 
- // ** This is a mess if we get partial alloc but without any way to remove
- //    individual V4L2 Q members we are somewhat stuffed
--MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed)
-+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype)
- {
-     unsigned int i;
-     int a = 0;
-@@ -1218,10 +1348,12 @@ MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, cons
-     if (n > 32)
-         return MEDIABUFS_ERROR_ALLOCATION_FAILED;
- 
-+    mbc->dst->memtype = memtype;
-+
-     // Create qents first as it is hard to get rid of the V4L2 buffers on error
-     for (qc = 0; qc != n; ++qc)
-     {
--        if ((qes[qc] = qe_dst_new(mbc->this_wlm)) == NULL)
-+        if ((qes[qc] = qe_dst_new(mbc->this_wlm, mbc->dst->memtype)) == NULL)
-             goto fail;
-     }
- 
-@@ -1260,19 +1392,61 @@ void mediabufs_src_qent_abort(struct mediabufs_ctl *const mbc, struct qent_src *
-     queue_put_free(mbc->src, &qe_src->base);
- }
- 
-+static MediaBufsStatus
-+chk_memory_type(struct mediabufs_ctl *const mbc,
-+    const struct v4l2_format * const f,
-+    const enum mediabufs_memory m)
-+{
-+    struct v4l2_create_buffers cbuf = {
-+        .count = 0,
-+        .memory = V4L2_MEMORY_MMAP,
-+        .format = *f
-+    };
-+
-+    if (ioctl(mbc->vfd, VIDIOC_CREATE_BUFS, &cbuf) != 0)
-+        return MEDIABUFS_ERROR_OPERATION_FAILED;
-+
-+    switch (m) {
-+    case MEDIABUFS_MEMORY_DMABUF:
-+        // 0 = Unknown but assume not in that case
-+        if ((cbuf.capabilities & V4L2_BUF_CAP_SUPPORTS_DMABUF) == 0)
-+            return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
-+        break;
-+    case MEDIABUFS_MEMORY_MMAP:
-+        break;
-+    default:
-+        return MEDIABUFS_ERROR_UNSUPPORTED_MEMORY;
-+    }
-+
-+    return MEDIABUFS_STATUS_SUCCESS;
-+}
-+
-+MediaBufsStatus
-+mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
-+{
-+    return chk_memory_type(mbc, &mbc->src_fmt, memtype);
-+}
-+
-+MediaBufsStatus
-+mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype)
-+{
-+    return chk_memory_type(mbc, &mbc->dst_fmt, memtype);
-+}
-+
- /* src format must have been set up before this */
- MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
-                   struct dmabufs_ctl * const dbsc,
--                  unsigned int n)
-+                  unsigned int n, const enum mediabufs_memory memtype)
- {
-     unsigned int i;
-     struct v4l2_requestbuffers req = {
-         .count = n,
-         .type = mbc->src_fmt.type,
--        .memory = V4L2_MEMORY_DMABUF
-+        .memory = mediabufs_memory_to_v4l2(memtype)
-     };
- 
-     bq_free_all_free_src(mbc->src);
-+
-     while (ioctl(mbc->vfd, VIDIOC_REQBUFS, &req) == -1) {
-         if (errno != EINTR) {
-             request_err(mbc->dc, "%s: Failed to request src bufs\n", __func__);
-@@ -1286,21 +1460,36 @@ MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const mbc,
-     }
- 
-     for (i = 0; i != n; ++i) {
--        struct qent_src *const be_src = qe_src_new();
-+        struct qent_src *const be_src = qe_src_new(memtype);
-         if (!be_src) {
-             request_err(mbc->dc, "Failed to create src be %d\n", i);
-             goto fail;
-         }
--        if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
--            qe_src_free(be_src);
-+        switch (memtype) {
-+        case MEDIABUFS_MEMORY_MMAP:
-+            if (qe_import_from_buf(mbc, &be_src->base, &mbc->src_fmt, i, false)) {
-+                qe_src_free(be_src);
-+                goto fail;
-+            }
-+            be_src->fixed_size = 1;
-+            break;
-+        case MEDIABUFS_MEMORY_DMABUF:
-+            if (qe_alloc_from_fmt(&be_src->base, dbsc, &mbc->src_fmt)) {
-+                qe_src_free(be_src);
-+                goto fail;
-+            }
-+            be_src->fixed_size = !mediabufs_src_resizable(mbc);
-+            break;
-+        default:
-+            request_err(mbc->dc, "Unexpected memorty type\n");
-             goto fail;
-         }
-         be_src->base.index = i;
--        be_src->fixed_size = !mediabufs_src_resizable(mbc);
- 
-         queue_put_free(mbc->src, &be_src->base);
-     }
- 
-+    mbc->src->memtype = memtype;
-     return MEDIABUFS_STATUS_SUCCESS;
- 
- fail:
-@@ -1437,9 +1626,13 @@ int mediabufs_ctl_query_ext_ctrls(struct mediabufs_ctl * mbc, struct v4l2_query_
- 
- int mediabufs_src_resizable(const struct mediabufs_ctl *const mbc)
- {
-+#if 1
-+    return 0;
-+#else
-     // Single planar OUTPUT can only take exact size buffers
-     // Multiplanar will take larger than negotiated
-     return V4L2_TYPE_IS_MULTIPLANAR(mbc->src_fmt.type);
-+#endif
- }
- 
- static void mediabufs_ctl_delete(struct mediabufs_ctl *const mbc)
-diff --git a/libavcodec/v4l2_req_media.h b/libavcodec/v4l2_req_media.h
-index 0307a831de..890947b2e2 100644
---- a/libavcodec/v4l2_req_media.h
-+++ b/libavcodec/v4l2_req_media.h
-@@ -43,6 +43,7 @@ typedef enum media_buf_status {
-     MEDIABUFS_ERROR_UNSUPPORTED_BUFFERTYPE,
-     MEDIABUFS_ERROR_UNSUPPORTED_RT_FORMAT,
-     MEDIABUFS_ERROR_ALLOCATION_FAILED,
-+    MEDIABUFS_ERROR_UNSUPPORTED_MEMORY,
- } MediaBufsStatus;
- 
- struct media_pool * media_pool_new(const char * const media_path,
-@@ -70,6 +71,15 @@ struct qent_dst;
- struct dmabuf_h;
- struct dmabufs_ctl;
- 
-+// 1-1 mammping to V4L2 type - just defined separetely to avoid some include versioning difficulties
-+enum mediabufs_memory {
-+   MEDIABUFS_MEMORY_UNSET            = 0,
-+   MEDIABUFS_MEMORY_MMAP             = 1,
-+   MEDIABUFS_MEMORY_USERPTR          = 2,
-+   MEDIABUFS_MEMORY_OVERLAY          = 3,
-+   MEDIABUFS_MEMORY_DMABUF           = 4,
-+};
-+
- int qent_src_params_set(struct qent_src *const be, const struct timeval * timestamp);
- struct timeval qent_dst_timestamp_get(const struct qent_dst *const be_dst);
- 
-@@ -93,6 +103,8 @@ MediaBufsStatus qent_dst_import_fd(struct qent_dst *const be_dst,
-                 unsigned int plane,
-                 int fd, size_t size);
- 
-+const char * mediabufs_memory_name(const enum mediabufs_memory m);
-+
- MediaBufsStatus mediabufs_start_request(struct mediabufs_ctl *const mbc,
-                 struct media_request **const pmreq,
-                 struct qent_src **const psrc_be,
-@@ -106,7 +118,7 @@ struct qent_dst* mediabufs_dst_qent_alloc(struct mediabufs_ctl *const mbc,
- // Create dst slots without alloc
- // If fixed true then qent_alloc will only get slots from this pool and will
- // block until a qent has been unrefed
--MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed);
-+MediaBufsStatus mediabufs_dst_slots_create(struct mediabufs_ctl *const mbc, const unsigned int n, const bool fixed, const enum mediabufs_memory memtype);
- 
- MediaBufsStatus mediabufs_stream_on(struct mediabufs_ctl *const mbc);
- MediaBufsStatus mediabufs_stream_off(struct mediabufs_ctl *const mbc);
-@@ -140,7 +152,12 @@ MediaBufsStatus mediabufs_src_fmt_set(struct mediabufs_ctl *const mbc,
- 
- MediaBufsStatus mediabufs_src_pool_create(struct mediabufs_ctl *const rw,
-                   struct dmabufs_ctl * const dbsc,
--                  unsigned int n);
-+                  unsigned int n,
-+                  const enum mediabufs_memory memtype);
-+
-+// Want to have appropriate formats set first
-+MediaBufsStatus mediabufs_src_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
-+MediaBufsStatus mediabufs_dst_chk_memtype(struct mediabufs_ctl *const mbc, const enum mediabufs_memory memtype);
- 
- #define MEDIABUFS_DRIVER_VERSION(a, b, c) (((a) << 16) | ((b) << 8) | (c))
- unsigned int mediabufs_ctl_driver_version(struct mediabufs_ctl *const mbc);
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-index cd79aad563..5cf17dd5e3 100644
---- a/libavcodec/v4l2_request_hevc.c
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -144,6 +144,8 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-     const struct decdev * decdev;
-     const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2;  // Assuming constant for all APIs but avoiding V4L2 includes
-     size_t src_size;
-+    enum mediabufs_memory src_memtype;
-+    enum mediabufs_memory dst_memtype;
- 
-     av_log(avctx, AV_LOG_DEBUG, "<<< %s\n", __func__);
- 
-@@ -174,8 +176,14 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-            decdev_media_path(decdev), decdev_video_path(decdev));
- 
-     if ((ctx->dbufs = dmabufs_ctl_new()) == NULL) {
--        av_log(avctx, AV_LOG_ERROR, "Unable to open dmabufs\n");
--        goto fail0;
-+        av_log(avctx, AV_LOG_DEBUG, "Unable to open dmabufs - try mmap buffers\n");
-+        src_memtype = MEDIABUFS_MEMORY_MMAP;
-+        dst_memtype = MEDIABUFS_MEMORY_MMAP;
-+    }
-+    else {
-+        av_log(avctx, AV_LOG_DEBUG, "Dmabufs opened - try dmabuf buffers\n");
-+        src_memtype = MEDIABUFS_MEMORY_DMABUF;
-+        dst_memtype = MEDIABUFS_MEMORY_DMABUF;
-     }
- 
-     if ((ctx->pq = pollqueue_new()) == NULL) {
-@@ -196,8 +204,9 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-     // Ask for an initial bitbuf size of max size / 4
-     // We will realloc if we need more
-     // Must use sps->h/w as avctx contains cropped size
-+retry_src_memtype:
-     src_size = bit_buf_size(sps->width, sps->height, sps->bit_depth - 8);
--    if (mediabufs_src_resizable(ctx->mbufs))
-+    if (src_memtype == MEDIABUFS_MEMORY_DMABUF && mediabufs_src_resizable(ctx->mbufs))
-         src_size /= 4;
-     // Kludge for conformance tests which break Annex A limits
-     else if (src_size < 0x40000)
-@@ -210,6 +219,15 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-         goto fail4;
-     }
- 
-+    if (mediabufs_src_chk_memtype(ctx->mbufs, src_memtype)) {
-+        if (src_memtype == MEDIABUFS_MEMORY_DMABUF) {
-+            src_memtype = MEDIABUFS_MEMORY_MMAP;
-+            goto retry_src_memtype;
-+        }
-+        av_log(avctx, AV_LOG_ERROR, "Failed to get src memory type\n");
-+        goto fail4;
-+    }
-+
-     if (V2(ff_v4l2_req_hevc, 4).probe(avctx, ctx) == 0) {
-         av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
-         ctx->fns = &V2(ff_v4l2_req_hevc, 4);
-@@ -238,7 +256,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-         goto fail4;
-     }
- 
--    if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6)) {
-+    if (mediabufs_src_pool_create(ctx->mbufs, ctx->dbufs, 6, src_memtype)) {
-         av_log(avctx, AV_LOG_ERROR, "Failed to create source pool\n");
-         goto fail4;
-     }
-@@ -250,8 +268,17 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-                sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering,
-                avctx->thread_count, avctx->extra_hw_frames);
- 
-+        if (mediabufs_dst_chk_memtype(ctx->mbufs, dst_memtype)) {
-+            if (dst_memtype != MEDIABUFS_MEMORY_DMABUF) {
-+                av_log(avctx, AV_LOG_ERROR, "Failed to get dst memory type\n");
-+                goto fail4;
-+            }
-+            av_log(avctx, AV_LOG_DEBUG, "Dst DMABUF not supported - trying mmap\n");
-+            dst_memtype = MEDIABUFS_MEMORY_MMAP;
-+        }
-+
-         // extra_hw_frames is -1 if unset
--        if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0))) {
-+        if (mediabufs_dst_slots_create(ctx->mbufs, dst_slots, (avctx->extra_hw_frames > 0), dst_memtype)) {
-             av_log(avctx, AV_LOG_ERROR, "Failed to create destination slots\n");
-             goto fail4;
-         }
-@@ -277,9 +304,10 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-     // Set our s/w format
-     avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
- 
--    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s\n",
-+    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n",
-            ctx->fns->name,
--           decdev_media_path(decdev), decdev_video_path(decdev));
-+           decdev_media_path(decdev), decdev_video_path(decdev),
-+           mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype));
- 
-     return 0;
- 
-
-From 79c2fcac56586ce9eea0cc8c6b13d2cd54f3e468 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 22 Aug 2022 12:35:40 +0000
-Subject: [PATCH 064/136] Set buffer lengths on DQ
-
----
- libavcodec/v4l2_req_media.c | 8 ++++++++
- 1 file changed, 8 insertions(+)
-
-diff --git a/libavcodec/v4l2_req_media.c b/libavcodec/v4l2_req_media.c
-index 910ac77bb6..1a9944774a 100644
---- a/libavcodec/v4l2_req_media.c
-+++ b/libavcodec/v4l2_req_media.c
-@@ -733,6 +733,14 @@ static struct qent_base * qe_dequeue(struct buf_pool *const bp,
-         return NULL;
-     }
- 
-+    if (mp) {
-+        unsigned int i;
-+        for (i = 0; i != buffer.length; ++i)
-+            dmabuf_len_set(be->dh[i], V4L2_TYPE_IS_CAPTURE(f->type) ? planes[i].bytesused : 0);
-+    }
-+    else
-+        dmabuf_len_set(be->dh[0], V4L2_TYPE_IS_CAPTURE(f->type) ? buffer.length : 0);
-+
-     be->timestamp = buffer.timestamp;
-     be->status = (buffer.flags & V4L2_BUF_FLAG_ERROR) ? QENT_ERROR : QENT_DONE;
-     return be;
-
-From 8f3245ca1e4b2ec7e13fc2f3bffbc964ee8fc290 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 22 Aug 2022 17:11:24 +0000
-Subject: [PATCH 065/136] Fix compile if videodev2.h defines V4L2 HEVC request
- API
-
-If videodev2.h does define the HEVC request API it is really hard to
-set old variations of the controls so if it does then we only compile
-against the system includes and remove the back compatability.
----
- configure                      | 9 +++++++++
- libavcodec/Makefile            | 4 ++--
- libavcodec/hevc-ctrls-v4.h     | 2 ++
- libavcodec/v4l2_req_hevc_vx.c  | 5 -----
- libavcodec/v4l2_request_hevc.c | 6 ++++--
- 5 files changed, 17 insertions(+), 9 deletions(-)
-
-diff --git a/configure b/configure
-index fdc95146bf..5c00a183e3 100755
---- a/configure
-+++ b/configure
-@@ -1946,6 +1946,7 @@ FEATURE_LIST="
-     swscale_alpha
-     vout_drm
-     vout_egl
-+    v4l2_req_hevc_vx
- "
- 
- # this list should be kept in linking order
-@@ -6912,6 +6913,14 @@ fi
- 
- check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
- check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
-+disable v4l2_req_hevc_vx
-+if enabled hevc_v4l2request_hwaccel; then
-+    enable v4l2_req_hevc_vx
-+fi
-+if enabled hevc_v4l2_request; then
-+    disable v4l2_req_hevc_vx
-+fi
-+
- check_headers sys/videoio.h
- test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
- 
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index d433a71236..11f183c9b9 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -999,8 +999,8 @@ OBJS-$(CONFIG_HEVC_D3D11VA_HWACCEL)       += dxva2_hevc.o
- OBJS-$(CONFIG_HEVC_DXVA2_HWACCEL)         += dxva2_hevc.o
- OBJS-$(CONFIG_HEVC_NVDEC_HWACCEL)         += nvdec_hevc.o
- OBJS-$(CONFIG_HEVC_QSV_HWACCEL)           += qsvdec.o
--OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o\
--                                             v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o  v4l2_req_hevc_v4.o
-+OBJS-$(CONFIG_HEVC_V4L2REQUEST_HWACCEL)   += v4l2_request_hevc.o v4l2_req_decode_q.o v4l2_req_hevc_v4.o
-+OBJS-$(CONFIG_V4L2_REQ_HEVC_VX)           += v4l2_req_hevc_v1.o v4l2_req_hevc_v2.o v4l2_req_hevc_v3.o
- OBJS-$(CONFIG_HEVC_VAAPI_HWACCEL)         += vaapi_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_HEVC_VDPAU_HWACCEL)         += vdpau_hevc.o h265_profile_level.o
- OBJS-$(CONFIG_MJPEG_NVDEC_HWACCEL)        += nvdec_mjpeg.o
-diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
-index 7e05f6e7c3..7829d82084 100644
---- a/libavcodec/hevc-ctrls-v4.h
-+++ b/libavcodec/hevc-ctrls-v4.h
-@@ -53,6 +53,8 @@
- #include <linux/const.h>
- #include <linux/types.h>
- 
-+#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
-+
- #define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_STATELESS_BASE + 400)
- #define V4L2_CID_STATELESS_HEVC_PPS		(V4L2_CID_CODEC_STATELESS_BASE + 401)
- #define V4L2_CID_STATELESS_HEVC_SLICE_PARAMS	(V4L2_CID_CODEC_STATELESS_BASE + 402)
-diff --git a/libavcodec/v4l2_req_hevc_vx.c b/libavcodec/v4l2_req_hevc_vx.c
-index 5d083016f8..e1bd5c6a1f 100644
---- a/libavcodec/v4l2_req_hevc_vx.c
-+++ b/libavcodec/v4l2_req_hevc_vx.c
-@@ -40,11 +40,6 @@
- #define V4L2_STATELESS_HEVC_START_CODE_ANNEX_B          V4L2_MPEG_VIDEO_HEVC_START_CODE_ANNEX_B
- #endif
- 
--// Should be in videodev2 but we might not have a good enough one
--#ifndef V4L2_PIX_FMT_HEVC_SLICE
--#define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
--#endif
--
- #include "v4l2_request_hevc.h"
- 
- #include "libavutil/hwcontext_drm.h"
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-index 5cf17dd5e3..614a1b4d99 100644
---- a/libavcodec/v4l2_request_hevc.c
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -17,7 +17,7 @@
-  */
- 
- 
--
-+#include "config.h"
- #include "decode.h"
- #include "hevcdec.h"
- #include "hwconfig.h"
-@@ -142,7 +142,7 @@ static int v4l2_request_hevc_init(AVCodecContext *avctx)
-     const HEVCSPS * const sps = h->ps.sps;
-     int ret;
-     const struct decdev * decdev;
--    const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 1).src_pix_fmt_v4l2;  // Assuming constant for all APIs but avoiding V4L2 includes
-+    const uint32_t src_pix_fmt = V2(ff_v4l2_req_hevc, 4).src_pix_fmt_v4l2;  // Assuming constant for all APIs but avoiding V4L2 includes
-     size_t src_size;
-     enum mediabufs_memory src_memtype;
-     enum mediabufs_memory dst_memtype;
-@@ -232,6 +232,7 @@ retry_src_memtype:
-         av_log(avctx, AV_LOG_DEBUG, "HEVC API version 4 probed successfully\n");
-         ctx->fns = &V2(ff_v4l2_req_hevc, 4);
-     }
-+#if CONFIG_V4L2_REQ_HEVC_VX
-     else if (V2(ff_v4l2_req_hevc, 3).probe(avctx, ctx) == 0) {
-         av_log(avctx, AV_LOG_DEBUG, "HEVC API version 3 probed successfully\n");
-         ctx->fns = &V2(ff_v4l2_req_hevc, 3);
-@@ -244,6 +245,7 @@ retry_src_memtype:
-         av_log(avctx, AV_LOG_DEBUG, "HEVC API version 1 probed successfully\n");
-         ctx->fns = &V2(ff_v4l2_req_hevc, 1);
-     }
-+#endif
-     else {
-         av_log(avctx, AV_LOG_ERROR, "No HEVC version probed successfully\n");
-         ret = AVERROR(EINVAL);
-
-From 35ec6af32c4f05b076f84ab343a8fc0d3263ba44 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 12 Sep 2022 17:59:22 +0100
-Subject: [PATCH 066/136] v4l2_m2m_enc: Send headers in in pkt side_data
-
-If GLOBAL_HEADERS are requested then we can't provide them at init time
-so send as NEW_EXTRADATA side data in a similar way to some AV1
-encoders.
----
- libavcodec/v4l2_m2m_enc.c | 33 +++++++++++++++++++++++----------
- 1 file changed, 23 insertions(+), 10 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index 05ff6ba726..099ad23928 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -544,14 +544,12 @@ dequeue:
-         av_freep(&avctx->extradata);
-         avctx->extradata_size = 0;
- 
--        if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) != NULL)
--            memcpy(data, avpkt->data, len);
-+        if ((data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
-+            goto fail_no_mem;
- 
-+        memcpy(data, avpkt->data, len);
-         av_packet_unref(avpkt);
- 
--        if (data == NULL)
--            return AVERROR(ENOMEM);
--
-         // We need to copy the header, but keep local if not global
-         if ((avctx->flags & AV_CODEC_FLAG_GLOBAL_HEADER) != 0) {
-             avctx->extradata = data;
-@@ -567,18 +565,28 @@ dequeue:
-     }
- 
-     // First frame must be key so mark as such even if encoder forgot
--    if (capture->first_buf == 2)
-+    if (capture->first_buf == 2) {
-         avpkt->flags |= AV_PKT_FLAG_KEY;
- 
-+        // Add any extradata to the 1st packet we emit as we cannot create it at init
-+        if (avctx->extradata_size > 0 && avctx->extradata) {
-+            void * const side = av_packet_new_side_data(avpkt,
-+                                           AV_PKT_DATA_NEW_EXTRADATA,
-+                                           avctx->extradata_size);
-+            if (!side)
-+                goto fail_no_mem;
-+
-+            memcpy(side, avctx->extradata, avctx->extradata_size);
-+        }
-+    }
-+
-     // Add SPS/PPS to the start of every key frame if non-global headers
-     if ((avpkt->flags & AV_PKT_FLAG_KEY) != 0 && s->extdata_size != 0) {
-         const size_t newlen = s->extdata_size + avpkt->size;
-         AVBufferRef * const buf = av_buffer_alloc(newlen + AV_INPUT_BUFFER_PADDING_SIZE);
- 
--        if (buf == NULL) {
--            av_packet_unref(avpkt);
--            return AVERROR(ENOMEM);
--        }
-+        if (buf == NULL)
-+            goto fail_no_mem;
- 
-         memcpy(buf->data, s->extdata_data, s->extdata_size);
-         memcpy(buf->data + s->extdata_size, avpkt->data, avpkt->size);
-@@ -592,6 +600,11 @@ dequeue:
- //    av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
-     capture->first_buf = 0;
-     return 0;
-+
-+fail_no_mem:
-+    ret = AVERROR(ENOMEM);
-+    av_packet_unref(avpkt);
-+    return ret;
- }
- 
- static av_cold int v4l2_encode_init(AVCodecContext *avctx)
-
-From dfc754491cea9192945b92ca9c8d3919321e30ad Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 14 Sep 2022 15:44:10 +0000
-Subject: [PATCH 067/136] matroskaenc: Allow H264 SPS/PPS headers in packet
- sidedata
-
----
- libavformat/matroskaenc.c | 26 ++++++++++++++++++++++----
- 1 file changed, 22 insertions(+), 4 deletions(-)
-
-diff --git a/libavformat/matroskaenc.c b/libavformat/matroskaenc.c
-index 113541bd9a..61e4c976ef 100644
---- a/libavformat/matroskaenc.c
-+++ b/libavformat/matroskaenc.c
-@@ -77,6 +77,10 @@
- 
- #define IS_WEBM(mkv) (CONFIG_WEBM_MUXER && CONFIG_MATROSKA_MUXER ? \
-                       ((mkv)->mode == MODE_WEBM) : CONFIG_WEBM_MUXER)
-+
-+/* Reserved size for H264 headers if not extant at init time */
-+#define MAX_H264_HEADER_SIZE 1024
-+
- #define IS_SEEKABLE(pb, mkv) (((pb)->seekable & AVIO_SEEKABLE_NORMAL) && \
-                               !(mkv)->is_live)
- 
-@@ -1121,8 +1125,12 @@ static int mkv_assemble_native_codecprivate(AVFormatContext *s, AVIOContext *dyn
-     case AV_CODEC_ID_WAVPACK:
-         return put_wv_codecpriv(dyn_cp, extradata, extradata_size);
-     case AV_CODEC_ID_H264:
--        return ff_isom_write_avcc(dyn_cp, extradata,
--                                  extradata_size);
-+        if (par->extradata_size)
-+            return ff_isom_write_avcc(dyn_cp, extradata,
-+                                      extradata_size);
-+        else
-+            *size_to_reserve = MAX_H264_HEADER_SIZE;
-+        break;
-     case AV_CODEC_ID_HEVC:
-         return ff_isom_write_hvcc(dyn_cp, extradata,
-                                   extradata_size, 0);
-@@ -2731,8 +2739,8 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
-         }
-         break;
- #endif
--    // FIXME: Remove the following once libaom starts propagating proper extradata during init()
--    //        See https://bugs.chromium.org/p/aomedia/issues/detail?id=2208
-+    // FIXME: Remove the following once libaom starts propagating extradata during init()
-+    //        See https://bugs.chromium.org/p/aomedia/issues/detail?id=2012
-     case AV_CODEC_ID_AV1:
-         if (side_data_size && mkv->track.bc && !par->extradata_size) {
-             // If the reserved space doesn't suffice, only write
-@@ -2744,6 +2752,16 @@ static int mkv_check_new_extra_data(AVFormatContext *s, const AVPacket *pkt)
-         } else if (!par->extradata_size)
-             return AVERROR_INVALIDDATA;
-         break;
-+    // H264 V4L2 has a similar issue
-+    case AV_CODEC_ID_H264:
-+        if (side_data_size && mkv->track.bc && !par->extradata_size) {
-+            ret = mkv_update_codecprivate(s, mkv, side_data, side_data_size,
-+                                          par, mkv->track.bc, track, 0);
-+            if (ret < 0)
-+                return ret;
-+        } else if (!par->extradata_size)
-+            return AVERROR_INVALIDDATA;
-+        break;
-     default:
-         if (side_data_size)
-             av_log(s, AV_LOG_DEBUG, "Ignoring new extradata in a packet for stream %d.\n", pkt->stream_index);
-
-From 30c6ca4e24ae2acbd7f7f122f5275beb62b625c6 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 14 Sep 2022 15:55:15 +0000
-Subject: [PATCH 068/136] movenc: Allow H264 SPS/PPS headers in packet sidedata
-
----
- libavformat/movenc.c | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/libavformat/movenc.c b/libavformat/movenc.c
-index c4fcb5f8b1..891adbf7b2 100644
---- a/libavformat/movenc.c
-+++ b/libavformat/movenc.c
-@@ -6343,6 +6343,7 @@ static int mov_write_single_packet(AVFormatContext *s, AVPacket *pkt)
-     if (trk->par->codec_id == AV_CODEC_ID_MP4ALS ||
-             trk->par->codec_id == AV_CODEC_ID_AAC ||
-             trk->par->codec_id == AV_CODEC_ID_AV1 ||
-+            trk->par->codec_id == AV_CODEC_ID_H264 ||
-             trk->par->codec_id == AV_CODEC_ID_FLAC) {
-         size_t side_size;
-         uint8_t *side = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
-
-From 1c7c3e99e9ed90f241aecbe7b2269229587d1e03 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 26 Sep 2022 12:45:05 +0100
-Subject: [PATCH 069/136] Allow ffmpeg to select codec internal hwfmts if
- no_cvt_hw
-
-This allows the selection of DRM_PRIME from v4l2m2m without forcing it
-in the decoder.
-
-Not utterly sure this is the right method for 5.1 but it does work
----
- fftools/ffmpeg.c | 7 +++++--
- 1 file changed, 5 insertions(+), 2 deletions(-)
-
-diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
-index ba0c1898cf..839da7b472 100644
---- a/fftools/ffmpeg.c
-+++ b/fftools/ffmpeg.c
-@@ -2763,12 +2763,15 @@ static enum AVPixelFormat get_format(AVCodecContext *s, const enum AVPixelFormat
-             break;
- 
-         if (ist->hwaccel_id == HWACCEL_GENERIC ||
--            ist->hwaccel_id == HWACCEL_AUTO) {
-+            ist->hwaccel_id == HWACCEL_AUTO ||
-+            no_cvt_hw) {
-             for (i = 0;; i++) {
-                 config = avcodec_get_hw_config(s->codec, i);
-                 if (!config)
-                     break;
--                if (!(config->methods &
-+                if (no_cvt_hw && (config->methods & AV_CODEC_HW_CONFIG_METHOD_INTERNAL))
-+                    av_log(s, AV_LOG_DEBUG, "no_cvt_hw so trying pix_fmt %d with codec internal hwaccel\n", *p);
-+                else if (!(config->methods &
-                       AV_CODEC_HW_CONFIG_METHOD_HW_DEVICE_CTX))
-                     continue;
-                 if (config->pix_fmt == *p)
-
-From ecf273fd02e8aafe8775b1f291b9664b1b49572e Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 1 Sep 2022 11:42:41 +0000
-Subject: [PATCH 070/136] vf_deinterlace_v4l2m2m: Add a v4l2m2m scaler
-
-The logic for running an isp based scaler is pretty much identical to
-that for the deinterlacer so add to the deinterlacer. This requires
-some rework of the setup code to avoid assumptions that are true for
-deinterlace but not scale but the reworked code requires few switches
-based on operation.
----
- libavfilter/allfilters.c             |    1 +
- libavfilter/vf_deinterlace_v4l2m2m.c | 1123 ++++++++++++++++++++------
- 2 files changed, 877 insertions(+), 247 deletions(-)
-
-diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
-index 357ff61ca8..d504fa1bc8 100644
---- a/libavfilter/allfilters.c
-+++ b/libavfilter/allfilters.c
-@@ -421,6 +421,7 @@ extern const AVFilter ff_vf_scale;
- extern const AVFilter ff_vf_scale_cuda;
- extern const AVFilter ff_vf_scale_npp;
- extern const AVFilter ff_vf_scale_qsv;
-+extern const AVFilter ff_vf_scale_v4l2m2m;
- extern const AVFilter ff_vf_scale_vaapi;
- extern const AVFilter ff_vf_scale_vulkan;
- extern const AVFilter ff_vf_scale2ref;
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index 1a3bef5bcb..2df39ec0f1 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -52,31 +52,36 @@
- #include "avfilter.h"
- #include "formats.h"
- #include "internal.h"
-+#include "scale_eval.h"
- #include "video.h"
- 
-+#ifndef DRM_FORMAT_P030
-+#define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
-+#endif
-+
- typedef struct V4L2Queue V4L2Queue;
- typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
- 
--typedef struct V4L2PlaneInfo {
--    int bytesperline;
--    size_t length;
--} V4L2PlaneInfo;
-+typedef enum filter_type_v4l2_e
-+{
-+    FILTER_V4L2_DEINTERLACE = 1,
-+    FILTER_V4L2_SCALE,
-+} filter_type_v4l2_t;
- 
- typedef struct V4L2Buffer {
-     int enqueued;
-     int reenqueue;
--    int fd;
-     struct v4l2_buffer buffer;
-     AVFrame frame;
-     struct v4l2_plane planes[VIDEO_MAX_PLANES];
-     int num_planes;
--    V4L2PlaneInfo plane_info[VIDEO_MAX_PLANES];
-     AVDRMFrameDescriptor drm_frame;
-     V4L2Queue *q;
- } V4L2Buffer;
- 
- typedef struct V4L2Queue {
-     struct v4l2_format format;
-+    struct v4l2_selection sel;
-     int num_buffers;
-     V4L2Buffer *buffers;
-     DeintV4L2M2MContextShared *ctx;
-@@ -111,11 +116,18 @@ typedef struct pts_track_s
- 
- typedef struct DeintV4L2M2MContextShared {
-     void * logctx;  // For logging - will be NULL when done
-+    filter_type_v4l2_t filter_type;
- 
-     int fd;
-     int done;
-     int width;
-     int height;
-+
-+    // from options
-+    int output_width;
-+    int output_height;
-+    enum AVPixelFormat output_format;
-+
-     int orig_width;
-     int orig_height;
-     atomic_uint refcount;
-@@ -134,8 +146,60 @@ typedef struct DeintV4L2M2MContext {
-     const AVClass *class;
- 
-     DeintV4L2M2MContextShared *shared;
-+
-+    char * w_expr;
-+    char * h_expr;
-+    char * output_format_string;;
-+
-+    int force_original_aspect_ratio;
-+    int force_divisible_by;
-+
-+    char *colour_primaries_string;
-+    char *colour_transfer_string;
-+    char *colour_matrix_string;
-+    int   colour_range;
-+    char *chroma_location_string;
-+
-+    enum AVColorPrimaries colour_primaries;
-+    enum AVColorTransferCharacteristic colour_transfer;
-+    enum AVColorSpace colour_matrix;
-+    enum AVChromaLocation chroma_location;
- } DeintV4L2M2MContext;
- 
-+// These just list the ones we know we can cope with
-+static uint32_t
-+fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
-+{
-+    switch (avfmt) {
-+    case AV_PIX_FMT_YUV420P:
-+        return V4L2_PIX_FMT_YUV420;
-+    case AV_PIX_FMT_NV12:
-+        return V4L2_PIX_FMT_NV12;
-+    case AV_PIX_FMT_RPI4_8:
-+    case AV_PIX_FMT_SAND128:
-+        return V4L2_PIX_FMT_NV12_COL128;
-+    default:
-+        break;
-+    }
-+    return 0;
-+}
-+
-+static enum AVPixelFormat
-+fmt_v4l2_to_av(const uint32_t pixfmt)
-+{
-+    switch (pixfmt) {
-+    case V4L2_PIX_FMT_YUV420:
-+        return AV_PIX_FMT_YUV420P;
-+    case V4L2_PIX_FMT_NV12:
-+        return AV_PIX_FMT_NV12;
-+    case V4L2_PIX_FMT_NV12_COL128:
-+        return AV_PIX_FMT_RPI4_8;
-+    default:
-+        break;
-+    }
-+    return AV_PIX_FMT_NONE;
-+}
-+
- static unsigned int pts_stats_interval(const pts_stats_t * const stats)
- {
-     return stats->last_interval;
-@@ -301,6 +365,39 @@ static int pts_track_init(pts_track_t * const trk, void *logctx)
-     return 0;
- }
- 
-+static inline uint32_t
-+fmt_bpl(const struct v4l2_format * const fmt, const unsigned int plane_n)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.plane_fmt[plane_n].bytesperline : fmt->fmt.pix.bytesperline;
-+}
-+
-+static inline uint32_t
-+fmt_height(const struct v4l2_format * const fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.height : fmt->fmt.pix.height;
-+}
-+
-+static inline uint32_t
-+fmt_width(const struct v4l2_format * const fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.width : fmt->fmt.pix.width;
-+}
-+
-+static inline uint32_t
-+fmt_pixelformat(const struct v4l2_format * const fmt)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
-+}
-+
-+static void
-+init_format(V4L2Queue * const q, const uint32_t format_type)
-+{
-+    memset(&q->format, 0, sizeof(q->format));
-+    memset(&q->sel,    0, sizeof(q->sel));
-+    q->format.type = format_type;
-+    q->sel.type    = format_type;
-+}
-+
- static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
- {
-     struct v4l2_capability cap;
-@@ -311,80 +408,99 @@ static int deint_v4l2m2m_prepare_context(DeintV4L2M2MContextShared *ctx)
-     if (ret < 0)
-         return ret;
- 
--    if (!(cap.capabilities & V4L2_CAP_STREAMING))
-+    if (ctx->filter_type == FILTER_V4L2_SCALE &&
-+        strcmp("bcm2835-codec-isp", cap.card) != 0)
-+    {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "Not ISP\n");
-         return AVERROR(EINVAL);
-+    }
- 
--    if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
--        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
--        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
--
--        return 0;
-+    if (!(cap.capabilities & V4L2_CAP_STREAMING)) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "No streaming\n");
-+        return AVERROR(EINVAL);
-     }
- 
-     if (cap.capabilities & V4L2_CAP_VIDEO_M2M_MPLANE) {
--        ctx->capture.format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
--        ctx->output.format.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
--
--        return 0;
-+        init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE);
-+        init_format(&ctx->output,  V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE);
-+    }
-+    else if (cap.capabilities & V4L2_CAP_VIDEO_M2M) {
-+        init_format(&ctx->capture, V4L2_BUF_TYPE_VIDEO_CAPTURE);
-+        init_format(&ctx->output,  V4L2_BUF_TYPE_VIDEO_OUTPUT);
-+    }
-+    else {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "Not M2M\n");
-+        return AVERROR(EINVAL);
-     }
- 
--    return AVERROR(EINVAL);
-+    return 0;
- }
- 
--static int deint_v4l2m2m_try_format(V4L2Queue *queue)
-+// Just use for probe - doesn't modify q format
-+static int deint_v4l2m2m_try_format(V4L2Queue *queue, const uint32_t width, const uint32_t height, const enum AVPixelFormat avfmt)
- {
--    struct v4l2_format *fmt        = &queue->format;
-+    struct v4l2_format fmt         = {.type = queue->format.type};
-     DeintV4L2M2MContextShared *ctx = queue->ctx;
-     int ret, field;
-+    // Pick YUV to test with if not otherwise specified
-+    uint32_t pixelformat = avfmt == AV_PIX_FMT_NONE ? V4L2_PIX_FMT_YUV420 : fmt_av_to_v4l2(avfmt);
-+    enum AVPixelFormat r_avfmt;
-+
- 
--    ret = ioctl(ctx->fd, VIDIOC_G_FMT, fmt);
-+    ret = ioctl(ctx->fd, VIDIOC_G_FMT, &fmt);
-     if (ret)
-         av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_G_FMT failed: %d\n", ret);
- 
--    if (V4L2_TYPE_IS_OUTPUT(fmt->type))
-+    if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && V4L2_TYPE_IS_OUTPUT(fmt.type))
-         field = V4L2_FIELD_INTERLACED_TB;
-     else
-         field = V4L2_FIELD_NONE;
- 
--    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
--        fmt->fmt.pix_mp.pixelformat = V4L2_PIX_FMT_YUV420;
--        fmt->fmt.pix_mp.field = field;
--        fmt->fmt.pix_mp.width = ctx->width;
--        fmt->fmt.pix_mp.height = ctx->height;
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
-+        fmt.fmt.pix_mp.pixelformat = pixelformat;
-+        fmt.fmt.pix_mp.field = field;
-+        fmt.fmt.pix_mp.width = width;
-+        fmt.fmt.pix_mp.height = height;
-     } else {
--        fmt->fmt.pix.pixelformat = V4L2_PIX_FMT_YUV420;
--        fmt->fmt.pix.field = field;
--        fmt->fmt.pix.width = ctx->width;
--        fmt->fmt.pix.height = ctx->height;
-+        fmt.fmt.pix.pixelformat = pixelformat;
-+        fmt.fmt.pix.field = field;
-+        fmt.fmt.pix.width = width;
-+        fmt.fmt.pix.height = height;
-     }
- 
--    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
--		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
--		 fmt->fmt.pix_mp.pixelformat,
--		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
-+    av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u pre\n", __func__,
-+         fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
-+         fmt.fmt.pix_mp.pixelformat,
-+         fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
- 
--    ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, fmt);
-+    ret = ioctl(ctx->fd, VIDIOC_TRY_FMT, &fmt);
-     if (ret)
-         return AVERROR(EINVAL);
- 
--    av_log(ctx->logctx, AV_LOG_DEBUG, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
--		 fmt->type, fmt->fmt.pix_mp.width, fmt->fmt.pix_mp.height,
--		 fmt->fmt.pix_mp.pixelformat,
--		 fmt->fmt.pix_mp.plane_fmt[0].sizeimage, fmt->fmt.pix_mp.plane_fmt[0].bytesperline);
-+    av_log(ctx->logctx, AV_LOG_TRACE, "%s: Trying format for type %d, wxh: %dx%d, fmt: %08x, size %u bpl %u post\n", __func__,
-+         fmt.type, fmt.fmt.pix_mp.width, fmt.fmt.pix_mp.height,
-+         fmt.fmt.pix_mp.pixelformat,
-+         fmt.fmt.pix_mp.plane_fmt[0].sizeimage, fmt.fmt.pix_mp.plane_fmt[0].bytesperline);
- 
--    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
--        if ((fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_YUV420 &&
--             fmt->fmt.pix_mp.pixelformat != V4L2_PIX_FMT_NV12) ||
--            fmt->fmt.pix_mp.field != field) {
--            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
-+    r_avfmt = fmt_v4l2_to_av(fmt_pixelformat(&fmt));
-+    if (r_avfmt != avfmt && avfmt != AV_PIX_FMT_NONE) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "Unable to set format %s on %s port\n", av_get_pix_fmt_name(avfmt), V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
-+        return AVERROR(EINVAL);
-+    }
-+    if (r_avfmt == AV_PIX_FMT_NONE) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "No supported format on %s port\n", V4L2_TYPE_IS_CAPTURE(fmt.type) ? "dest" : "src");
-+        return AVERROR(EINVAL);
-+    }
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
-+        if (fmt.fmt.pix_mp.field != field) {
-+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
- 
-             return AVERROR(EINVAL);
-         }
-     } else {
--        if ((fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_YUV420 &&
--             fmt->fmt.pix.pixelformat != V4L2_PIX_FMT_NV12) ||
--            fmt->fmt.pix.field != field) {
--            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt->type);
-+        if (fmt.fmt.pix.field != field) {
-+            av_log(ctx->logctx, AV_LOG_DEBUG, "format not supported for type %d\n", fmt.type);
- 
-             return AVERROR(EINVAL);
-         }
-@@ -393,68 +509,410 @@ static int deint_v4l2m2m_try_format(V4L2Queue *queue)
-     return 0;
- }
- 
--static int deint_v4l2m2m_set_format(V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height, int pitch, int ysize)
-+static int
-+do_s_fmt(V4L2Queue * const q)
- {
--    struct v4l2_format *fmt        = &queue->format;
--    DeintV4L2M2MContextShared *ctx = queue->ctx;
-+    DeintV4L2M2MContextShared * const ctx = q->ctx;
-+    const uint32_t pixelformat = fmt_pixelformat(&q->format);
-     int ret;
- 
--    struct v4l2_selection sel = {
--        .type = fmt->type,
--        .target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP_BOUNDS : V4L2_SEL_TGT_COMPOSE_BOUNDS,
--    };
--
--    // This works for most single object 4:2:0 types
--    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
--        fmt->fmt.pix_mp.pixelformat = pixelformat;
--        fmt->fmt.pix_mp.field = field;
--        fmt->fmt.pix_mp.width = width;
--        fmt->fmt.pix_mp.height = ysize / pitch;
--        fmt->fmt.pix_mp.plane_fmt[0].bytesperline = pitch;
--        fmt->fmt.pix_mp.plane_fmt[0].sizeimage = ysize + (ysize >> 1);
--    } else {
--        fmt->fmt.pix.pixelformat = pixelformat;
--        fmt->fmt.pix.field = field;
--        fmt->fmt.pix.width = width;
--        fmt->fmt.pix.height = height;
--        fmt->fmt.pix.sizeimage = 0;
--        fmt->fmt.pix.bytesperline = 0;
--    }
--
--    ret = ioctl(ctx->fd, VIDIOC_S_FMT, fmt);
-+    ret = ioctl(ctx->fd, VIDIOC_S_FMT, &q->format);
-     if (ret) {
-         ret = AVERROR(errno);
--        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %d\n", ret);
-+        av_log(ctx->logctx, AV_LOG_ERROR, "VIDIOC_S_FMT failed: %s\n", av_err2str(ret));
-         return ret;
-     }
- 
--    if (pixelformat != fmt->fmt.pix.pixelformat) {
--        av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt->fmt.pix.pixelformat));
-+    if (pixelformat != fmt_pixelformat(&q->format)) {
-+        av_log(ctx->logctx, AV_LOG_ERROR, "Format not supported: %s; S_FMT returned %s\n", av_fourcc2str(pixelformat), av_fourcc2str(fmt_pixelformat(&q->format)));
-         return AVERROR(EINVAL);
-     }
- 
--    ret = ioctl(ctx->fd, VIDIOC_G_SELECTION, &sel);
-+    q->sel.target = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
-+    q->sel.flags  = V4L2_TYPE_IS_OUTPUT(q->sel.type) ? V4L2_SEL_FLAG_LE : V4L2_SEL_FLAG_GE;
-+
-+    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &q->sel);
-     if (ret) {
-         ret = AVERROR(errno);
--        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_G_SELECTION failed: %d\n", ret);
-+        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %s\n", av_err2str(ret));
-     }
- 
--    sel.r.width = width;
--    sel.r.height = height;
--    sel.r.left = 0;
--    sel.r.top = 0;
--    sel.target = V4L2_TYPE_IS_OUTPUT(fmt->type) ? V4L2_SEL_TGT_CROP : V4L2_SEL_TGT_COMPOSE,
--    sel.flags = V4L2_SEL_FLAG_LE;
-+    return 0;
-+}
- 
--    ret = ioctl(ctx->fd, VIDIOC_S_SELECTION, &sel);
--    if (ret) {
--        ret = AVERROR(errno);
--        av_log(ctx->logctx, AV_LOG_WARNING, "VIDIOC_S_SELECTION failed: %d\n", ret);
-+static void
-+set_fmt_color(struct v4l2_format *const fmt,
-+               const enum AVColorPrimaries avcp,
-+               const enum AVColorSpace avcs,
-+               const enum AVColorTransferCharacteristic avxc)
-+{
-+    enum v4l2_ycbcr_encoding ycbcr = V4L2_YCBCR_ENC_DEFAULT;
-+    enum v4l2_colorspace cs = V4L2_COLORSPACE_DEFAULT;
-+    enum v4l2_xfer_func xfer = V4L2_XFER_FUNC_DEFAULT;
-+
-+    switch (avcp) {
-+    case AVCOL_PRI_BT709:
-+        cs = V4L2_COLORSPACE_REC709;
-+        ycbcr = V4L2_YCBCR_ENC_709;
-+        break;
-+    case AVCOL_PRI_BT470M:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
-+        ycbcr = V4L2_YCBCR_ENC_601;
-+        break;
-+    case AVCOL_PRI_BT470BG:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
-+        break;
-+    case AVCOL_PRI_SMPTE170M:
-+        cs = V4L2_COLORSPACE_SMPTE170M;
-+        break;
-+    case AVCOL_PRI_SMPTE240M:
-+        cs = V4L2_COLORSPACE_SMPTE240M;
-+        break;
-+    case AVCOL_PRI_BT2020:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        break;
-+    case AVCOL_PRI_SMPTE428:
-+    case AVCOL_PRI_SMPTE431:
-+    case AVCOL_PRI_SMPTE432:
-+    case AVCOL_PRI_EBU3213:
-+    case AVCOL_PRI_RESERVED:
-+    case AVCOL_PRI_FILM:
-+    case AVCOL_PRI_UNSPECIFIED:
-+    default:
-+        break;
-+    }
-+
-+    switch (avcs) {
-+    case AVCOL_SPC_RGB:
-+        cs = V4L2_COLORSPACE_SRGB;
-+        break;
-+    case AVCOL_SPC_BT709:
-+        cs = V4L2_COLORSPACE_REC709;
-+        break;
-+    case AVCOL_SPC_FCC:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_M;
-+        break;
-+    case AVCOL_SPC_BT470BG:
-+        cs = V4L2_COLORSPACE_470_SYSTEM_BG;
-+        break;
-+    case AVCOL_SPC_SMPTE170M:
-+        cs = V4L2_COLORSPACE_SMPTE170M;
-+        break;
-+    case AVCOL_SPC_SMPTE240M:
-+        cs = V4L2_COLORSPACE_SMPTE240M;
-+        break;
-+    case AVCOL_SPC_BT2020_CL:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        ycbcr = V4L2_YCBCR_ENC_BT2020_CONST_LUM;
-+        break;
-+    case AVCOL_SPC_BT2020_NCL:
-+        cs = V4L2_COLORSPACE_BT2020;
-+        break;
-+    default:
-+        break;
-+    }
-+
-+    switch (xfer) {
-+    case AVCOL_TRC_BT709:
-+        xfer = V4L2_XFER_FUNC_709;
-+        break;
-+    case AVCOL_TRC_IEC61966_2_1:
-+        xfer = V4L2_XFER_FUNC_SRGB;
-+        break;
-+    case AVCOL_TRC_SMPTE240M:
-+        xfer = V4L2_XFER_FUNC_SMPTE240M;
-+        break;
-+    case AVCOL_TRC_SMPTE2084:
-+        xfer = V4L2_XFER_FUNC_SMPTE2084;
-+        break;
-+    default:
-+        break;
-+    }
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.colorspace = cs;
-+        fmt->fmt.pix_mp.ycbcr_enc = ycbcr;
-+        fmt->fmt.pix_mp.xfer_func = xfer;
-+    } else {
-+        fmt->fmt.pix.colorspace = cs;
-+        fmt->fmt.pix.ycbcr_enc = ycbcr;
-+        fmt->fmt.pix.xfer_func = xfer;
-+    }
-+}
-+
-+static void
-+set_fmt_color_range(struct v4l2_format *const fmt, const enum AVColorRange avcr)
-+{
-+    const enum v4l2_quantization q =
-+        avcr == AVCOL_RANGE_MPEG ? V4L2_QUANTIZATION_LIM_RANGE :
-+        avcr == AVCOL_RANGE_JPEG ? V4L2_QUANTIZATION_FULL_RANGE :
-+            V4L2_QUANTIZATION_DEFAULT;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.quantization = q;
-+    } else {
-+        fmt->fmt.pix.quantization = q;
-+    }
-+}
-+
-+static enum AVColorPrimaries get_color_primaries(const struct v4l2_format *const fmt)
-+{
-+    enum v4l2_ycbcr_encoding ycbcr;
-+    enum v4l2_colorspace cs;
-+
-+    cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.colorspace :
-+        fmt->fmt.pix.colorspace;
-+
-+    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.ycbcr_enc:
-+        fmt->fmt.pix.ycbcr_enc;
-+
-+    switch(ycbcr) {
-+    case V4L2_YCBCR_ENC_XV709:
-+    case V4L2_YCBCR_ENC_709: return AVCOL_PRI_BT709;
-+    case V4L2_YCBCR_ENC_XV601:
-+    case V4L2_YCBCR_ENC_601:return AVCOL_PRI_BT470M;
-+    default:
-+        break;
-+    }
-+
-+    switch(cs) {
-+    case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_PRI_BT470BG;
-+    case V4L2_COLORSPACE_SMPTE170M: return AVCOL_PRI_SMPTE170M;
-+    case V4L2_COLORSPACE_SMPTE240M: return AVCOL_PRI_SMPTE240M;
-+    case V4L2_COLORSPACE_BT2020: return AVCOL_PRI_BT2020;
-+    default:
-+        break;
-+    }
-+
-+    return AVCOL_PRI_UNSPECIFIED;
-+}
-+
-+static enum AVColorSpace get_color_space(const struct v4l2_format *const fmt)
-+{
-+    enum v4l2_ycbcr_encoding ycbcr;
-+    enum v4l2_colorspace cs;
-+
-+    cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.colorspace :
-+        fmt->fmt.pix.colorspace;
-+
-+    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.ycbcr_enc:
-+        fmt->fmt.pix.ycbcr_enc;
-+
-+    switch(cs) {
-+    case V4L2_COLORSPACE_SRGB: return AVCOL_SPC_RGB;
-+    case V4L2_COLORSPACE_REC709: return AVCOL_SPC_BT709;
-+    case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_SPC_FCC;
-+    case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_SPC_BT470BG;
-+    case V4L2_COLORSPACE_SMPTE170M: return AVCOL_SPC_SMPTE170M;
-+    case V4L2_COLORSPACE_SMPTE240M: return AVCOL_SPC_SMPTE240M;
-+    case V4L2_COLORSPACE_BT2020:
-+        if (ycbcr == V4L2_YCBCR_ENC_BT2020_CONST_LUM)
-+            return AVCOL_SPC_BT2020_CL;
-+        else
-+             return AVCOL_SPC_BT2020_NCL;
-+    default:
-+        break;
-+    }
-+
-+    return AVCOL_SPC_UNSPECIFIED;
-+}
-+
-+static enum AVColorTransferCharacteristic get_color_trc(const struct v4l2_format *const fmt)
-+{
-+    enum v4l2_ycbcr_encoding ycbcr;
-+    enum v4l2_xfer_func xfer;
-+    enum v4l2_colorspace cs;
-+
-+    cs = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.colorspace :
-+        fmt->fmt.pix.colorspace;
-+
-+    ycbcr = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.ycbcr_enc:
-+        fmt->fmt.pix.ycbcr_enc;
-+
-+    xfer = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.xfer_func:
-+        fmt->fmt.pix.xfer_func;
-+
-+    switch (xfer) {
-+    case V4L2_XFER_FUNC_709: return AVCOL_TRC_BT709;
-+    case V4L2_XFER_FUNC_SRGB: return AVCOL_TRC_IEC61966_2_1;
-+    default:
-+        break;
-+    }
-+
-+    switch (cs) {
-+    case V4L2_COLORSPACE_470_SYSTEM_M: return AVCOL_TRC_GAMMA22;
-+    case V4L2_COLORSPACE_470_SYSTEM_BG: return AVCOL_TRC_GAMMA28;
-+    case V4L2_COLORSPACE_SMPTE170M: return AVCOL_TRC_SMPTE170M;
-+    case V4L2_COLORSPACE_SMPTE240M: return AVCOL_TRC_SMPTE240M;
-+    default:
-+        break;
-+    }
-+
-+    switch (ycbcr) {
-+    case V4L2_YCBCR_ENC_XV709:
-+    case V4L2_YCBCR_ENC_XV601: return AVCOL_TRC_BT1361_ECG;
-+    default:
-+        break;
-+    }
-+
-+    return AVCOL_TRC_UNSPECIFIED;
-+}
-+
-+static enum AVColorRange get_color_range(const struct v4l2_format *const fmt)
-+{
-+    enum v4l2_quantization qt;
-+
-+    qt = V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ?
-+        fmt->fmt.pix_mp.quantization :
-+        fmt->fmt.pix.quantization;
-+
-+    switch (qt) {
-+    case V4L2_QUANTIZATION_LIM_RANGE: return AVCOL_RANGE_MPEG;
-+    case V4L2_QUANTIZATION_FULL_RANGE: return AVCOL_RANGE_JPEG;
-+    default:
-+        break;
-+    }
-+
-+     return AVCOL_RANGE_UNSPECIFIED;
-+}
-+
-+static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
-+{
-+    struct v4l2_format *const format = &q->format;
-+    const AVDRMFrameDescriptor *const src = (const AVDRMFrameDescriptor *)frame->data[0];
-+
-+    const uint32_t drm_fmt = src->layers[0].format;
-+    // Treat INVALID as LINEAR
-+    const uint64_t mod = src->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID ?
-+        DRM_FORMAT_MOD_LINEAR : src->objects[0].format_modifier;
-+    uint32_t pix_fmt = 0;
-+    uint32_t w = 0;
-+    uint32_t h = 0;
-+    uint32_t bpl = src->layers[0].planes[0].pitch;
-+
-+    // We really don't expect multiple layers
-+    // All formats that we currently cope with are single object
-+
-+    if (src->nb_layers != 1 || src->nb_objects != 1)
-+        return AVERROR(EINVAL);
-+
-+    switch (drm_fmt) {
-+        case DRM_FORMAT_YUV420:
-+            if (mod == DRM_FORMAT_MOD_LINEAR) {
-+                if (src->layers[0].nb_planes != 3)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_YUV420;
-+                h = src->layers[0].planes[1].offset / bpl;
-+                w = bpl;
-+            }
-+            break;
-+
-+        case DRM_FORMAT_NV12:
-+            if (mod == DRM_FORMAT_MOD_LINEAR) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_NV12;
-+                h = src->layers[0].planes[1].offset / bpl;
-+                w = bpl;
-+            }
-+            else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt = V4L2_PIX_FMT_NV12_COL128;
-+                w = bpl;
-+                h = src->layers[0].planes[1].offset / 128;
-+                bpl = fourcc_mod_broadcom_param(mod);
-+            }
-+            break;
-+
-+        case DRM_FORMAT_P030:
-+            if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-+                if (src->layers[0].nb_planes != 2)
-+                    break;
-+                pix_fmt =  V4L2_PIX_FMT_NV12_10_COL128;
-+                w = bpl / 2;  // Matching lie to how we construct this
-+                h = src->layers[0].planes[1].offset / 128;
-+                bpl = fourcc_mod_broadcom_param(mod);
-+            }
-+            break;
-+
-+        default:
-+            break;
-+    }
-+
-+    if (!pix_fmt)
-+        return AVERROR(EINVAL);
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(format->type)) {
-+        struct v4l2_pix_format_mplane *const pix = &format->fmt.pix_mp;
-+
-+        pix->width = w;
-+        pix->height = h;
-+        pix->pixelformat = pix_fmt;
-+        pix->plane_fmt[0].bytesperline = bpl;
-+        pix->num_planes = 1;
-+    }
-+    else {
-+        struct v4l2_pix_format *const pix = &format->fmt.pix;
-+
-+        pix->width = w;
-+        pix->height = h;
-+        pix->pixelformat = pix_fmt;
-+        pix->bytesperline = bpl;
-     }
- 
-+    set_fmt_color(format, frame->color_primaries, frame->colorspace, frame->color_trc);
-+    set_fmt_color_range(format, frame->color_range);
-+
-+    q->sel.r.width = frame->width - (frame->crop_left + frame->crop_right);
-+    q->sel.r.height = frame->height - (frame->crop_top + frame->crop_bottom);
-+    q->sel.r.left = frame->crop_left;
-+    q->sel.r.top = frame->crop_top;
-+
-     return 0;
- }
- 
-+
-+static int set_dst_format(DeintV4L2M2MContext * const priv, V4L2Queue *queue, uint32_t pixelformat, uint32_t field, int width, int height)
-+{
-+    struct v4l2_format * const fmt   = &queue->format;
-+    struct v4l2_selection *const sel = &queue->sel;
-+
-+    memset(&fmt->fmt, 0, sizeof(fmt->fmt));
-+
-+    // Align w/h to 16 here in case there are alignment requirements at the next
-+    // stage of the filter chain (also RPi deinterlace setup is bust and this
-+    // fixes it)
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt->type)) {
-+        fmt->fmt.pix_mp.pixelformat = pixelformat;
-+        fmt->fmt.pix_mp.field = field;
-+        fmt->fmt.pix_mp.width = FFALIGN(width, 16);
-+        fmt->fmt.pix_mp.height = FFALIGN(height, 16);
-+    } else {
-+        fmt->fmt.pix.pixelformat = pixelformat;
-+        fmt->fmt.pix.field = field;
-+        fmt->fmt.pix.width = FFALIGN(width, 16);
-+        fmt->fmt.pix.height = FFALIGN(height, 16);
-+    }
-+
-+    set_fmt_color(fmt, priv->colour_primaries, priv->colour_matrix, priv->colour_transfer);
-+    set_fmt_color_range(fmt, priv->colour_range);
-+
-+    sel->r.width = width;
-+    sel->r.height = height;
-+    sel->r.left = 0;
-+    sel->r.top = 0;
-+
-+    return do_s_fmt(queue);
-+}
-+
- static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node)
- {
-     int ret;
-@@ -464,16 +922,22 @@ static int deint_v4l2m2m_probe_device(DeintV4L2M2MContextShared *ctx, char *node
-         return AVERROR(errno);
- 
-     ret = deint_v4l2m2m_prepare_context(ctx);
--    if (ret)
-+    if (ret) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to prepare context\n");
-         goto fail;
-+    }
- 
--    ret = deint_v4l2m2m_try_format(&ctx->capture);
--    if (ret)
-+    ret = deint_v4l2m2m_try_format(&ctx->capture, ctx->output_width, ctx->output_height, ctx->output_format);
-+    if (ret) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try dst format\n");
-         goto fail;
-+    }
- 
--    ret = deint_v4l2m2m_try_format(&ctx->output);
--    if (ret)
-+    ret = deint_v4l2m2m_try_format(&ctx->output, ctx->width, ctx->height, AV_PIX_FMT_NONE);
-+    if (ret) {
-+        av_log(ctx->logctx, AV_LOG_DEBUG, "Failed to try src format\n");
-         goto fail;
-+    }
- 
-     return 0;
- 
-@@ -534,26 +998,118 @@ static int deint_v4l2m2m_enqueue_buffer(V4L2Buffer *buf)
-     return 0;
- }
- 
--static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
-+static void
-+drm_frame_init(AVDRMFrameDescriptor * const d)
-+{
-+    unsigned int i;
-+    for (i = 0; i != AV_DRM_MAX_PLANES; ++i) {
-+        d->objects[i].fd = -1;
-+    }
-+}
-+
-+static void
-+drm_frame_uninit(AVDRMFrameDescriptor * const d)
-+{
-+    unsigned int i;
-+    for (i = 0; i != d->nb_objects; ++i) {
-+        if (d->objects[i].fd != -1) {
-+            close(d->objects[i].fd);
-+            d->objects[i].fd = -1;
-+        }
-+    }
-+}
-+
-+static void
-+avbufs_delete(V4L2Buffer** ppavbufs, const unsigned int n)
-+{
-+    unsigned int i;
-+    V4L2Buffer* const avbufs = *ppavbufs;
-+
-+    if (avbufs == NULL)
-+        return;
-+    *ppavbufs = NULL;
-+
-+    for (i = 0; i != n; ++i) {
-+        V4L2Buffer* const avbuf = avbufs + i;
-+        drm_frame_uninit(&avbuf->drm_frame);
-+    }
-+
-+    av_free(avbufs);
-+}
-+
-+static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
- {
-     struct v4l2_exportbuffer expbuf;
-     int i, ret;
-     uint64_t mod = DRM_FORMAT_MOD_LINEAR;
--    uint32_t fmt = 0;
- 
--    switch (pixelformat) {
--    case V4L2_PIX_FMT_NV12:
--        fmt = DRM_FORMAT_NV12;
--        break;
--    case V4L2_PIX_FMT_YUV420:
--        fmt = DRM_FORMAT_YUV420;
--        break;
--    default:
--        return AVERROR(EINVAL);
-+    AVDRMFrameDescriptor * const drm_desc = &avbuf->drm_frame;
-+    AVDRMLayerDescriptor * const layer = &drm_desc->layers[0];
-+    const struct v4l2_format *const fmt = &q->format;
-+    const uint32_t height = fmt_height(fmt);
-+    const uint32_t width  = fmt_width(fmt);
-+    ptrdiff_t bpl0;
-+
-+    /* fill the DRM frame descriptor */
-+    drm_desc->nb_layers = 1;
-+    layer->nb_planes = avbuf->num_planes;
-+
-+    for (int i = 0; i < avbuf->num_planes; i++) {
-+        layer->planes[i].object_index = i;
-+        layer->planes[i].offset = 0;
-+        layer->planes[i].pitch = fmt_bpl(fmt, i);
-     }
-+    bpl0 = layer->planes[0].pitch;
-+
-+    switch (fmt_pixelformat(fmt)) {
-+
-+        case V4L2_PIX_FMT_NV12_COL128:
-+            mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0);
-+            layer->format = V4L2_PIX_FMT_NV12;
-+
-+            if (avbuf->num_planes > 1)
-+                break;
-+
-+            layer->nb_planes = 2;
-+            layer->planes[1].object_index = 0;
-+            layer->planes[1].offset = height * 128;
-+            layer->planes[0].pitch = width;
-+            layer->planes[1].pitch = width;
-+            break;
- 
--    avbuf->drm_frame.layers[0].format = fmt;
-+        case DRM_FORMAT_NV12:
-+            layer->format = V4L2_PIX_FMT_NV12;
- 
-+            if (avbuf->num_planes > 1)
-+                break;
-+
-+            layer->nb_planes = 2;
-+            layer->planes[1].object_index = 0;
-+            layer->planes[1].offset = bpl0 * height;
-+            layer->planes[1].pitch = bpl0;
-+            break;
-+
-+        case V4L2_PIX_FMT_YUV420:
-+            layer->format = DRM_FORMAT_YUV420;
-+
-+            if (avbuf->num_planes > 1)
-+                break;
-+
-+            layer->nb_planes = 3;
-+            layer->planes[1].object_index = 0;
-+            layer->planes[1].offset = bpl0 * height;
-+            layer->planes[1].pitch = bpl0 / 2;
-+            layer->planes[2].object_index = 0;
-+            layer->planes[2].offset = layer->planes[1].offset + ((bpl0 * height) / 4);
-+            layer->planes[2].pitch = bpl0 / 2;
-+            break;
-+
-+        default:
-+            drm_desc->nb_layers = 0;
-+            return AVERROR(EINVAL);
-+    }
-+
-+    drm_desc->nb_objects = 0;
-     for (i = 0; i < avbuf->num_planes; i++) {
-         memset(&expbuf, 0, sizeof(expbuf));
- 
-@@ -565,19 +1121,11 @@ static int v4l2_buffer_export_drm(V4L2Buffer* avbuf, const uint32_t pixelformat)
-         if (ret < 0)
-             return AVERROR(errno);
- 
--        avbuf->fd = expbuf.fd;
--
--        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type)) {
--            /* drm frame */
--            avbuf->drm_frame.objects[i].size = avbuf->buffer.m.planes[i].length;
--            avbuf->drm_frame.objects[i].fd = expbuf.fd;
--            avbuf->drm_frame.objects[i].format_modifier = mod;
--        } else {
--            /* drm frame */
--            avbuf->drm_frame.objects[0].size = avbuf->buffer.length;
--            avbuf->drm_frame.objects[0].fd = expbuf.fd;
--            avbuf->drm_frame.objects[0].format_modifier = mod;
--        }
-+        drm_desc->objects[i].size = V4L2_TYPE_IS_MULTIPLANAR(avbuf->buffer.type) ?
-+            avbuf->buffer.m.planes[i].length : avbuf->buffer.length;
-+        drm_desc->objects[i].fd = expbuf.fd;
-+        drm_desc->objects[i].format_modifier = mod;
-+        drm_desc->nb_objects = i + 1;
-     }
- 
-     return 0;
-@@ -588,7 +1136,7 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-     struct v4l2_format *fmt = &queue->format;
-     DeintV4L2M2MContextShared *ctx = queue->ctx;
-     struct v4l2_requestbuffers req;
--    int ret, i, j, multiplanar;
-+    int ret, i, multiplanar;
-     uint32_t memory;
- 
-     memory = V4L2_TYPE_IS_OUTPUT(fmt->type) ?
-@@ -617,10 +1165,9 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-     }
- 
-     for (i = 0; i < queue->num_buffers; i++) {
--        V4L2Buffer *buf = &queue->buffers[i];
-+        V4L2Buffer * const buf = &queue->buffers[i];
- 
-         buf->enqueued = 0;
--        buf->fd = -1;
-         buf->q = queue;
- 
-         buf->buffer.type = fmt->type;
-@@ -632,6 +1179,12 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-             buf->buffer.m.planes = buf->planes;
-         }
- 
-+        drm_frame_init(&buf->drm_frame);
-+    }
-+
-+    for (i = 0; i < queue->num_buffers; i++) {
-+        V4L2Buffer * const buf = &queue->buffers[i];
-+
-         ret = ioctl(ctx->fd, VIDIOC_QUERYBUF, &buf->buffer);
-         if (ret < 0) {
-             ret = AVERROR(errno);
-@@ -639,29 +1192,14 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-             goto fail;
-         }
- 
--        if (multiplanar)
--            buf->num_planes = buf->buffer.length;
--        else
--            buf->num_planes = 1;
--
--        for (j = 0; j < buf->num_planes; j++) {
--            V4L2PlaneInfo *info = &buf->plane_info[j];
--
--            if (multiplanar) {
--                info->bytesperline = fmt->fmt.pix_mp.plane_fmt[j].bytesperline;
--                info->length = buf->buffer.m.planes[j].length;
--            } else {
--                info->bytesperline = fmt->fmt.pix.bytesperline;
--                info->length = buf->buffer.length;
--            }
--        }
-+        buf->num_planes = multiplanar ? buf->buffer.length : 1;
- 
-         if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) {
-             ret = deint_v4l2m2m_enqueue_buffer(buf);
-             if (ret)
-                 goto fail;
- 
--            ret = v4l2_buffer_export_drm(buf, multiplanar ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat);
-+            ret = v4l2_buffer_export_drm(queue, buf);
-             if (ret)
-                 goto fail;
-         }
-@@ -670,12 +1208,8 @@ static int deint_v4l2m2m_allocate_buffers(V4L2Queue *queue)
-     return 0;
- 
- fail:
--    for (i = 0; i < queue->num_buffers; i++)
--        if (queue->buffers[i].fd >= 0)
--            close(queue->buffers[i].fd);
--    av_free(queue->buffers);
--    queue->buffers = NULL;
--
-+    avbufs_delete(&queue->buffers, queue->num_buffers);
-+    queue->num_buffers = 0;
-     return ret;
- }
- 
-@@ -862,7 +1396,6 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
-     if (atomic_fetch_sub(&ctx->refcount, 1) == 1) {
-         V4L2Queue *capture = &ctx->capture;
-         V4L2Queue *output  = &ctx->output;
--        int i;
- 
-         av_log(NULL, AV_LOG_DEBUG, "%s - destroying context\n", __func__);
- 
-@@ -871,12 +1404,7 @@ static void deint_v4l2m2m_destroy_context(DeintV4L2M2MContextShared *ctx)
-             deint_v4l2m2m_streamoff(output);
-         }
- 
--        if (capture->buffers)
--            for (i = 0; i < capture->num_buffers; i++) {
--                capture->buffers[i].q = NULL;
--                if (capture->buffers[i].fd >= 0)
--                    close(capture->buffers[i].fd);
--            }
-+        avbufs_delete(&capture->buffers, capture->num_buffers);
- 
-         deint_v4l2m2m_unref_queued(output);
- 
-@@ -908,73 +1436,15 @@ static void v4l2_free_buffer(void *opaque, uint8_t *unused)
-     deint_v4l2m2m_destroy_context(ctx);
- }
- 
--static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf, int height)
--{
--    AVDRMFrameDescriptor *drm_desc = &avbuf->drm_frame;
--    AVDRMLayerDescriptor *layer;
--
--    /* fill the DRM frame descriptor */
--    drm_desc->nb_objects = avbuf->num_planes;
--    drm_desc->nb_layers = 1;
--
--    layer = &drm_desc->layers[0];
--    layer->nb_planes = avbuf->num_planes;
--
--    for (int i = 0; i < avbuf->num_planes; i++) {
--        layer->planes[i].object_index = i;
--        layer->planes[i].offset = 0;
--        layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
--    }
--
--    switch (layer->format) {
--    case DRM_FORMAT_YUYV:
--        layer->nb_planes = 1;
--        break;
--
--    case DRM_FORMAT_NV12:
--    case DRM_FORMAT_NV21:
--        if (avbuf->num_planes > 1)
--            break;
--
--        layer->nb_planes = 2;
--
--        layer->planes[1].object_index = 0;
--        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
--            height;
--        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline;
--        break;
--
--    case DRM_FORMAT_YUV420:
--        if (avbuf->num_planes > 1)
--            break;
--
--        layer->nb_planes = 3;
--
--        layer->planes[1].object_index = 0;
--        layer->planes[1].offset = avbuf->plane_info[0].bytesperline *
--            height;
--        layer->planes[1].pitch = avbuf->plane_info[0].bytesperline >> 1;
--
--        layer->planes[2].object_index = 0;
--        layer->planes[2].offset = layer->planes[1].offset +
--            ((avbuf->plane_info[0].bytesperline *
--              height) >> 2);
--        layer->planes[2].pitch = avbuf->plane_info[0].bytesperline >> 1;
--        break;
--
--    default:
--        drm_desc->nb_layers = 0;
--        break;
--    }
--
--    return (uint8_t *) drm_desc;
--}
--
- // timeout in ms
- static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int timeout)
- {
-     DeintV4L2M2MContextShared *ctx = queue->ctx;
-     V4L2Buffer* avbuf;
-+    enum AVColorPrimaries color_primaries;
-+    enum AVColorSpace colorspace;
-+    enum AVColorTransferCharacteristic color_trc;
-+    enum AVColorRange color_range;
- 
-     av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
- 
-@@ -985,8 +1455,6 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim
-     }
- 
-     // Fill in PTS and anciliary info from src frame
--    // we will want to overwrite some fields as only the pts/dts
--    // fields are updated with new timing in this fn
-     pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
- 
-     frame->buf[0] = av_buffer_create((uint8_t *) &avbuf->drm_frame,
-@@ -999,18 +1467,36 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim
- 
-     atomic_fetch_add(&ctx->refcount, 1);
- 
--    frame->data[0] = (uint8_t *)v4l2_get_drm_frame(avbuf, ctx->orig_height);
-+    frame->data[0] = (uint8_t *)&avbuf->drm_frame;
-     frame->format = AV_PIX_FMT_DRM_PRIME;
-     if (ctx->hw_frames_ctx)
-         frame->hw_frames_ctx = av_buffer_ref(ctx->hw_frames_ctx);
--    frame->height = ctx->height;
--    frame->width = ctx->width;
--
--    // Not interlaced now
--    frame->interlaced_frame = 0;
--    frame->top_field_first = 0;
--    // Pkt duration halved
--    frame->pkt_duration /= 2;
-+    frame->height = ctx->output_height;
-+    frame->width = ctx->output_width;
-+
-+    color_primaries = get_color_primaries(&ctx->capture.format);
-+    colorspace      = get_color_space(&ctx->capture.format);
-+    color_trc       = get_color_trc(&ctx->capture.format);
-+    color_range     = get_color_range(&ctx->capture.format);
-+
-+    // If the color parameters are unspecified by V4L2 then leave alone as they
-+    // will have been copied from src
-+    if (color_primaries != AVCOL_PRI_UNSPECIFIED)
-+        frame->color_primaries = color_primaries;
-+    if (colorspace != AVCOL_SPC_UNSPECIFIED)
-+        frame->colorspace = colorspace;
-+    if (color_trc != AVCOL_TRC_UNSPECIFIED)
-+        frame->color_trc = color_trc;
-+    if (color_range != AVCOL_RANGE_UNSPECIFIED)
-+        frame->color_range = color_range;
-+
-+    if (ctx->filter_type == FILTER_V4L2_DEINTERLACE) {
-+        // Not interlaced now
-+        frame->interlaced_frame = 0;   // *** Fill in from dst buffer?
-+        frame->top_field_first = 0;
-+        // Pkt duration halved
-+        frame->pkt_duration /= 2;
-+    }
- 
-     if (avbuf->buffer.flags & V4L2_BUF_FLAG_ERROR) {
-         av_log(ctx->logctx, AV_LOG_ERROR, "driver decode error\n");
-@@ -1032,15 +1518,34 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
-     ctx->height = avctx->inputs[0]->h;
-     ctx->width = avctx->inputs[0]->w;
- 
--    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d\n", __func__, ctx->width, ctx->height);
-+    if (ctx->filter_type == FILTER_V4L2_SCALE) {
-+        if ((ret = ff_scale_eval_dimensions(priv,
-+                                            priv->w_expr, priv->h_expr,
-+                                            inlink, outlink,
-+                                            &ctx->output_width, &ctx->output_height)) < 0)
-+            return ret;
-+
-+        ff_scale_adjust_dimensions(inlink, &ctx->output_width, &ctx->output_height,
-+                                   priv->force_original_aspect_ratio, priv->force_divisible_by);
-+    }
-+    else {
-+        ctx->output_width  = ctx->width;
-+        ctx->output_height = ctx->height;
-+    }
-+
-+    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height);
- 
-     outlink->time_base           = inlink->time_base;
--    outlink->w                   = inlink->w;
--    outlink->h                   = inlink->h;
--    outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
-+    outlink->w                   = ctx->output_width;
-+    outlink->h                   = ctx->output_height;
-     outlink->format              = inlink->format;
-     outlink->frame_rate = (AVRational) {1, 0};  // Deny knowledge of frame rate
- 
-+    if (inlink->sample_aspect_ratio.num)
-+        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
-+    else
-+        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
-+
-     ret = deint_v4l2m2m_find_device(ctx);
-     if (ret)
-         return ret;
-@@ -1055,18 +1560,19 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
- 
- static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
- {
--    const int is_linear = (drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_LINEAR ||
--            drm_desc->objects[0].format_modifier == DRM_FORMAT_MOD_INVALID);
-+    const uint64_t mod = drm_desc->objects[0].format_modifier;
-+    const int is_linear = (mod == DRM_FORMAT_MOD_LINEAR || mod == DRM_FORMAT_MOD_INVALID);
-+
-+    // Only currently support single object things
-+    if (drm_desc->nb_objects != 1)
-+        return 0;
- 
-     switch (drm_desc->layers[0].format) {
-     case DRM_FORMAT_YUV420:
--        if (is_linear)
--            return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_YUV420 : 0;
--        break;
-+        return is_linear ? V4L2_PIX_FMT_YUV420 : 0;
-     case DRM_FORMAT_NV12:
--        if (is_linear)
--            return drm_desc->nb_objects == 1 ? V4L2_PIX_FMT_NV12 : 0;
--        break;
-+        return is_linear ? V4L2_PIX_FMT_NV12 :
-+            fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0;
-     default:
-         break;
-     }
-@@ -1089,7 +1595,7 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
- 
-     if (ctx->field_order == V4L2_FIELD_ANY) {
-         const AVDRMFrameDescriptor * const drm_desc = (AVDRMFrameDescriptor *)in->data[0];
--        const uint32_t pixelformat = desc_pixelformat(drm_desc);
-+        uint32_t pixelformat = desc_pixelformat(drm_desc);
- 
-         if (pixelformat == 0) {
-             av_log(avctx, AV_LOG_ERROR, "Unsupported DRM format %s in %d objects, modifier %#" PRIx64 "\n",
-@@ -1104,29 +1610,49 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-         av_log(priv, AV_LOG_DEBUG, "%s: %dx%d (%td,%td)\n", __func__, ctx->width, ctx->height,
-            drm_desc->layers[0].planes[0].pitch, drm_desc->layers[0].planes[1].offset);
- 
--        ret = deint_v4l2m2m_set_format(output, pixelformat, ctx->field_order, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
--        if (ret)
-+        if ((ret = set_src_fmt(output, in)) != 0) {
-+            av_log(avctx, AV_LOG_WARNING, "Unknown input DRM format: %s mod: %#" PRIx64 "\n",
-+                   av_fourcc2str(drm_desc->layers[0].format), drm_desc->objects[0].format_modifier);
-+            return ret;
-+        }
-+
-+        ret = do_s_fmt(output);
-+        if (ret) {
-+            av_log(avctx, AV_LOG_WARNING, "Failed to set source format\n");
-             return ret;
-+        }
- 
--        ret = deint_v4l2m2m_set_format(capture, pixelformat, V4L2_FIELD_NONE, ctx->width, ctx->height, ctx->orig_width, drm_desc->layers[0].planes[1].offset);
--        if (ret)
-+        if (ctx->output_format != AV_PIX_FMT_NONE)
-+           pixelformat = fmt_av_to_v4l2(ctx->output_format);
-+        ret = set_dst_format(priv, capture, pixelformat, V4L2_FIELD_NONE, ctx->output_width, ctx->output_height);
-+        if (ret) {
-+            av_log(avctx, AV_LOG_WARNING, "Failed to set destination format\n");
-             return ret;
-+        }
- 
-         ret = deint_v4l2m2m_allocate_buffers(capture);
--        if (ret)
-+        if (ret) {
-+            av_log(avctx, AV_LOG_WARNING, "Failed to allocate destination buffers\n");
-             return ret;
-+        }
- 
-         ret = deint_v4l2m2m_streamon(capture);
--        if (ret)
-+        if (ret) {
-+            av_log(avctx, AV_LOG_WARNING, "Failed set destination streamon: %s\n", av_err2str(ret));
-             return ret;
-+        }
- 
-         ret = deint_v4l2m2m_allocate_buffers(output);
--        if (ret)
-+        if (ret) {
-+            av_log(avctx, AV_LOG_WARNING, "Failed to allocate src buffers\n");
-             return ret;
-+        }
- 
-         ret = deint_v4l2m2m_streamon(output);
--        if (ret)
-+        if (ret) {
-+            av_log(avctx, AV_LOG_WARNING, "Failed set src streamon: %s\n", av_err2str(ret));
-             return ret;
-+        }
- 
-         if (in->top_field_first)
-             ctx->field_order = V4L2_FIELD_INTERLACED_TB;
-@@ -1251,7 +1777,7 @@ again:
-     return did_something ? 0 : FFERROR_NOT_READY;
- }
- 
--static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
-+static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filter_type_v4l2_t filter_type)
- {
-     DeintV4L2M2MContext * const priv = avctx->priv;
-     DeintV4L2M2MContextShared * const ctx = av_mallocz(sizeof(DeintV4L2M2MContextShared));
-@@ -1262,6 +1788,7 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
-     }
-     priv->shared = ctx;
-     ctx->logctx = priv;
-+    ctx->filter_type = filter_type;
-     ctx->fd = -1;
-     ctx->output.ctx = ctx;
-     ctx->output.num_buffers = 8;
-@@ -1274,9 +1801,52 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
- 
-     atomic_init(&ctx->refcount, 1);
- 
-+    if (priv->output_format_string) {
-+        ctx->output_format = av_get_pix_fmt(priv->output_format_string);
-+        if (ctx->output_format == AV_PIX_FMT_NONE) {
-+            av_log(avctx, AV_LOG_ERROR, "Invalid ffmpeg output format '%s'.\n", priv->output_format_string);
-+            return AVERROR(EINVAL);
-+        }
-+        if (fmt_av_to_v4l2(ctx->output_format) == 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Unsupported output format for V4L2: %s.\n", av_get_pix_fmt_name(ctx->output_format));
-+            return AVERROR(EINVAL);
-+        }
-+    } else {
-+        // Use the input format once that is configured.
-+        ctx->output_format = AV_PIX_FMT_NONE;
-+    }
-+
-+#define STRING_OPTION(var_name, func_name, default_value) do { \
-+        if (priv->var_name ## _string) { \
-+            int var = av_ ## func_name ## _from_name(priv->var_name ## _string); \
-+            if (var < 0) { \
-+                av_log(avctx, AV_LOG_ERROR, "Invalid %s.\n", #var_name); \
-+                return AVERROR(EINVAL); \
-+            } \
-+            priv->var_name = var; \
-+        } else { \
-+            priv->var_name = default_value; \
-+        } \
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define IDCTDSP_TEST(func) { #func, offsetof(IDCTDSPContext, func) },
++
++typedef struct {
++    const char *name;
++    size_t offset;
++} test;
++
++#define RANDOMIZE_BUFFER16(name, size)          \
++    do {                                        \
++        int i;                                  \
++        for (i = 0; i < size; ++i) {            \
++            uint16_t r = rnd() % 0x201 - 0x100; \
++            AV_WN16A(name##0 + i, r);           \
++            AV_WN16A(name##1 + i, r);           \
++        }                                       \
 +    } while (0)
 +
-+    STRING_OPTION(colour_primaries, color_primaries, AVCOL_PRI_UNSPECIFIED);
-+    STRING_OPTION(colour_transfer,  color_transfer,  AVCOL_TRC_UNSPECIFIED);
-+    STRING_OPTION(colour_matrix,    color_space,     AVCOL_SPC_UNSPECIFIED);
-+    STRING_OPTION(chroma_location,  chroma_location, AVCHROMA_LOC_UNSPECIFIED);
++#define RANDOMIZE_BUFFER8(name, size)         \
++    do {                                      \
++        int i;                                \
++        for (i = 0; i < size; ++i) {          \
++            uint8_t r = rnd();                \
++            name##0[i] = r;                   \
++            name##1[i] = r;                   \
++        }                                     \
++    } while (0)
 +
-     return 0;
- }
- 
-+static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
++static void check_add_put_clamped(void)
 +{
-+    return common_v4l2m2m_init(avctx, FILTER_V4L2_DEINTERLACE);
-+}
-+
-+static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx)
-+{
-+    return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE);
-+}
-+
- static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
- {
-     DeintV4L2M2MContext *priv = avctx->priv;
-@@ -1294,6 +1864,51 @@ static const AVOption deinterlace_v4l2m2m_options[] = {
- 
- AVFILTER_DEFINE_CLASS(deinterlace_v4l2m2m);
- 
-+#define OFFSET(x) offsetof(DeintV4L2M2MContext, x)
-+#define FLAGS (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM)
-+
-+static const AVOption scale_v4l2m2m_options[] = {
-+    { "w", "Output video width",
-+      OFFSET(w_expr), AV_OPT_TYPE_STRING, {.str = "iw"}, .flags = FLAGS },
-+    { "h", "Output video height",
-+      OFFSET(h_expr), AV_OPT_TYPE_STRING, {.str = "ih"}, .flags = FLAGS },
-+    { "format", "Output video format (software format of hardware frames)",
-+      OFFSET(output_format_string), AV_OPT_TYPE_STRING, .flags = FLAGS },
-+      // These colour properties match the ones of the same name in vf_scale.
-+      { "out_color_matrix", "Output colour matrix coefficient set",
-+      OFFSET(colour_matrix_string), AV_OPT_TYPE_STRING, { .str = NULL }, .flags = FLAGS },
-+    { "out_range", "Output colour range",
-+      OFFSET(colour_range), AV_OPT_TYPE_INT, { .i64 = AVCOL_RANGE_UNSPECIFIED },
-+      AVCOL_RANGE_UNSPECIFIED, AVCOL_RANGE_JPEG, FLAGS, "range" },
-+        { "full",    "Full range",
-+          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
-+        { "limited", "Limited range",
-+          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
-+        { "jpeg",    "Full range",
-+          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
-+        { "mpeg",    "Limited range",
-+          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
-+        { "tv",      "Limited range",
-+          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_MPEG }, 0, 0, FLAGS, "range" },
-+        { "pc",      "Full range",
-+          0, AV_OPT_TYPE_CONST, { .i64 = AVCOL_RANGE_JPEG }, 0, 0, FLAGS, "range" },
-+    // These colour properties match the ones in the VAAPI scaler
-+    { "out_color_primaries", "Output colour primaries",
-+      OFFSET(colour_primaries_string), AV_OPT_TYPE_STRING,
-+      { .str = NULL }, .flags = FLAGS },
-+    { "out_color_transfer", "Output colour transfer characteristics",
-+      OFFSET(colour_transfer_string),  AV_OPT_TYPE_STRING,
-+      { .str = NULL }, .flags = FLAGS },
-+    { "out_chroma_location", "Output chroma sample location",
-+      OFFSET(chroma_location_string),  AV_OPT_TYPE_STRING,
-+      { .str = NULL }, .flags = FLAGS },
-+    { "force_original_aspect_ratio", "decrease or increase w/h if necessary to keep the original AR", OFFSET(force_original_aspect_ratio), AV_OPT_TYPE_INT, { .i64 = 0}, 0, 2, FLAGS, "force_oar" },
-+    { "force_divisible_by", "enforce that the output resolution is divisible by a defined integer when force_original_aspect_ratio is used", OFFSET(force_divisible_by), AV_OPT_TYPE_INT, { .i64 = 1}, 1, 256, FLAGS },
-+    { NULL },
-+};
-+
-+AVFILTER_DEFINE_CLASS(scale_v4l2m2m);
-+
- static const AVFilterPad deint_v4l2m2m_inputs[] = {
-     {
-         .name         = "default",
-@@ -1321,3 +1936,17 @@ AVFilter ff_vf_deinterlace_v4l2m2m = {
-     .priv_class     = &deinterlace_v4l2m2m_class,
-     .activate       = deint_v4l2m2m_activate,
- };
-+
-+AVFilter ff_vf_scale_v4l2m2m = {
-+    .name           = "scale_v4l2m2m",
-+    .description    = NULL_IF_CONFIG_SMALL("V4L2 M2M scaler"),
-+    .priv_size      = sizeof(DeintV4L2M2MContext),
-+    .init           = &scale_v4l2m2m_init,
-+    .uninit         = &deint_v4l2m2m_uninit,
-+    FILTER_INPUTS(deint_v4l2m2m_inputs),
-+    FILTER_OUTPUTS(deint_v4l2m2m_outputs),
-+    FILTER_SINGLE_SAMPLEFMT(AV_PIX_FMT_DRM_PRIME),
-+    .priv_class     = &scale_v4l2m2m_class,
-+    .activate       = deint_v4l2m2m_activate,
-+};
-+
-
-From 7e7147d50bc6e3f13834525dba3a47d170422f07 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 22 Sep 2022 14:54:46 +0000
-Subject: [PATCH 071/136] v4l2_m2m: Adjust buffer allocation based on min/max
- controls
-
-Clip requested buffer count to min/max declared by driver.
-If 0 buffers requested then set to min+2.
-This allows encode to keep its src buffer count down to a plausible
-minimum which helps with flow control.
----
- libavcodec/v4l2_context.c | 19 +++++++++++++++++++
- 1 file changed, 19 insertions(+)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 6b97eab41e..ba36689ff3 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -1187,6 +1187,7 @@ fail_release:
- 
- int ff_v4l2_context_init(V4L2Context* ctx)
- {
-+    struct v4l2_queryctrl qctrl;
-     V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
-     int ret;
- 
-@@ -1228,6 +1229,24 @@ int ff_v4l2_context_init(V4L2Context* ctx)
-         goto fail_unref_hwframes;
-     }
- 
-+    memset(&qctrl, 0, sizeof(qctrl));
-+    qctrl.id = V4L2_CID_MIN_BUFFERS_FOR_OUTPUT;
-+    if (ioctl(s->fd, VIDIOC_QUERYCTRL, &qctrl) != 0) {
-+        ret = AVERROR(errno);
-+        if (ret != AVERROR(EINVAL)) {
-+            av_log(logger(ctx), AV_LOG_ERROR, "%s VIDIOC_QUERCTRL failed: %s\n", ctx->name, av_err2str(ret));
-+            goto fail_unref_hwframes;
-+        }
-+        // Control unsupported - set default if wanted
-+        if (ctx->num_buffers < 2)
-+            ctx->num_buffers = 4;
-+    }
-+    else {
-+        if (ctx->num_buffers < 2)
-+            ctx->num_buffers = qctrl.minimum + 2;
-+        ctx->num_buffers = av_clip(ctx->num_buffers, qctrl.minimum, qctrl.maximum);
-+    }
-+
-     ret = create_buffers(ctx, ctx->num_buffers, ctx->buf_mem);
-     if (ret < 0)
-         goto fail_unref_hwframes;
-
-From b69a2707a192ac509174899233a094373a3f5dc9 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 22 Sep 2022 15:00:12 +0000
-Subject: [PATCH 072/136] v4l2_m2m_dec: If src Q is full then wait indefinitely
- for buffer
-
-If it is not possible to add another buffer to the src Q then alawys
-wait indefinitely for either an output frame or the Q to have space.
-
-This has issues if the reason that the Q is stalled is due to dst buffer
-exhaustion and buffers cannot be returned async by another thread but
-the current scheme confuses ffmpegs pipeline scheduling.
----
- libavcodec/v4l2_m2m_dec.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 485a96f4b4..bb183097f6 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -456,9 +456,9 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-         if (dst_rv != 0 && TRY_DQ(src_rv)) {
-             // Pick a timeout depending on state
-             const int t =
-+                src_rv == NQ_Q_FULL ? -1 :
-                 src_rv == NQ_DRAINING ? 300 :
--                prefer_dq ? 5 :
--                src_rv == NQ_Q_FULL ? -1 : 0;
-+                prefer_dq ? 5 : 0;
- 
-             // Dequeue frame will unref any previous contents of frame
-             // if it returns success so we don't need an explicit unref
-
-From b1d37be81bbf683a0eb16923c9b9f045fd0ea0c0 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 22 Sep 2022 15:12:27 +0000
-Subject: [PATCH 073/136] vf_deinterlace_v4l2m2m: Add Q name to structure for
- debug
-
----
- libavfilter/vf_deinterlace_v4l2m2m.c | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index 2df39ec0f1..4edecc02bf 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -84,6 +84,7 @@ typedef struct V4L2Queue {
-     struct v4l2_selection sel;
-     int num_buffers;
-     V4L2Buffer *buffers;
-+    const char * name;
-     DeintV4L2M2MContextShared *ctx;
- } V4L2Queue;
- 
-@@ -1792,8 +1793,10 @@ static av_cold int common_v4l2m2m_init(AVFilterContext * const avctx, const filt
-     ctx->fd = -1;
-     ctx->output.ctx = ctx;
-     ctx->output.num_buffers = 8;
-+    ctx->output.name = "OUTPUT";
-     ctx->capture.ctx = ctx;
-     ctx->capture.num_buffers = 12;
-+    ctx->capture.name = "CAPTURE";
-     ctx->done = 0;
-     ctx->field_order = V4L2_FIELD_ANY;
- 
-
-From 794a5bfc3ec74fdc7664508a287a075708d5deef Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 22 Sep 2022 16:08:42 +0000
-Subject: [PATCH 074/136] v4l2_m2m_enc: Set src buffer count to min+2 by
- default
-
-Set output.num_buffers to 0 by default which will then be set to min+2
-by the allocation code. This fixes an issue where the deinterlacer had
-fewer dest buffer than the encoder has src buffers and so ran dry
-creating deadlock in the ffmpeg filter chain.
----
- libavcodec/v4l2_m2m_enc.c | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index 099ad23928..b8ba815c37 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -672,9 +672,10 @@ static av_cold int v4l2_encode_close(AVCodecContext *avctx)
- #define FLAGS AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
- 
- #define V4L_M2M_CAPTURE_OPTS \
--    V4L_M2M_DEFAULT_OPTS,\
-+    { "num_output_buffers", "Number of buffers in the output context",\
-+        OFFSET(num_output_buffers), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, FLAGS },\
-     { "num_capture_buffers", "Number of buffers in the capture context", \
--        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 4 }, 4, INT_MAX, FLAGS }
-+        OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 8 }, 8, INT_MAX, FLAGS }
- 
- static const AVOption mpeg4_options[] = {
-     V4L_M2M_CAPTURE_OPTS,
-
-From 85c42743046a05b347f33b1933e6d52ea1d17e00 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 22 Sep 2022 16:13:57 +0000
-Subject: [PATCH 075/136] vf_deinterlace_m2m: For deinterlace set outlink FR to
- twice inlink
-
-We used to set the outlink framerate to unknown but it turns out that
-ffmpegs filter pipeline copes with that badly. Otherwise leave at 0,0
-which will copy FR from inlink to outlink.
----
- libavfilter/vf_deinterlace_v4l2m2m.c | 7 +++++--
- 1 file changed, 5 insertions(+), 2 deletions(-)
-
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index 4edecc02bf..c52dae1c44 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -1534,13 +1534,16 @@ static int deint_v4l2m2m_config_props(AVFilterLink *outlink)
-         ctx->output_height = ctx->height;
-     }
- 
--    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d\n", __func__, ctx->width, ctx->height, ctx->output_width, ctx->output_height);
-+    av_log(priv, AV_LOG_DEBUG, "%s: %dx%d->%dx%d FR: %d/%d->%d/%d\n", __func__,
-+           ctx->width, ctx->height, ctx->output_width, ctx->output_height,
-+           inlink->frame_rate.num, inlink->frame_rate.den, outlink->frame_rate.num, outlink->frame_rate.den);
- 
-     outlink->time_base           = inlink->time_base;
-     outlink->w                   = ctx->output_width;
-     outlink->h                   = ctx->output_height;
-     outlink->format              = inlink->format;
--    outlink->frame_rate = (AVRational) {1, 0};  // Deny knowledge of frame rate
-+    if (ctx->filter_type == FILTER_V4L2_DEINTERLACE && inlink->frame_rate.den != 0)
-+        outlink->frame_rate = (AVRational){inlink->frame_rate.num * 2, inlink->frame_rate.den};
- 
-     if (inlink->sample_aspect_ratio.num)
-         outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h * inlink->w, outlink->w * inlink->h}, inlink->sample_aspect_ratio);
-
-From 34a24bc0b0d427c75659d3907cb75afb6a9dc255 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 23 Sep 2022 11:30:56 +0000
-Subject: [PATCH 076/136] v4l2m2m: Add ff_v4l2_dq_all to drain all buffers from
- a Q
-
-Useful for where (encode) we might have drmprime buffers that we want to
-return to the source ASAP.
----
- libavcodec/v4l2_context.c | 17 +++++++++++------
- libavcodec/v4l2_context.h |  2 ++
- 2 files changed, 13 insertions(+), 6 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index ba36689ff3..4a359bf45e 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -707,17 +707,22 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf)
-     return avbuf;
- }
- 
-+void
-+ff_v4l2_dq_all(V4L2Context *const ctx)
-+{
-+    V4L2Buffer * avbuf;
-+    do {
-+        get_qbuf(ctx, &avbuf, 0);
-+    } while (avbuf);
-+}
-+
- static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
- {
-     int i;
- 
-     /* get back as many output buffers as possible */
--    if (V4L2_TYPE_IS_OUTPUT(ctx->type)) {
--        V4L2Buffer * avbuf;
--        do {
--            get_qbuf(ctx, &avbuf, 0);
--        } while (avbuf);
--    }
-+    if (V4L2_TYPE_IS_OUTPUT(ctx->type))
-+        ff_v4l2_dq_all(ctx);
- 
-     for (i = 0; i < ctx->num_buffers; i++) {
-         V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 21265f1bd7..523c53e97d 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -218,4 +218,6 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const
-  */
- int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
- 
-+void ff_v4l2_dq_all(V4L2Context *const ctx);
-+
- #endif // AVCODEC_V4L2_CONTEXT_H
-
-From 95dfc168c74f7b0f282c1b2ad9deb8fba10a7ce5 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 23 Sep 2022 11:38:36 +0000
-Subject: [PATCH 077/136] v4l2_m2m_enc: DQ output more frequently
-
-Ensure that we DQ any released src buffers on every op to avoid deadlock
-with source.
-
-There is a plausible argument that this patch is inelegant and the drain
-should be integrated into dq_buf, but that is a further reaching delta.
----
- libavcodec/v4l2_m2m_enc.c | 12 ++++++++++--
- 1 file changed, 10 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index b8ba815c37..a992a3cccc 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -421,6 +421,8 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     V4L2Context *const output = &s->output;
- 
-+    ff_v4l2_dq_all(output);
-+
-     // Signal EOF if needed
-     if (!frame) {
-         return ff_v4l2_context_enqueue_frame(output, frame);
-@@ -492,6 +494,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-     AVFrame *frame = s->frame;
-     int ret;
- 
-+    ff_v4l2_dq_all(output);
-+
-     if (s->draining)
-         goto dequeue;
- 
-@@ -528,7 +532,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-     }
- 
- dequeue:
--    if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
-+    ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
-+    ff_v4l2_dq_all(output);
-+    if (ret)
-         return ret;
- 
-     if (capture->first_buf == 1) {
-@@ -560,7 +566,9 @@ dequeue:
-             s->extdata_size = len;
-         }
- 
--        if ((ret = ff_v4l2_context_dequeue_packet(capture, avpkt)) != 0)
-+        ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
-+        ff_v4l2_dq_all(output);
-+        if (ret)
-             return ret;
-     }
- 
-
-From a40b1c38b0615fce0c0d9eb97510ab9e77b3e1ac Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 26 Sep 2022 18:20:00 +0100
-Subject: [PATCH 078/136] conf_native: Remove --enable-rpi from all builds
-
----
- pi-util/conf_native.sh | 5 +++--
- 1 file changed, 3 insertions(+), 2 deletions(-)
-
-diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
-index 37cea71756..f22d531ca4 100755
---- a/pi-util/conf_native.sh
-+++ b/pi-util/conf_native.sh
-@@ -54,9 +54,9 @@ if [ $MMAL ]; then
-   RPI_LIBDIRS="-L$RPI_OPT_VC/lib"
-   RPI_DEFINES="$RPI_DEFINES -D__VCCOREVER__=0x4000000"
-   RPI_EXTRALIBS="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm -Wl,--end-group"
--  RPIOPTS="--enable-mmal --enable-rpi"
-+  RPIOPTS="--enable-mmal"
- else
--  RPIOPTS="--disable-mmal --enable-sand"
-+  RPIOPTS="--disable-mmal"
- fi
- 
- C=`lsb_release -sc`
-@@ -89,6 +89,7 @@ $FFSRC/configure \
-  $MCOPTS\
-  --disable-stripping\
-  --disable-thumb\
-+ --enable-sand\
-  --enable-v4l2-request\
-  --enable-libdrm\
-  --enable-vout-egl\
-
-From 8fddfc8f1e3c95caded18705ed29be0ae95517bc Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 29 Sep 2022 19:48:08 +0000
-Subject: [PATCH 079/136] v4l2_m2m_dec: Deal correctly with avcC H264 data in
- extradata
-
-Decoders expect AnnexB style headers, mkv and similar formats have
-somewhat oddly wrapped extradata. Convert to annex-b style before use.
----
- libavcodec/v4l2_m2m.h     |   2 +-
- libavcodec/v4l2_m2m_dec.c | 177 ++++++++++++++++++++++++++++++++++++--
- 2 files changed, 169 insertions(+), 10 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index ee72beb052..babf101d65 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -118,7 +118,7 @@ typedef struct V4L2m2mContext {
-     /* Ext data sent */
-     int extdata_sent;
-     /* Ext data sent in packet - overrides ctx */
--    uint8_t * extdata_data;
-+    void * extdata_data;
-     size_t extdata_size;
- 
- #define FF_V4L2_QUIRK_REINIT_ALWAYS             1
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index bb183097f6..6bd9926b3f 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -46,6 +46,71 @@
- #define STATS_LAST_COUNT_MAX 64
- #define STATS_INTERVAL_MAX (1 << 30)
- 
-+#ifndef FF_API_BUFFER_SIZE_T
-+#define FF_API_BUFFER_SIZE_T 1
-+#endif
-+
-+#define DUMP_FAILED_EXTRADATA 0
-+
-+#if DUMP_FAILED_EXTRADATA
-+static inline char hex1(unsigned int x)
-+{
-+    x &= 0xf;
-+    return x <= 9 ? '0' + x : 'a' + x - 10;
-+}
-+
-+static inline char * hex2(char * s, unsigned int x)
-+{
-+    *s++ = hex1(x >> 4);
-+    *s++ = hex1(x);
-+    return s;
-+}
-+
-+static inline char * hex4(char * s, unsigned int x)
-+{
-+    s = hex2(s, x >> 8);
-+    s = hex2(s, x);
-+    return s;
-+}
-+
-+static inline char * dash2(char * s)
-+{
-+    *s++ = '-';
-+    *s++ = '-';
-+    return s;
-+}
-+
-+static void
-+data16(char * s, const unsigned int offset, const uint8_t * m, const size_t len)
-+{
-+    size_t i;
-+    s = hex4(s, offset);
-+    m += offset;
-+    for (i = 0; i != 8; ++i) {
-+        *s++ = ' ';
-+        s = len > i + offset ? hex2(s, *m++) : dash2(s);
-+    }
-+    *s++ = ' ';
-+    *s++ = ':';
-+    for (; i != 16; ++i) {
-+        *s++ = ' ';
-+        s = len > i + offset ? hex2(s, *m++) : dash2(s);
-+    }
-+    *s++ = 0;
-+}
-+
-+static void
-+log_dump(void * logctx, int lvl, const void * const data, const size_t len)
-+{
-+    size_t i;
-+    for (i = 0; i < len; i += 16) {
-+        char buf[80];
-+        data16(buf, i, data, len);
-+        av_log(logctx, lvl, "%s\n", buf);
-+    }
-+}
-+#endif
-+
- static int64_t pts_stats_guess(const pts_stats_t * const stats)
- {
-     if (stats->last_pts == AV_NOPTS_VALUE ||
-@@ -98,6 +163,98 @@ static void pts_stats_init(pts_stats_t * const stats, void * logctx, const char
-     };
- }
- 
-+// If abdata == NULL then this just counts space required
-+// Unpacks avcC if detected
-+static int
-+h264_xd_copy(const uint8_t * const extradata, const int extrasize, uint8_t * abdata)
-+{
-+    const uint8_t * const xdend = extradata + extrasize;
-+    const uint8_t * p = extradata;
-+    uint8_t * d = abdata;
-+    unsigned int n;
-+    unsigned int len;
-+    const unsigned int hdrlen = 4;
-+    unsigned int need_pps = 1;
-+
-+    if (extrasize < 8)
-+        return AVERROR(EINVAL);
-+
-+    if (p[0] == 0 && p[1] == 0) {
-+        // Assume a couple of leading zeros are good enough to indicate NAL
-+        if (abdata)
-+            memcpy(d, p, extrasize);
-+        return extrasize;
-+    }
-+
-+    // avcC starts with a 1
-+    if (p[0] != 1)
-+        return AVERROR(EINVAL);
-+
-+    p += 5;
-+    n = *p++ & 0x1f;
-+
-+doxps:
-+    while (n--) {
-+        if (xdend - p < 2)
-+            return AVERROR(EINVAL);
-+        len = (p[0] << 8) | p[1];
-+        p += 2;
-+        if (xdend - p < (ptrdiff_t)len)
-+            return AVERROR(EINVAL);
-+        if (abdata) {
-+            d[0] = 0;
-+            d[1] = 0;
-+            d[2] = 0;
-+            d[3] = 1;
-+            memcpy(d + 4, p, len);
-+        }
-+        d += len + hdrlen;
-+        p += len;
-+    }
-+    if (need_pps) {
-+        need_pps = 0;
-+        if (p >= xdend)
-+            return AVERROR(EINVAL);
-+        n = *p++;
-+        goto doxps;
-+    }
-+
-+    return d - abdata;
-+}
-+
-+static int
-+copy_extradata(AVCodecContext * const avctx,
-+               const void * const src_data, const int src_len,
-+               void ** const pdst_data, size_t * const pdst_len)
-+{
-+    int len;
-+
-+    *pdst_len = 0;
-+    av_freep(pdst_data);
-+
-+    if (avctx->codec_id == AV_CODEC_ID_H264)
-+        len = h264_xd_copy(src_data, src_len, NULL);
-+    else
-+        len = src_len < 0 ? AVERROR(EINVAL) : src_len;
-+
-+    // Zero length is OK but we swant to stop - -ve is error val
-+    if (len <= 0)
-+        return len;
-+
-+    if ((*pdst_data = av_malloc(len + AV_INPUT_BUFFER_PADDING_SIZE)) == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    if (avctx->codec_id == AV_CODEC_ID_H264)
-+        h264_xd_copy(src_data, src_len, *pdst_data);
-+    else
-+        memcpy(*pdst_data, src_data, len);
-+    *pdst_len = len;
-+
-+    return 0;
-+}
-+
-+
-+
- static int check_output_streamon(AVCodecContext *const avctx, V4L2m2mContext *const s)
- {
-     int ret;
-@@ -277,13 +434,8 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-             side_data = av_packet_get_side_data(&s->buf_pkt, AV_PKT_DATA_NEW_EXTRADATA, &side_size);
-             if (side_data) {
-                 av_log(avctx, AV_LOG_DEBUG, "New extradata\n");
--                av_freep(&s->extdata_data);
--                if ((s->extdata_data = av_malloc(side_size ? side_size : 1)) == NULL) {
--                    av_log(avctx, AV_LOG_ERROR, "Failed to alloc %zd bytes of extra data\n", side_size);
--                    return AVERROR(ENOMEM);
--                }
--                memcpy(s->extdata_data, side_data, side_size);
--                s->extdata_size = side_size;
-+                if ((ret = copy_extradata(avctx, side_data, (int)side_size, &s->extdata_data, &s->extdata_size)) < 0)
-+                    av_log(avctx, AV_LOG_WARNING, "Failed to copy new extra data: %s\n", av_err2str(ret));
-                 s->extdata_sent = 0;
-             }
- 
-@@ -359,8 +511,6 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
-         ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
-     else if (s->extdata_data)
-         ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
--    else
--        ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, avctx->extradata, avctx->extradata_size);
- 
-     if (ret == AVERROR(EAGAIN)) {
-         // Out of input buffers - keep packet
-@@ -770,6 +920,15 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-         return ret;
-     }
- 
-+    if (avctx->extradata &&
-+        (ret = copy_extradata(avctx, avctx->extradata, avctx->extradata_size, &s->extdata_data, &s->extdata_size)) != 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to copy extradata from context: %s\n", av_err2str(ret));
-+#if DUMP_FAILED_EXTRADATA
-+        log_dump(avctx, AV_LOG_INFO, avctx->extradata, avctx->extradata_size);
-+#endif
-+        return ret;
-+    }
-+
-     if ((ret = v4l2_prepare_decoder(s)) < 0)
-         return ret;
- 
-
-From 70227ebbc2999bc49075a3b683392d94618ecd89 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 30 Sep 2022 14:20:23 +0000
-Subject: [PATCH 080/136] v4l2_request_hevc: Fix up
- V4L2_CID_CODEC_STATELESS_BASE if missing
-
----
- libavcodec/hevc-ctrls-v4.h | 7 +++++++
- 1 file changed, 7 insertions(+)
-
-diff --git a/libavcodec/hevc-ctrls-v4.h b/libavcodec/hevc-ctrls-v4.h
-index 7829d82084..c02fdbe5a8 100644
---- a/libavcodec/hevc-ctrls-v4.h
-+++ b/libavcodec/hevc-ctrls-v4.h
-@@ -53,6 +53,13 @@
- #include <linux/const.h>
- #include <linux/types.h>
- 
-+#ifndef V4L2_CTRL_CLASS_CODEC_STATELESS
-+#define V4L2_CTRL_CLASS_CODEC_STATELESS 0x00a40000	/* Stateless codecs controls */
-+#endif
-+#ifndef V4L2_CID_CODEC_STATELESS_BASE
-+#define V4L2_CID_CODEC_STATELESS_BASE		(V4L2_CTRL_CLASS_CODEC_STATELESS | 0x900)
-+#endif
-+
- #define V4L2_PIX_FMT_HEVC_SLICE v4l2_fourcc('S', '2', '6', '5') /* HEVC parsed slices */
- 
- #define V4L2_CID_STATELESS_HEVC_SPS		(V4L2_CID_CODEC_STATELESS_BASE + 400)
-
-From 22d2000382839dbd04588af1bb20cc9d9b3a4362 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Sat, 1 Oct 2022 13:40:57 +0000
-Subject: [PATCH 081/136] vf_deinterlace_v4l2m2m: Fix compile on m/c without
- V4L2 SAND
-
----
- libavfilter/vf_deinterlace_v4l2m2m.c | 33 +++++++++++++++++++++++-----
- 1 file changed, 28 insertions(+), 5 deletions(-)
-
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index c52dae1c44..716789f988 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -35,6 +35,8 @@
- #include <sys/mman.h>
- #include <unistd.h>
- 
-+#include "config.h"
-+
- #include "libavutil/avassert.h"
- #include "libavutil/avstring.h"
- #include "libavutil/common.h"
-@@ -59,6 +61,16 @@
- #define DRM_FORMAT_P030 fourcc_code('P', '0', '3', '0') /* 2x2 subsampled Cr:Cb plane 10 bits per channel packed */
- #endif
- 
-+// V4L2_PIX_FMT_NV12_10_COL128 and V4L2_PIX_FMT_NV12_COL128 should be defined
-+// in drm_fourcc.h hopefully will be sometime in the future but until then...
-+#ifndef V4L2_PIX_FMT_NV12_10_COL128
-+#define V4L2_PIX_FMT_NV12_10_COL128 v4l2_fourcc('N', 'C', '3', '0')
-+#endif
-+
-+#ifndef V4L2_PIX_FMT_NV12_COL128
-+#define V4L2_PIX_FMT_NV12_COL128 v4l2_fourcc('N', 'C', '1', '2') /* 12  Y/CbCr 4:2:0 128 pixel wide column */
-+#endif
-+
- typedef struct V4L2Queue V4L2Queue;
- typedef struct DeintV4L2M2MContextShared DeintV4L2M2MContextShared;
- 
-@@ -176,9 +188,11 @@ fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
-         return V4L2_PIX_FMT_YUV420;
-     case AV_PIX_FMT_NV12:
-         return V4L2_PIX_FMT_NV12;
-+#if CONFIG_SAND
-     case AV_PIX_FMT_RPI4_8:
-     case AV_PIX_FMT_SAND128:
-         return V4L2_PIX_FMT_NV12_COL128;
-+#endif
-     default:
-         break;
-     }
-@@ -193,8 +207,10 @@ fmt_v4l2_to_av(const uint32_t pixfmt)
-         return AV_PIX_FMT_YUV420P;
-     case V4L2_PIX_FMT_NV12:
-         return AV_PIX_FMT_NV12;
-+#if CONFIG_SAND
-     case V4L2_PIX_FMT_NV12_COL128:
-         return AV_PIX_FMT_RPI4_8;
-+#endif
-     default:
-         break;
-     }
-@@ -823,6 +839,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
-                 h = src->layers[0].planes[1].offset / bpl;
-                 w = bpl;
-             }
-+#if CONFIG_SAND
-             else if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-                 if (src->layers[0].nb_planes != 2)
-                     break;
-@@ -831,9 +848,11 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
-                 h = src->layers[0].planes[1].offset / 128;
-                 bpl = fourcc_mod_broadcom_param(mod);
-             }
-+#endif
-             break;
- 
-         case DRM_FORMAT_P030:
-+#if CONFIG_SAND
-             if (fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128) {
-                 if (src->layers[0].nb_planes != 2)
-                     break;
-@@ -842,6 +861,7 @@ static int set_src_fmt(V4L2Queue * const q, const AVFrame * const frame)
-                 h = src->layers[0].planes[1].offset / 128;
-                 bpl = fourcc_mod_broadcom_param(mod);
-             }
-+#endif
-             break;
- 
-         default:
-@@ -1048,7 +1068,6 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
-     AVDRMLayerDescriptor * const layer = &drm_desc->layers[0];
-     const struct v4l2_format *const fmt = &q->format;
-     const uint32_t height = fmt_height(fmt);
--    const uint32_t width  = fmt_width(fmt);
-     ptrdiff_t bpl0;
- 
-     /* fill the DRM frame descriptor */
-@@ -1063,7 +1082,7 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
-     bpl0 = layer->planes[0].pitch;
- 
-     switch (fmt_pixelformat(fmt)) {
--
-+#if CONFIG_SAND
-         case V4L2_PIX_FMT_NV12_COL128:
-             mod = DRM_FORMAT_MOD_BROADCOM_SAND128_COL_HEIGHT(bpl0);
-             layer->format = V4L2_PIX_FMT_NV12;
-@@ -1074,9 +1093,10 @@ static int v4l2_buffer_export_drm(V4L2Queue * const q, V4L2Buffer * const avbuf)
-             layer->nb_planes = 2;
-             layer->planes[1].object_index = 0;
-             layer->planes[1].offset = height * 128;
--            layer->planes[0].pitch = width;
--            layer->planes[1].pitch = width;
-+            layer->planes[0].pitch = fmt_width(fmt);
-+            layer->planes[1].pitch = layer->planes[0].pitch;
-             break;
-+#endif
- 
-         case DRM_FORMAT_NV12:
-             layer->format = V4L2_PIX_FMT_NV12;
-@@ -1576,7 +1596,10 @@ static uint32_t desc_pixelformat(const AVDRMFrameDescriptor * const drm_desc)
-         return is_linear ? V4L2_PIX_FMT_YUV420 : 0;
-     case DRM_FORMAT_NV12:
-         return is_linear ? V4L2_PIX_FMT_NV12 :
--            fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 : 0;
-+#if CONFIG_SAND
-+            fourcc_mod_broadcom_mod(mod) == DRM_FORMAT_MOD_BROADCOM_SAND128 ? V4L2_PIX_FMT_NV12_COL128 :
-+#endif
-+            0;
-     default:
-         break;
-     }
-
-From f06f9ee41bf0f6f74240503f0cb427328cf6792f Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Sun, 2 Oct 2022 12:36:43 +0000
-Subject: [PATCH 082/136] configure: Fix v4l2_req_hevc_vx setup; set after deps
- fixups
-
----
- configure | 9 +++------
- 1 file changed, 3 insertions(+), 6 deletions(-)
-
-diff --git a/configure b/configure
-index 5c00a183e3..94c8161b91 100755
---- a/configure
-+++ b/configure
-@@ -6914,12 +6914,6 @@ fi
- check_func_headers "linux/media.h linux/videodev2.h" v4l2_timeval_to_ns
- check_cc hevc_v4l2_request linux/videodev2.h "int i = V4L2_PIX_FMT_HEVC_SLICE;"
- disable v4l2_req_hevc_vx
--if enabled hevc_v4l2request_hwaccel; then
--    enable v4l2_req_hevc_vx
--fi
--if enabled hevc_v4l2_request; then
--    disable v4l2_req_hevc_vx
--fi
- 
- check_headers sys/videoio.h
- test_code cc sys/videoio.h "struct v4l2_frmsizeenum vfse; vfse.discrete.width = 0;" && enable_sanitized struct_v4l2_frmivalenum_discrete
-@@ -7415,6 +7409,9 @@ check_deps $CONFIG_LIST       \
- 
- enabled threads && ! enabled pthreads && ! enabled atomics_native && die "non pthread threading without atomics not supported, try adding --enable-pthreads or --cpu=i486 or higher if you are on x86"
- 
-+# Sub-feature of hevc_v4l2request_hwaccel - can only be set once deps are done
-+enabled hevc_v4l2request_hwaccel && disabled hevc_v4l2_request && enable v4l2_req_hevc_vx
-+
- case $target_os in
- haiku)
-     disable memalign
-
-From 7d7709fb68561711f893269227147974fd6a46f3 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Sat, 1 Oct 2022 12:39:45 +0000
-Subject: [PATCH 083/136] vf_deinterlace_v4l2m2m: Ensure we get consistent
- final frames
-
-On getting EOS at the input of the filster do not simply drop everything
-in transit on the floor but attempt to retrieve everything possible from
-the capture Q before on-signalling EOS.
-If we know that we expect 1 frame in to always produce 1 frame out then
-match CAPTURE frame to the last OUTPUT frame Qed (scale)
-If frames out have an unknown relation to source frames (deinterlace) try
-an encode stop and wait for the last frame marker to emerge from CAPTURE
----
- libavfilter/vf_deinterlace_v4l2m2m.c | 172 +++++++++++++++++++++++----
- 1 file changed, 148 insertions(+), 24 deletions(-)
-
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index 716789f988..ce875c2c61 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -94,6 +94,7 @@ typedef struct V4L2Buffer {
- typedef struct V4L2Queue {
-     struct v4l2_format format;
-     struct v4l2_selection sel;
-+    int eos;
-     int num_buffers;
-     V4L2Buffer *buffers;
-     const char * name;
-@@ -127,20 +128,41 @@ typedef struct pts_track_s
-     pts_track_el_t a[PTS_TRACK_SIZE];
- } pts_track_t;
- 
-+typedef enum drain_state_e
-+{
-+    DRAIN_NONE = 0,     // Not draining
-+    DRAIN_TIMEOUT,      // Drain until normal timeout setup yields no frame
-+    DRAIN_LAST,         // Drain with long timeout last_frame in received on output expected
-+    DRAIN_EOS,          // Drain with long timeout EOS expected
-+    DRAIN_DONE          // Drained
-+} drain_state_t;
-+
- typedef struct DeintV4L2M2MContextShared {
-     void * logctx;  // For logging - will be NULL when done
-     filter_type_v4l2_t filter_type;
- 
-     int fd;
--    int done;
-+    int done;   // fd closed - awating all refs dropped
-     int width;
-     int height;
- 
-+    int drain;          // EOS received (inlink status)
-+    drain_state_t drain_state;
-+    int64_t drain_pts;  // PTS associated with inline status
-+
-+    unsigned int frames_rx;
-+    unsigned int frames_tx;
-+
-     // from options
-     int output_width;
-     int output_height;
-     enum AVPixelFormat output_format;
- 
-+    int has_enc_stop;
-+    // We expect to get exactly the same number of frames out as we put in
-+    // We can drain by matching input to output
-+    int one_to_one;
-+
-     int orig_width;
-     int orig_height;
-     atomic_uint refcount;
-@@ -179,6 +201,12 @@ typedef struct DeintV4L2M2MContext {
-     enum AVChromaLocation chroma_location;
- } DeintV4L2M2MContext;
- 
-+
-+static inline int drain_frame_expected(const drain_state_t d)
-+{
-+    return d == DRAIN_EOS || d == DRAIN_LAST;
-+}
-+
- // These just list the ones we know we can cope with
- static uint32_t
- fmt_av_to_v4l2(const enum AVPixelFormat avfmt)
-@@ -334,6 +362,13 @@ fail:
-     return 0;
- }
- 
-+// We are only ever expecting in-order frames so nothing more clever is required
-+static unsigned int
-+pts_track_count(const pts_track_t * const trk)
-+{
-+    return (trk->n - trk->last_n) & (PTS_TRACK_SIZE - 1);
-+}
-+
- static struct timeval pts_track_add_frame(pts_track_t * const trk, const AVFrame * const src)
- {
-     const uint32_t n = pts_track_next_n(trk);
-@@ -406,6 +441,12 @@ fmt_pixelformat(const struct v4l2_format * const fmt)
-     return V4L2_TYPE_IS_MULTIPLANAR(fmt->type) ? fmt->fmt.pix_mp.pixelformat : fmt->fmt.pix.pixelformat;
- }
- 
-+static inline uint32_t
-+buf_bytesused0(const struct v4l2_buffer * const buf)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(buf->type) ? buf->m.planes[0].bytesused : buf->bytesused;
-+}
-+
- static void
- init_format(V4L2Queue * const q, const uint32_t format_type)
- {
-@@ -1469,12 +1510,24 @@ static int deint_v4l2m2m_dequeue_frame(V4L2Queue *queue, AVFrame* frame, int tim
- 
-     av_log(ctx->logctx, AV_LOG_TRACE, "<<< %s\n", __func__);
- 
-+    if (queue->eos) {
-+        av_log(ctx->logctx, AV_LOG_TRACE, ">>> %s: EOS\n", __func__);
-+        return AVERROR_EOF;
-+    }
-+
-     avbuf = deint_v4l2m2m_dequeue_buffer(queue, timeout);
-     if (!avbuf) {
-         av_log(ctx->logctx, AV_LOG_DEBUG, "%s: No buffer to dequeue (timeout=%d)\n", __func__, timeout);
-         return AVERROR(EAGAIN);
-     }
- 
-+    if (V4L2_TYPE_IS_CAPTURE(avbuf->buffer.type)) {
-+        if ((avbuf->buffer.flags & V4L2_BUF_FLAG_LAST) != 0)
-+            queue->eos = 1;
-+        if (buf_bytesused0(&avbuf->buffer) == 0)
-+            return queue->eos ? AVERROR_EOF : AVERROR(EINVAL);
-+    }
-+
-     // Fill in PTS and anciliary info from src frame
-     pts_track_get_frame(&ctx->track, avbuf->buffer.timestamp, frame);
- 
-@@ -1686,6 +1739,20 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-         else
-             ctx->field_order = V4L2_FIELD_INTERLACED_BT;
- 
-+        {
-+            struct v4l2_encoder_cmd ecmd = {
-+                .cmd = V4L2_ENC_CMD_STOP
-+            };
-+            ctx->has_enc_stop = 0;
-+            if (ioctl(ctx->fd, VIDIOC_TRY_ENCODER_CMD, &ecmd) == 0) {
-+                av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop succeeded\n");
-+                ctx->has_enc_stop = 1;
-+            }
-+            else {
-+                av_log(ctx->logctx, AV_LOG_DEBUG, "Test encode stop fail: %s\n", av_err2str(AVERROR(errno)));
-+            }
-+
-+        }
-     }
- 
-     ret = deint_v4l2m2m_enqueue_frame(output, in);
-@@ -1694,6 +1761,41 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-     return ret;
- }
- 
-+static int
-+ack_inlink(AVFilterContext * const avctx, DeintV4L2M2MContextShared *const s,
-+           AVFilterLink * const inlink)
-+{
-+    int instatus;
-+    int64_t inpts;
-+
-+    if (ff_inlink_acknowledge_status(inlink, &instatus, &inpts) <= 0)
-+        return 0;
-+
-+    s->drain      = instatus;
-+    s->drain_pts  = inpts;
-+    s->drain_state = DRAIN_TIMEOUT;
-+
-+    if (s->field_order == V4L2_FIELD_ANY) {  // Not yet started
-+        s->drain_state = DRAIN_DONE;
-+    }
-+    else if (s->one_to_one) {
-+        s->drain_state = DRAIN_LAST;
-+    }
-+    else if (s->has_enc_stop) {
-+        struct v4l2_encoder_cmd ecmd = {
-+            .cmd = V4L2_ENC_CMD_STOP
-+        };
-+        if (ioctl(s->fd, VIDIOC_ENCODER_CMD, &ecmd) == 0) {
-+            av_log(avctx->priv, AV_LOG_DEBUG, "Do Encode stop\n");
-+            s->drain_state = DRAIN_EOS;
-+        }
-+        else {
-+            av_log(avctx->priv, AV_LOG_WARNING, "Encode stop fail: %s\n", av_err2str(AVERROR(errno)));
-+        }
-+    }
-+    return 1;
-+}
-+
- static int deint_v4l2m2m_activate(AVFilterContext *avctx)
- {
-     DeintV4L2M2MContext * const priv = avctx->priv;
-@@ -1702,15 +1804,13 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx)
-     AVFilterLink * const inlink = avctx->inputs[0];
-     int n = 0;
-     int cn = 99;
--    int instatus = 0;
--    int64_t inpts = 0;
-     int did_something = 0;
- 
-     av_log(priv, AV_LOG_TRACE, "<<< %s\n", __func__);
- 
-     FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, avctx);
- 
--    ff_inlink_acknowledge_status(inlink, &instatus, &inpts);
-+    ack_inlink(avctx, s, inlink);
- 
-     if (!ff_outlink_frame_wanted(outlink)) {
-         av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
-@@ -1720,7 +1820,6 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx)
-         AVFrame * frame = av_frame_alloc();
-         int rv;
- 
--again:
-         recycle_q(&s->output);
-         n = count_enqueued(&s->output);
- 
-@@ -1729,10 +1828,21 @@ again:
-             return AVERROR(ENOMEM);
-         }
- 
--        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame, n > 4 ? 300 : 0);
-+        rv = deint_v4l2m2m_dequeue_frame(&s->capture, frame,
-+                                         drain_frame_expected(s->drain_state) || n > 4 ? 300 : 0);
-         if (rv != 0) {
-             av_frame_free(&frame);
--            if (rv != AVERROR(EAGAIN)) {
-+            if (rv == AVERROR_EOF) {
-+                av_log(priv, AV_LOG_DEBUG, "%s: --- DQ EOF\n", __func__);
-+                s->drain_state = DRAIN_DONE;
-+            }
-+            else if (rv == AVERROR(EAGAIN)) {
-+                if (s->drain_state != DRAIN_NONE) {
-+                    av_log(priv, AV_LOG_DEBUG, "%s: --- DQ empty - drain done\n", __func__);
-+                    s->drain_state = DRAIN_DONE;
-+                }
-+            }
-+            else {
-                 av_log(priv, AV_LOG_ERROR, ">>> %s: DQ fail: %s\n", __func__, av_err2str(rv));
-                 return rv;
-             }
-@@ -1742,29 +1852,30 @@ again:
-             // frame is always consumed by filter_frame - even on error despite
-             // a somewhat confusing comment in the header
-             rv = ff_filter_frame(outlink, frame);
--
--            if (instatus != 0) {
--                av_log(priv, AV_LOG_TRACE, "%s: eof loop\n", __func__);
--                goto again;
--            }
-+            ++s->frames_tx;
- 
-             av_log(priv, AV_LOG_TRACE, "%s: Filtered: %s\n", __func__, av_err2str(rv));
-             did_something = 1;
-+
-+            if (s->drain_state != DRAIN_NONE && pts_track_count(&s->track) == 0) {
-+                av_log(priv, AV_LOG_DEBUG, "%s: --- DQ last - drain done\n", __func__);
-+                s->drain_state = DRAIN_DONE;
-+            }
-         }
- 
-         cn = count_enqueued(&s->capture);
-     }
- 
--    if (instatus != 0) {
--        ff_outlink_set_status(outlink, instatus, inpts);
--        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(instatus));
-+    if (s->drain_state == DRAIN_DONE) {
-+        ff_outlink_set_status(outlink, s->drain, s->drain_pts);
-+        av_log(priv, AV_LOG_TRACE, ">>> %s: Status done: %s\n", __func__, av_err2str(s->drain));
-         return 0;
-     }
- 
-     recycle_q(&s->output);
-     n = count_enqueued(&s->output);
- 
--    while (n < 6) {
-+    while (n < 6 && !s->drain) {
-         AVFrame * frame;
-         int rv;
- 
-@@ -1775,8 +1886,13 @@ again:
- 
-         if (frame == NULL) {
-             av_log(priv, AV_LOG_TRACE, "%s: No frame\n", __func__);
-+            if (!ack_inlink(avctx, s, inlink)) {
-+                ff_inlink_request_frame(inlink);
-+                av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
-+            }
-             break;
-         }
-+        ++s->frames_rx;
- 
-         rv = deint_v4l2m2m_filter_frame(inlink, frame);
-         av_frame_free(&frame);
-@@ -1785,16 +1901,11 @@ again:
-             return rv;
- 
-         av_log(priv, AV_LOG_TRACE, "%s: Q frame\n", __func__);
--        ++n;
--    }
--
--    if (n < 6) {
--        ff_inlink_request_frame(inlink);
-         did_something = 1;
--        av_log(priv, AV_LOG_TRACE, "%s: req frame\n", __func__);
-+        ++n;
-     }
- 
--    if (n > 4 && ff_outlink_frame_wanted(outlink)) {
-+    if ((n > 4 || s->drain) && ff_outlink_frame_wanted(outlink)) {
-         ff_filter_set_ready(avctx, 1);
-         did_something = 1;
-         av_log(priv, AV_LOG_TRACE, "%s: ready\n", __func__);
-@@ -1873,7 +1984,18 @@ static av_cold int deint_v4l2m2m_init(AVFilterContext *avctx)
- 
- static av_cold int scale_v4l2m2m_init(AVFilterContext *avctx)
- {
--    return common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE);
-+    int rv;
-+    DeintV4L2M2MContext * priv;
-+    DeintV4L2M2MContextShared * ctx;
-+
-+    if ((rv = common_v4l2m2m_init(avctx, FILTER_V4L2_SCALE)) != 0)
-+        return rv;
-+
-+    priv = avctx->priv;
-+    ctx = priv->shared;
-+
-+    ctx->one_to_one = 1;
-+    return 0;
- }
- 
- static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
-@@ -1881,6 +2003,8 @@ static void deint_v4l2m2m_uninit(AVFilterContext *avctx)
-     DeintV4L2M2MContext *priv = avctx->priv;
-     DeintV4L2M2MContextShared *ctx = priv->shared;
- 
-+    av_log(priv, AV_LOG_VERBOSE, "Frames Rx: %u, Frames Tx: %u\n",
-+           ctx->frames_rx, ctx->frames_tx);
-     ctx->done = 1;
-     ctx->logctx = NULL;  // Log to NULL works, log to missing crashes
-     pts_track_uninit(&ctx->track);
-
-From f893891df8f4e7738b2d9b49df4386fb160eb25f Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 5 Oct 2022 16:12:02 +0000
-Subject: [PATCH 084/136] v4l2_m2m_dec: Rework decode pending heuristic
-
-The old code measured the length of the entire Q in the decoder and
-attempted to dynamically guess an appropriate length. This was prone to
-failure when the guesswork became confused.
-The new code attempts to measure the Q length before insertion into decode
-which, after all, is what we actually care about. It does this by
-asserting that the decoder must have consumed all packets that came
-before the one associated with the most recent CAPTURE frame.  This
-avoids all need for reorder buffer size guesswork.
----
- libavcodec/v4l2_m2m.h     |  2 -
- libavcodec/v4l2_m2m_dec.c | 77 +++++++++++++++++----------------------
- 2 files changed, 34 insertions(+), 45 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index babf101d65..26a7161042 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -107,8 +107,6 @@ typedef struct V4L2m2mContext {
- 
-     /* Frame tracking */
-     xlat_track_t xlat;
--    int pending_hw;
--    int pending_n;
- 
-     pts_stats_t pts_stat;
- 
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 6bd9926b3f..bec9b22fcf 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -349,41 +349,54 @@ static void
- xlat_flush(xlat_track_t * const x)
- {
-     unsigned int i;
-+    // Do not reset track_no - this ensures that any frames left in the decoder
-+    // that turn up later get discarded.
-+
-+    x->last_pts = AV_NOPTS_VALUE;
-+    x->last_opaque = 0;
-     for (i = 0; i != FF_V4L2_M2M_TRACK_SIZE; ++i) {
-         x->track_els[i].pending = 0;
-         x->track_els[i].discard = 1;
-     }
--    x->last_pts = AV_NOPTS_VALUE;
-+}
-+
-+static void
-+xlat_init(xlat_track_t * const x)
-+{
-+    memset(x, 0, sizeof(*x));
-+    xlat_flush(x);
- }
- 
- static int
- xlat_pending(const xlat_track_t * const x)
- {
-     unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
--    unsigned int i;
--    int r = 0;
--    int64_t now = AV_NOPTS_VALUE;
-+    int i;
-+    const int64_t now = x->last_pts;
- 
--    for (i = 0; i < 32; ++i, n = (n - 1) % FF_V4L2_M2M_TRACK_SIZE) {
-+    for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
-         const V4L2m2mTrackEl * const t = x->track_els + n;
- 
-+        // Discard only set on never-set or flushed entries
-+        // So if we get here we've never successfully decoded a frame so allow
-+        // more frames into the buffer before stalling
-+        if (t->discard)
-+            return i - 16;
-+
-+        // If we've got this frame out then everything before this point
-+        // must have entered the decoder
-         if (!t->pending)
--            continue;
-+            break;
- 
-+        // If we've never seen a pts all we can do is count frames
-         if (now == AV_NOPTS_VALUE)
--            now = t->dts;
-+            continue;
- 
--        if (t->pts == AV_NOPTS_VALUE ||
--            ((now == AV_NOPTS_VALUE || t->pts <= now) &&
--             (x->last_pts == AV_NOPTS_VALUE || t->pts > x->last_pts)))
--            ++r;
-+        if (t->dts != AV_NOPTS_VALUE && now >= t->dts)
-+            break;
-     }
- 
--    // If we never get any ideas about PTS vs DTS allow a lot more buffer
--    if (now == AV_NOPTS_VALUE)
--        r -= 16;
--
--    return r;
-+    return i;
- }
- 
- static inline int stream_started(const V4L2m2mContext * const s) {
-@@ -557,18 +570,6 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
-     return rv;
- }
- 
--// Number of frames over what xlat_pending returns that we keep *16
--// This is a min value - if it appears to be too small the threshold should
--// adjust dynamically.
--#define PENDING_HW_MIN      (3 * 16)
--// Offset to use when setting dynamically
--// Set to %16 == 15 to avoid the threshold changing immediately as we relax
--#define PENDING_HW_OFFSET   (PENDING_HW_MIN - 1)
--// Number of consecutive times we've failed to get a frame when we prefer it
--// before we increase the prefer threshold (5ms * N = max expected decode
--// time)
--#define PENDING_N_THRESHOLD 6
--
- static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- {
-     V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-@@ -578,9 +579,11 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- 
-     do {
-         const int pending = xlat_pending(&s->xlat);
--        const int prefer_dq = (pending > s->pending_hw / 16);
-+        const int prefer_dq = (pending > 3);
-         const int last_src_rv = src_rv;
- 
-+        av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt);
-+
-         // Enqueue another pkt for decode if
-         // (a) We don't have a lot of stuff in the buffer already OR
-         // (b) ... we (think we) do but we've failed to get a frame already OR
-@@ -625,20 +628,8 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                 }
-             }
- 
--            // Adjust dynamic pending threshold
--            if (dst_rv == 0) {
--                if (--s->pending_hw < PENDING_HW_MIN)
--                    s->pending_hw = PENDING_HW_MIN;
--                s->pending_n = 0;
--
-+            if (dst_rv == 0)
-                 set_best_effort_pts(avctx, &s->pts_stat, frame);
--            }
--            else if (dst_rv == AVERROR(EAGAIN)) {
--                if (prefer_dq && ++s->pending_n > PENDING_N_THRESHOLD) {
--                    s->pending_hw = pending * 16 + PENDING_HW_OFFSET;
--                    s->pending_n = 0;
--                }
--            }
- 
-             if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
-                 av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
-@@ -857,8 +848,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     if (ret < 0)
-         return ret;
- 
-+    xlat_init(&s->xlat);
-     pts_stats_init(&s->pts_stat, avctx, "decoder");
--    s->pending_hw = PENDING_HW_MIN;
- 
-     capture = &s->capture;
-     output = &s->output;
-
-From 7048e7e6b8621cf09b96cc7e44b8d82ba8619913 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 21 Oct 2022 13:48:07 +0000
-Subject: [PATCH 085/136] pthread_frame: Fix MT hwaccel. Recent change broke
- it.
-
-Revert the effects of 35aa7e70e7ec350319e7634a30d8d8aa1e6ecdda if the
-hwaccel is marked MT_SAFE.
----
- libavcodec/pthread_frame.c | 48 ++++++++++++++++++++++++++++----------
- 1 file changed, 36 insertions(+), 12 deletions(-)
-
-diff --git a/libavcodec/pthread_frame.c b/libavcodec/pthread_frame.c
-index 2cc89a41f5..b14f8e9360 100644
---- a/libavcodec/pthread_frame.c
-+++ b/libavcodec/pthread_frame.c
-@@ -231,7 +231,7 @@ static attribute_align_arg void *frame_worker_thread(void *arg)
-             p->hwaccel_serializing = 0;
-             pthread_mutex_unlock(&p->parent->hwaccel_mutex);
-         }
--        av_assert0(!avctx->hwaccel);
-+        av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
- 
-         if (p->async_serializing) {
-             p->async_serializing = 0;
-@@ -319,6 +319,12 @@ FF_ENABLE_DEPRECATION_WARNINGS
-         }
- 
-         dst->hwaccel_flags = src->hwaccel_flags;
-+        if (src->hwaccel &&
-+            (src->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
-+            dst->hwaccel = src->hwaccel;
-+            dst->hwaccel_context = src->hwaccel_context;
-+            dst->internal->hwaccel_priv_data = src->internal->hwaccel_priv_data;
-+        }
- 
-         err = av_buffer_replace(&dst->internal->pool, src->internal->pool);
-         if (err < 0)
-@@ -434,10 +440,13 @@ static int submit_packet(PerThreadContext *p, AVCodecContext *user_avctx,
-     }
- 
-     /* transfer the stashed hwaccel state, if any */
--    av_assert0(!p->avctx->hwaccel);
--    FFSWAP(const AVHWAccel*, p->avctx->hwaccel,                     fctx->stash_hwaccel);
--    FFSWAP(void*,            p->avctx->hwaccel_context,             fctx->stash_hwaccel_context);
--    FFSWAP(void*,            p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
-+    av_assert0(!p->avctx->hwaccel || (p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
-+    if (p->avctx->hwaccel &&
-+        !(p->avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
-+        FFSWAP(const AVHWAccel*, p->avctx->hwaccel,                     fctx->stash_hwaccel);
-+        FFSWAP(void*,            p->avctx->hwaccel_context,             fctx->stash_hwaccel_context);
-+        FFSWAP(void*,            p->avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
-+    }
- 
-     av_packet_unref(p->avpkt);
-     ret = av_packet_ref(p->avpkt, avpkt);
-@@ -610,9 +619,12 @@ void ff_thread_finish_setup(AVCodecContext *avctx) {
-      * this is done here so that this worker thread can wipe its own hwaccel
-      * state after decoding, without requiring synchronization */
-     av_assert0(!p->parent->stash_hwaccel);
--    p->parent->stash_hwaccel         = avctx->hwaccel;
--    p->parent->stash_hwaccel_context = avctx->hwaccel_context;
--    p->parent->stash_hwaccel_priv    = avctx->internal->hwaccel_priv_data;
-+    if (avctx->hwaccel &&
-+        !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
-+        p->parent->stash_hwaccel         = avctx->hwaccel;
-+        p->parent->stash_hwaccel_context = avctx->hwaccel_context;
-+        p->parent->stash_hwaccel_priv    = avctx->internal->hwaccel_priv_data;
-+    }
- 
-     pthread_mutex_lock(&p->progress_mutex);
-     if(atomic_load(&p->state) == STATE_SETUP_FINISHED){
-@@ -667,6 +679,15 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
- 
-     park_frame_worker_threads(fctx, thread_count);
- 
-+     if (fctx->prev_thread &&
-+         avctx->hwaccel && (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE) &&
-+         avctx->internal->hwaccel_priv_data !=
-+                             fctx->prev_thread->avctx->internal->hwaccel_priv_data) {
-+        if (update_context_from_thread(avctx, fctx->prev_thread->avctx, 1) < 0) {
-+            av_log(avctx, AV_LOG_ERROR, "Failed to update user thread.\n");
-+        }
-+    }
-+
-     for (i = 0; i < thread_count; i++) {
-         PerThreadContext *p = &fctx->threads[i];
-         AVCodecContext *ctx = p->avctx;
-@@ -710,10 +731,13 @@ void ff_frame_thread_free(AVCodecContext *avctx, int thread_count)
- 
-     /* if we have stashed hwaccel state, move it to the user-facing context,
-      * so it will be freed in avcodec_close() */
--    av_assert0(!avctx->hwaccel);
--    FFSWAP(const AVHWAccel*, avctx->hwaccel,                     fctx->stash_hwaccel);
--    FFSWAP(void*,            avctx->hwaccel_context,             fctx->stash_hwaccel_context);
--    FFSWAP(void*,            avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
-+    av_assert0(!avctx->hwaccel || (avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE));
-+    if (avctx->hwaccel &&
-+        !(avctx->hwaccel->caps_internal & HWACCEL_CAP_MT_SAFE)) {
-+        FFSWAP(const AVHWAccel*, avctx->hwaccel,                     fctx->stash_hwaccel);
-+        FFSWAP(void*,            avctx->hwaccel_context,             fctx->stash_hwaccel_context);
-+        FFSWAP(void*,            avctx->internal->hwaccel_priv_data, fctx->stash_hwaccel_priv);
-+    }
- 
-     av_freep(&avctx->internal->thread_ctx);
- }
-
-From 033056bd8ec63b16fe081446f70f41b5d5789b81 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 18 Oct 2022 13:18:27 +0000
-Subject: [PATCH 086/136] v4l2_req: Add swfmt to init logging
-
-(cherry picked from commit dfa03b702baaf2952bcd2bbf8badcc2f9c961ddf)
----
- libavcodec/v4l2_request_hevc.c | 6 ++++--
- 1 file changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-index 614a1b4d99..767ecb036a 100644
---- a/libavcodec/v4l2_request_hevc.c
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -26,6 +26,7 @@
- #include "v4l2_request_hevc.h"
- 
- #include "libavutil/hwcontext_drm.h"
-+#include "libavutil/pixdesc.h"
- 
- #include "v4l2_req_devscan.h"
- #include "v4l2_req_dmabufs.h"
-@@ -306,10 +307,11 @@ retry_src_memtype:
-     // Set our s/w format
-     avctx->sw_pix_fmt = ((AVHWFramesContext *)avctx->hw_frames_ctx->data)->sw_format;
- 
--    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s\n",
-+    av_log(avctx, AV_LOG_INFO, "Hwaccel %s; devices: %s,%s; buffers: src %s, dst %s; swfmt=%s\n",
-            ctx->fns->name,
-            decdev_media_path(decdev), decdev_video_path(decdev),
--           mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype));
-+           mediabufs_memory_name(src_memtype), mediabufs_memory_name(dst_memtype),
-+           av_get_pix_fmt_name(avctx->sw_pix_fmt));
- 
-     return 0;
- 
-
-From 70779e742b93015e3e8aaa8f945a12d35917844d Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 18 Oct 2022 13:39:54 +0000
-Subject: [PATCH 087/136] v4l2_m2m: Avoid polling on a queue that is streamoff
-
-(cherry picked from commit b2658bc56d3034a17db7f39597fc7d71bfe9a43b)
----
- libavcodec/v4l2_context.c | 13 +++++++++----
- 1 file changed, 9 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 4a359bf45e..b296dc111c 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -578,6 +578,11 @@ get_event(V4L2m2mContext * const m)
-     return 0;
- }
- 
-+static inline int
-+dq_ok(const V4L2Context * const c)
-+{
-+    return c->streamon && atomic_load(&c->q_count) != 0;
-+}
- 
- // Get a buffer
- // If output then just gets the buffer in the expected way
-@@ -613,13 +618,13 @@ get_qbuf(V4L2Context * const ctx, V4L2Buffer ** const ppavbuf, const int timeout
-         }
- 
-         // If capture && timeout == -1 then also wait for rx buffer free
--        if (is_cap && timeout == -1 && m->output.streamon && !m->draining)
-+        if (is_cap && timeout == -1 && dq_ok(&m->output) && !m->draining)
-             pfd.events |= poll_out;
- 
-         // If nothing Qed all we will get is POLLERR - avoid that
--        if ((pfd.events == poll_out && atomic_load(&m->output.q_count) == 0) ||
--            (pfd.events == poll_cap && atomic_load(&m->capture.q_count) == 0) ||
--            (pfd.events == (poll_cap | poll_out) && atomic_load(&m->capture.q_count) == 0 && atomic_load(&m->output.q_count) == 0)) {
-+        if ((pfd.events == poll_out && !dq_ok(&m->output)) ||
-+            (pfd.events == poll_cap && !dq_ok(&m->capture)) ||
-+            (pfd.events == (poll_cap | poll_out) && !dq_ok(&m->capture) && !dq_ok(&m->output))) {
-             av_log(avctx, AV_LOG_TRACE, "V4L2 poll %s empty\n", ctx->name);
-             return AVERROR(ENOSPC);
-         }
-
-From 438fed3702eb689f836c885ebbd813e48d4d4c4a Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 18 Oct 2022 14:07:04 +0000
-Subject: [PATCH 088/136] v4l2_m2m: Add function to get number of queued
- buffers
-
-(cherry picked from commit f9ac6485c00b4531dcff354222aef450b29728f4)
----
- libavcodec/v4l2_context.h | 11 +++++++++++
- 1 file changed, 11 insertions(+)
-
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 523c53e97d..8e4f681643 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -220,4 +220,15 @@ int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
- 
- void ff_v4l2_dq_all(V4L2Context *const ctx);
- 
-+/**
-+ * Returns the number of buffers currently queued
-+ *
-+ * @param[in] ctx The V4L2Context to evaluate
-+ */
-+static inline int
-+ff_v4l2_context_q_count(const V4L2Context* const ctx)
-+{
-+    return atomic_load(&ctx->q_count);
-+}
-+
- #endif // AVCODEC_V4L2_CONTEXT_H
-
-From 95ff4a65ed4c88ea7e02ee55e260e37a0ce2ba88 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 18 Oct 2022 14:48:20 +0000
-Subject: [PATCH 089/136] v4l2_m2m: Add timeouts to dq_all and dequeue_packet
-
-Add timeouts and use them to have better flow control in encode
-
-(cherry picked from commit c6173cad7f21697e12887982bda796de9719bb32)
----
- libavcodec/v4l2_context.c | 16 +++++++++++-----
- libavcodec/v4l2_context.h | 15 +++++++++++++--
- libavcodec/v4l2_m2m_enc.c | 28 +++++++++++++++++++---------
- 3 files changed, 43 insertions(+), 16 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index b296dc111c..7031f3d340 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -712,13 +712,19 @@ clean_v4l2_buffer(V4L2Buffer * const avbuf)
-     return avbuf;
- }
- 
--void
--ff_v4l2_dq_all(V4L2Context *const ctx)
-+int
-+ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1)
- {
-     V4L2Buffer * avbuf;
-+    if (timeout1 != 0) {
-+        int rv = get_qbuf(ctx, &avbuf, timeout1);
-+        if (rv != 0)
-+            return rv;
-+    }
-     do {
-         get_qbuf(ctx, &avbuf, 0);
-     } while (avbuf);
-+    return 0;
- }
- 
- static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
-@@ -727,7 +733,7 @@ static V4L2Buffer* v4l2_getfree_v4l2buf(V4L2Context *ctx)
- 
-     /* get back as many output buffers as possible */
-     if (V4L2_TYPE_IS_OUTPUT(ctx->type))
--        ff_v4l2_dq_all(ctx);
-+        ff_v4l2_dq_all(ctx, 0);
- 
-     for (i = 0; i < ctx->num_buffers; i++) {
-         V4L2Buffer * const avbuf = (V4L2Buffer *)ctx->bufrefs[i]->data;
-@@ -1047,7 +1053,7 @@ int ff_v4l2_context_dequeue_frame(V4L2Context* ctx, AVFrame* frame, int timeout)
-    return 0;
- }
- 
--int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
-+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout)
- {
-     V4L2m2mContext *s = ctx_to_m2mctx(ctx);
-     AVCodecContext *const avctx = s->avctx;
-@@ -1055,7 +1061,7 @@ int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt)
-     int rv;
- 
-     do {
--        if ((rv = get_qbuf(ctx, &avbuf, -1)) != 0)
-+        if ((rv = get_qbuf(ctx, &avbuf, timeout)) != 0)
-             return rv == AVERROR(ENOSPC) ? AVERROR(EAGAIN) : rv;  // Caller not currently expecting ENOSPC
-         if ((rv = ff_v4l2_buffer_buf_to_avpkt(pkt, avbuf)) != 0)
-             return rv;
-diff --git a/libavcodec/v4l2_context.h b/libavcodec/v4l2_context.h
-index 8e4f681643..5afed3e6ec 100644
---- a/libavcodec/v4l2_context.h
-+++ b/libavcodec/v4l2_context.h
-@@ -179,7 +179,7 @@ int ff_v4l2_context_set_status(V4L2Context* ctx, uint32_t cmd);
-  * @param[inout] pkt The AVPacket to dequeue to.
-  * @return 0 in case of success, AVERROR(EAGAIN) if no buffer was ready, another negative error in case of error.
-  */
--int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt);
-+int ff_v4l2_context_dequeue_packet(V4L2Context* ctx, AVPacket* pkt, int timeout);
- 
- /**
-  * Dequeues a buffer from a V4L2Context to an AVFrame.
-@@ -218,7 +218,18 @@ int ff_v4l2_context_enqueue_packet(V4L2Context* ctx, const AVPacket* pkt, const
-  */
- int ff_v4l2_context_enqueue_frame(V4L2Context* ctx, const AVFrame* f);
- 
--void ff_v4l2_dq_all(V4L2Context *const ctx);
-+/**
-+ * Dequeue all buffers on this queue
-+ *
-+ * Used to recycle output buffers
-+ *
-+ * @param[in] ctx The V4L2Context to dequeue from.
-+ * @param[in] timeout1 A timeout on dequeuing the 1st buffer, 
-+ *       all others have a timeout of zero
-+ * @return AVERROR(EAGAIN) if timeout1 non-zero then the return
-+ *         of the first dequeue operation, 0 otherwise.
-+ */
-+int ff_v4l2_dq_all(V4L2Context *const ctx, int timeout1);
- 
- /**
-  * Returns the number of buffers currently queued
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index a992a3cccc..d0d27e5bc2 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -420,16 +420,24 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
- {
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     V4L2Context *const output = &s->output;
-+    int rv;
-+    int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers;
- 
--    ff_v4l2_dq_all(output);
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot);
- 
--    // Signal EOF if needed
-+    // Signal EOF if needed (doesn't need q slot)
-     if (!frame) {
-         return ff_v4l2_context_enqueue_frame(output, frame);
-     }
- 
-+    if ((rv = ff_v4l2_dq_all(output, needs_slot? 500 : 0)) != 0) {
-+        // We should be able to return AVERROR(EAGAIN) to indicate buffer
-+        // exhaustion, but ffmpeg currently treats that as fatal.
-+        av_log(avctx, AV_LOG_WARNING, "Failed to get buffer for src frame: %s\n", av_err2str(rv));
-+        return rv;
-+    }
-+
-     if (s->input_drm && !output->streamon) {
--        int rv;
-         struct v4l2_format req_format = {.type = output->format.type};
- 
-         // Set format when we first get a buffer
-@@ -494,7 +502,9 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-     AVFrame *frame = s->frame;
-     int ret;
- 
--    ff_v4l2_dq_all(output);
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+
-+    ff_v4l2_dq_all(output, 0);
- 
-     if (s->draining)
-         goto dequeue;
-@@ -532,10 +542,10 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-     }
- 
- dequeue:
--    ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
--    ff_v4l2_dq_all(output);
-+    ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0);
-+    ff_v4l2_dq_all(output, 0);
-     if (ret)
--        return ret;
-+        return (s->draining && ret == AVERROR(EAGAIN)) ? AVERROR_EOF : ret;
- 
-     if (capture->first_buf == 1) {
-         uint8_t * data;
-@@ -566,8 +576,8 @@ dequeue:
-             s->extdata_size = len;
-         }
- 
--        ret = ff_v4l2_context_dequeue_packet(capture, avpkt);
--        ff_v4l2_dq_all(output);
-+        ret = ff_v4l2_context_dequeue_packet(capture, avpkt, 0);
-+        ff_v4l2_dq_all(output, 0);
-         if (ret)
-             return ret;
-     }
-
-From e6654c1997a6f4dfd43b0f74b0168f5d644c1c74 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 18 Oct 2022 14:23:32 +0000
-Subject: [PATCH 090/136] v4l2_m2m_enc: Improve debug trace
-
-(cherry picked from commit 113e89daffb329a0cd3d920abd483a4025664bf5)
----
- libavcodec/v4l2_m2m_enc.c | 13 ++++++++++---
- 1 file changed, 10 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index d0d27e5bc2..c8c2de3d47 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -427,6 +427,7 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
- 
-     // Signal EOF if needed (doesn't need q slot)
-     if (!frame) {
-+        av_log(avctx, AV_LOG_TRACE, "--- %s: EOS\n", __func__);
-         return ff_v4l2_context_enqueue_frame(output, frame);
-     }
- 
-@@ -491,7 +492,12 @@ static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
-         v4l2_set_ext_ctrl(s, MPEG_CID(FORCE_KEY_FRAME), 0, "force key frame", 1);
- #endif
- 
--    return ff_v4l2_context_enqueue_frame(output, frame);
-+    rv = ff_v4l2_context_enqueue_frame(output, frame);
-+    if (rv) {
-+        av_log(avctx, AV_LOG_ERROR, "Enqueue frame failed: %s\n", av_err2str(rv));
-+    }
-+
-+    return rv;
- }
- 
- static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-@@ -502,7 +508,8 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-     AVFrame *frame = s->frame;
-     int ret;
- 
--    av_log(avctx, AV_LOG_TRACE, "<<< %s\n", __func__);
-+    av_log(avctx, AV_LOG_TRACE, "<<< %s: qlen out %d cap %d\n", __func__,
-+           ff_v4l2_context_q_count(output), ff_v4l2_context_q_count(capture));
- 
-     ff_v4l2_dq_all(output, 0);
- 
-@@ -615,11 +622,11 @@ dequeue:
-         avpkt->size = newlen;
-     }
- 
--//    av_log(avctx, AV_LOG_INFO, "%s: PTS out=%"PRId64", size=%d, ret=%d\n", __func__, avpkt->pts, avpkt->size, ret);
-     capture->first_buf = 0;
-     return 0;
- 
- fail_no_mem:
-+    av_log(avctx, AV_LOG_ERROR, "Rx pkt failed: No memory\n");
-     ret = AVERROR(ENOMEM);
-     av_packet_unref(avpkt);
-     return ret;
-
-From 02dca2b845125af7ec6dfb68bdc34726a45fee9c Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 18 Oct 2022 13:22:36 +0000
-Subject: [PATCH 091/136] v4l2_m2m_enc: Copy dest packets to memory if short of
- v4l2 buffers
-
-(cherry picked from commit aa4ebbda400b42db952fc713b26927fc8636b0e5)
----
- libavcodec/v4l2_m2m_enc.c | 16 ++++++++++++++++
- 1 file changed, 16 insertions(+)
-
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index c8c2de3d47..c23187e6e6 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -621,6 +621,22 @@ dequeue:
-         avpkt->data = buf->data;
-         avpkt->size = newlen;
-     }
-+    else if (ff_v4l2_context_q_count(capture) < 2) {
-+        // Avoid running out of capture buffers
-+        // In most cases the buffers will be returned quickly in which case
-+        // we don't copy and can use the v4l2 buffers directly but sometimes
-+        // ffmpeg seems to hold onto all of them for a long time (.mkv
-+        // creation?) so avoid deadlock in those cases.
-+        AVBufferRef * const buf = av_buffer_alloc(avpkt->size + AV_INPUT_BUFFER_PADDING_SIZE);
-+        if (buf == NULL)
-+            goto fail_no_mem;
-+
-+        memcpy(buf->data, avpkt->data, avpkt->size);
-+        av_buffer_unref(&avpkt->buf);  // Will recycle the V4L2 buffer
-+
-+        avpkt->buf = buf;
-+        avpkt->data = buf->data;
-+    }
- 
-     capture->first_buf = 0;
-     return 0;
-
-From ced9a7d442a04be08fc23e0af310312299a5d5a0 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 19 Oct 2022 11:00:16 +0000
-Subject: [PATCH 092/136] v4l2_m2m_dec: Fix pts_best_effort guessing for
- initial pts
-
-(cherry picked from commit 1af32e5c87586a0f7e76cdf19a012ddbcf3eac67)
----
- libavcodec/v4l2_m2m_dec.c | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index bec9b22fcf..47b2735f82 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -113,6 +113,8 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len)
- 
- static int64_t pts_stats_guess(const pts_stats_t * const stats)
- {
-+    if (stats->last_count <= 1)
-+        return stats->last_pts;
-     if (stats->last_pts == AV_NOPTS_VALUE ||
-             stats->last_interval == 0 ||
-             stats->last_count >= STATS_LAST_COUNT_MAX)
-
-From 3e3cf6ed7280d8ad4f3eed17a6d18c2df3c0cd31 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 19 Oct 2022 14:47:04 +0000
-Subject: [PATCH 093/136] v4l2_m2m_enc: Wait for frame or space in src Q in
- rx_pkt
-
-If receive_packet we should ensure that there is space in the source Q
-if we return EAGAIN so wait for either an output packet or space if
-the source Q is currently full.
-
-(cherry picked from commit 82f0c55782a67a8cc665d937647706c2a75f5548)
----
- libavcodec/v4l2_m2m_enc.c | 22 +++++++++++++++++++---
- 1 file changed, 19 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_enc.c b/libavcodec/v4l2_m2m_enc.c
-index c23187e6e6..524e9424a5 100644
---- a/libavcodec/v4l2_m2m_enc.c
-+++ b/libavcodec/v4l2_m2m_enc.c
-@@ -415,13 +415,17 @@ static int fmt_eq(const struct v4l2_format * const a, const struct v4l2_format *
-     return 1;
- }
- 
-+static inline int q_full(const V4L2Context *const output)
-+{
-+    return ff_v4l2_context_q_count(output) == output->num_buffers;
-+}
- 
- static int v4l2_send_frame(AVCodecContext *avctx, const AVFrame *frame)
- {
-     V4L2m2mContext *s = ((V4L2m2mPriv*)avctx->priv_data)->context;
-     V4L2Context *const output = &s->output;
-     int rv;
--    int needs_slot = ff_v4l2_context_q_count(output) == output->num_buffers;
-+    const int needs_slot = q_full(output);
- 
-     av_log(avctx, AV_LOG_TRACE, "<<< %s; needs_slot=%d\n", __func__, needs_slot);
- 
-@@ -549,8 +553,20 @@ static int v4l2_receive_packet(AVCodecContext *avctx, AVPacket *avpkt)
-     }
- 
- dequeue:
--    ret = ff_v4l2_context_dequeue_packet(capture, avpkt, s->draining ? 300 : 0);
--    ff_v4l2_dq_all(output, 0);
-+    // Dequeue a frame
-+    for (;;) {
-+        int t = q_full(output) ? -1 : s->draining ? 300 : 0;
-+        int rv2;
-+
-+        // If output is full wait for either a packet or output to become not full
-+        ret = ff_v4l2_context_dequeue_packet(capture, avpkt, t);
-+
-+        // If output was full retry packet dequeue
-+        t = (ret != AVERROR(EAGAIN) || t != -1) ? 0 : 300;
-+        rv2 = ff_v4l2_dq_all(output, t);
-+        if (t == 0 || rv2 != 0)
-+            break;
-+    }
-     if (ret)
-         return (s->draining && ret == AVERROR(EAGAIN)) ? AVERROR_EOF : ret;
- 
-
-From de9ec2bf6421b199aad9ea9dc7896a46c8813d94 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 19 Oct 2022 14:54:29 +0000
-Subject: [PATCH 094/136] vf_deinterlace_v4l2m2m: Print dts rather that NOPTS
- in trace
-
-(cherry picked from commit e9b468f35f0c6ad9bfe96f5a05e449afa8ae074a)
----
- libavfilter/vf_deinterlace_v4l2m2m.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index ce875c2c61..7c6751b69c 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -1668,8 +1668,8 @@ static int deint_v4l2m2m_filter_frame(AVFilterLink *link, AVFrame *in)
-     V4L2Queue *output              = &ctx->output;
-     int ret;
- 
--    av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" (%"PRId64") field :%d interlaced: %d aspect:%d/%d\n",
--          __func__, in->pts, AV_NOPTS_VALUE, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
-+    av_log(priv, AV_LOG_DEBUG, "<<< %s: input pts: %"PRId64" dts: %"PRId64" field :%d interlaced: %d aspect:%d/%d\n",
-+           __func__, in->pts, in->pkt_dts, in->top_field_first, in->interlaced_frame, in->sample_aspect_ratio.num, in->sample_aspect_ratio.den);
-     av_log(priv, AV_LOG_DEBUG, "--- %s: in status in %d/ot %d; out status in %d/out %d\n", __func__,
-            avctx->inputs[0]->status_in, avctx->inputs[0]->status_out, avctx->outputs[0]->status_in, avctx->outputs[0]->status_out);
- 
-
-From d71a0a173240e18d518ae0b921ac43849524bd66 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 19 Oct 2022 14:55:21 +0000
-Subject: [PATCH 095/136] vf_deinterlace_v4l2m2m: Ignore "wanted" when
- processing input
-
-If we gate send a frame to the outlink on its frame_wanted flag then we
-will sometimes stall as the flag may not get set by ffmpeg's filter
-processing. So stuff the output whether or not it wants it which works
-much better.
-
-(cherry picked from commit 808254cc04e5e6574cbab9af254b6c2f3d4142e3)
----
- libavfilter/vf_deinterlace_v4l2m2m.c | 5 +----
- 1 file changed, 1 insertion(+), 4 deletions(-)
-
-diff --git a/libavfilter/vf_deinterlace_v4l2m2m.c b/libavfilter/vf_deinterlace_v4l2m2m.c
-index 7c6751b69c..a173a291f8 100644
---- a/libavfilter/vf_deinterlace_v4l2m2m.c
-+++ b/libavfilter/vf_deinterlace_v4l2m2m.c
-@@ -1812,10 +1812,7 @@ static int deint_v4l2m2m_activate(AVFilterContext *avctx)
- 
-     ack_inlink(avctx, s, inlink);
- 
--    if (!ff_outlink_frame_wanted(outlink)) {
--        av_log(priv, AV_LOG_TRACE, "%s: Not wanted out\n", __func__);
--    }
--    else if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
-+    if (s->field_order != V4L2_FIELD_ANY)  // Can't DQ if no setup!
-     {
-         AVFrame * frame = av_frame_alloc();
-         int rv;
-
-From 842e0a00288f9a2a862720990791b8eca9546955 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 19 Oct 2022 15:00:43 +0000
-Subject: [PATCH 096/136] conf_native: Add --enable-gpl
-
-(cherry picked from commit bab9bf4a2e39391940d88af2ce5d70236ac21f15)
----
- pi-util/conf_native.sh | 1 +
- 1 file changed, 1 insertion(+)
-
-diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
-index f22d531ca4..082d9b5832 100755
---- a/pi-util/conf_native.sh
-+++ b/pi-util/conf_native.sh
-@@ -94,6 +94,7 @@ $FFSRC/configure \
-  --enable-libdrm\
-  --enable-vout-egl\
-  --enable-vout-drm\
-+ --enable-gpl\
-  $SHARED_LIBS\
-  $RPIOPTS\
-  --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
-
-From bf9aaf30818308a4651e00a2a64a0f65dc9a36e5 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 15 Nov 2022 13:33:00 +0000
-Subject: [PATCH 097/136] egl_vout: Make formatting consistent - no code
- changes
-
----
- libavdevice/egl_vout.c | 741 ++++++++++++++++++++---------------------
- 1 file changed, 369 insertions(+), 372 deletions(-)
-
-diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
-index 7b9c610ace..a52cabb082 100644
---- a/libavdevice/egl_vout.c
-+++ b/libavdevice/egl_vout.c
-@@ -48,20 +48,20 @@
- #define TRACE_ALL 0
- 
- struct egl_setup {
--   int conId;
--
--   Display *dpy;
--   EGLDisplay egl_dpy;
--   EGLContext ctx;
--   EGLSurface surf;
--   Window win;
--
--   uint32_t crtcId;
--   int crtcIdx;
--   uint32_t planeId;
--   struct {
--       int x, y, width, height;
--   } compose;
-+    int conId;
-+
-+    Display *dpy;
-+    EGLDisplay egl_dpy;
-+    EGLContext ctx;
-+    EGLSurface surf;
-+    Window win;
-+
-+    uint32_t crtcId;
-+    int crtcIdx;
-+    uint32_t planeId;
-+    struct {
-+        int x, y, width, height;
-+    } compose;
- };
- 
- typedef struct egl_aux_s {
-@@ -70,8 +70,7 @@ typedef struct egl_aux_s {
- 
- } egl_aux_t;
- 
--typedef struct egl_display_env_s
--{
-+typedef struct egl_display_env_s {
-     AVClass *class;
- 
-     struct egl_setup setup;
-@@ -89,8 +88,8 @@ typedef struct egl_display_env_s
-     sem_t display_start_sem;
-     sem_t q_sem;
-     int q_terminate;
--    AVFrame * q_this;
--    AVFrame * q_next;
-+    AVFrame *q_this;
-+    AVFrame *q_next;
- 
- } egl_display_env_t;
- 
-@@ -99,45 +98,44 @@ typedef struct egl_display_env_s
-  * Remove window border/decorations.
-  */
- static void
--no_border( Display *dpy, Window w)
-+no_border(Display *dpy, Window w)
- {
--   static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
--   static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
--
--   typedef struct
--   {
--      unsigned long       flags;
--      unsigned long       functions;
--      unsigned long       decorations;
--      long                inputMode;
--      unsigned long       status;
--   } PropMotifWmHints;
--
--   PropMotifWmHints motif_hints;
--   Atom prop, proptype;
--   unsigned long flags = 0;
--
--   /* setup the property */
--   motif_hints.flags = MWM_HINTS_DECORATIONS;
--   motif_hints.decorations = flags;
--
--   /* get the atom for the property */
--   prop = XInternAtom( dpy, "_MOTIF_WM_HINTS", True );
--   if (!prop) {
--      /* something went wrong! */
--      return;
--   }
--
--   /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
--   proptype = prop;
--
--   XChangeProperty( dpy, w,                         /* display, window */
-+    static const unsigned MWM_HINTS_DECORATIONS = (1 << 1);
-+    static const int PROP_MOTIF_WM_HINTS_ELEMENTS = 5;
-+
-+    typedef struct {
-+        unsigned long       flags;
-+        unsigned long       functions;
-+        unsigned long       decorations;
-+        long                inputMode;
-+        unsigned long       status;
-+    } PropMotifWmHints;
-+
-+    PropMotifWmHints motif_hints;
-+    Atom prop, proptype;
-+    unsigned long flags = 0;
-+
-+    /* setup the property */
-+    motif_hints.flags = MWM_HINTS_DECORATIONS;
-+    motif_hints.decorations = flags;
-+
-+    /* get the atom for the property */
-+    prop = XInternAtom(dpy, "_MOTIF_WM_HINTS", True);
-+    if (!prop) {
-+        /* something went wrong! */
-+        return;
-+    }
-+
-+    /* not sure this is correct, seems to work, XA_WM_HINTS didn't work */
-+    proptype = prop;
-+
-+    XChangeProperty(dpy, w,                         /* display, window */
-                     prop, proptype,                 /* property, type */
-                     32,                             /* format: 32-bit datums */
-                     PropModeReplace,                /* mode */
--                    (unsigned char *) &motif_hints, /* data */
-+                    (unsigned char *)&motif_hints, /* data */
-                     PROP_MOTIF_WM_HINTS_ELEMENTS    /* nelements */
--                  );
-+                   );
- }
- 
- 
-@@ -146,247 +144,247 @@ no_border( Display *dpy, Window w)
-  * Return the window and context handles.
-  */
- static int
--make_window(struct AVFormatContext * const s,
--            egl_display_env_t * const de,
-+make_window(struct AVFormatContext *const s,
-+            egl_display_env_t *const de,
-             Display *dpy, EGLDisplay egl_dpy, const char *name,
-             Window *winRet, EGLContext *ctxRet, EGLSurface *surfRet)
- {
--   int scrnum = DefaultScreen( dpy );
--   XSetWindowAttributes attr;
--   unsigned long mask;
--   Window root = RootWindow( dpy, scrnum );
--   Window win;
--   EGLContext ctx;
--   const int fullscreen = de->fullscreen;
--   EGLConfig config;
--   int x = de->window_x;
--   int y = de->window_y;
--   int width = de->window_width ? de->window_width : 1280;
--   int height = de->window_height ? de->window_height : 720;
--
--
--   if (fullscreen) {
--      int scrnum = DefaultScreen(dpy);
--
--      x = 0; y = 0;
--      width = DisplayWidth(dpy, scrnum);
--      height = DisplayHeight(dpy, scrnum);
--   }
--
--   {
--      EGLint num_configs;
--      static const EGLint attribs[] = {
--         EGL_RED_SIZE, 1,
--         EGL_GREEN_SIZE, 1,
--         EGL_BLUE_SIZE, 1,
--         EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
--         EGL_NONE
--      };
--
--      if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
--         av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
--         return -1;
--      }
--   }
--
--   {
--      EGLint vid;
--      if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
--         av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
--         return -1;
--      }
--
--      {
--         XVisualInfo visTemplate = {
--            .visualid = vid,
--         };
--         int num_visuals;
--         XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
--                                               &visTemplate, &num_visuals);
--
--         /* window attributes */
--         attr.background_pixel = 0;
--         attr.border_pixel = 0;
--         attr.colormap = XCreateColormap( dpy, root, visinfo->visual, AllocNone);
--         attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
--         /* XXX this is a bad way to get a borderless window! */
--         mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
--
--         win = XCreateWindow( dpy, root, x, y, width, height,
--                              0, visinfo->depth, InputOutput,
--                              visinfo->visual, mask, &attr );
--         XFree(visinfo);
--      }
--   }
--
--   if (fullscreen)
--      no_border(dpy, win);
--
--   /* set hints and properties */
--   {
--      XSizeHints sizehints;
--      sizehints.x = x;
--      sizehints.y = y;
--      sizehints.width  = width;
--      sizehints.height = height;
--      sizehints.flags = USSize | USPosition;
--      XSetNormalHints(dpy, win, &sizehints);
--      XSetStandardProperties(dpy, win, name, name,
--                              None, (char **)NULL, 0, &sizehints);
--   }
--
--   eglBindAPI(EGL_OPENGL_ES_API);
--
--   {
--      static const EGLint ctx_attribs[] = {
--         EGL_CONTEXT_CLIENT_VERSION, 2,
--         EGL_NONE
--      };
--      ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs );
--      if (!ctx) {
--         av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
--         return -1;
--      }
--   }
--
--
--   XMapWindow(dpy, win);
--
--   {
--      EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
--      if (!surf) {
--         av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
--         return -1;
--      }
--
--      if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
--         av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
--         return -1;
--      }
--
--      *winRet = win;
--      *ctxRet = ctx;
--      *surfRet = surf;
--   }
--
--   return 0;
-+    int scrnum = DefaultScreen(dpy);
-+    XSetWindowAttributes attr;
-+    unsigned long mask;
-+    Window root = RootWindow(dpy, scrnum);
-+    Window win;
-+    EGLContext ctx;
-+    const int fullscreen = de->fullscreen;
-+    EGLConfig config;
-+    int x = de->window_x;
-+    int y = de->window_y;
-+    int width = de->window_width ? de->window_width : 1280;
-+    int height = de->window_height ? de->window_height : 720;
-+
-+
-+    if (fullscreen) {
-+        int scrnum = DefaultScreen(dpy);
-+
-+        x = 0; y = 0;
-+        width = DisplayWidth(dpy, scrnum);
-+        height = DisplayHeight(dpy, scrnum);
-+    }
-+
-+    {
-+        EGLint num_configs;
-+        static const EGLint attribs[] = {
-+            EGL_RED_SIZE, 1,
-+            EGL_GREEN_SIZE, 1,
-+            EGL_BLUE_SIZE, 1,
-+            EGL_RENDERABLE_TYPE, EGL_OPENGL_ES2_BIT,
-+            EGL_NONE
-+        };
-+
-+        if (!eglChooseConfig(egl_dpy, attribs, &config, 1, &num_configs)) {
-+            av_log(s, AV_LOG_ERROR, "Error: couldn't get an EGL visual config\n");
-+            return -1;
-+        }
-+    }
-+
-+    {
-+        EGLint vid;
-+        if (!eglGetConfigAttrib(egl_dpy, config, EGL_NATIVE_VISUAL_ID, &vid)) {
-+            av_log(s, AV_LOG_ERROR, "Error: eglGetConfigAttrib() failed\n");
-+            return -1;
-+        }
-+
-+        {
-+            XVisualInfo visTemplate = {
-+                .visualid = vid,
-+            };
-+            int num_visuals;
-+            XVisualInfo *visinfo = XGetVisualInfo(dpy, VisualIDMask,
-+                                                  &visTemplate, &num_visuals);
-+
-+            /* window attributes */
-+            attr.background_pixel = 0;
-+            attr.border_pixel = 0;
-+            attr.colormap = XCreateColormap(dpy, root, visinfo->visual, AllocNone);
-+            attr.event_mask = StructureNotifyMask | ExposureMask | KeyPressMask;
-+            /* XXX this is a bad way to get a borderless window! */
-+            mask = CWBackPixel | CWBorderPixel | CWColormap | CWEventMask;
-+
-+            win = XCreateWindow(dpy, root, x, y, width, height,
-+                                0, visinfo->depth, InputOutput,
-+                                visinfo->visual, mask, &attr);
-+            XFree(visinfo);
-+        }
-+    }
-+
-+    if (fullscreen)
-+        no_border(dpy, win);
-+
-+    /* set hints and properties */
-+    {
-+        XSizeHints sizehints;
-+        sizehints.x = x;
-+        sizehints.y = y;
-+        sizehints.width  = width;
-+        sizehints.height = height;
-+        sizehints.flags = USSize | USPosition;
-+        XSetNormalHints(dpy, win, &sizehints);
-+        XSetStandardProperties(dpy, win, name, name,
-+                               None, (char **)NULL, 0, &sizehints);
-+    }
-+
-+    eglBindAPI(EGL_OPENGL_ES_API);
-+
-+    {
-+        static const EGLint ctx_attribs[] = {
-+            EGL_CONTEXT_CLIENT_VERSION, 2,
-+            EGL_NONE
-+        };
-+        ctx = eglCreateContext(egl_dpy, config, EGL_NO_CONTEXT, ctx_attribs);
-+        if (!ctx) {
-+            av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
-+            return -1;
-+        }
-+    }
-+
-+
-+    XMapWindow(dpy, win);
-+
-+    {
-+        EGLSurface surf = eglCreateWindowSurface(egl_dpy, config, (EGLNativeWindowType)win, NULL);
-+        if (!surf) {
-+            av_log(s, AV_LOG_ERROR, "Error: eglCreateWindowSurface failed\n");
-+            return -1;
-+        }
-+
-+        if (!eglMakeCurrent(egl_dpy, surf, surf, ctx)) {
-+            av_log(s, AV_LOG_ERROR, "Error: eglCreateContext failed\n");
-+            return -1;
-+        }
-+
-+        *winRet = win;
-+        *ctxRet = ctx;
-+        *surfRet = surf;
-+    }
-+
-+    return 0;
- }
- 
- static GLint
--compile_shader(struct AVFormatContext * const avctx, GLenum target, const char *source)
-+compile_shader(struct AVFormatContext *const avctx, GLenum target, const char *source)
- {
--   GLuint s = glCreateShader(target);
-+    GLuint s = glCreateShader(target);
- 
--   if (s == 0) {
--      av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
--      return 0;
--   }
-+    if (s == 0) {
-+        av_log(avctx, AV_LOG_ERROR, "Failed to create shader\n");
-+        return 0;
-+    }
- 
--   glShaderSource(s, 1, (const GLchar **) &source, NULL);
--   glCompileShader(s);
-+    glShaderSource(s, 1, (const GLchar **)&source, NULL);
-+    glCompileShader(s);
- 
--   {
--      GLint ok;
--      glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
-+    {
-+        GLint ok;
-+        glGetShaderiv(s, GL_COMPILE_STATUS, &ok);
- 
--      if (!ok) {
--         GLchar *info;
--         GLint size;
-+        if (!ok) {
-+            GLchar *info;
-+            GLint size;
- 
--         glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
--         info = malloc(size);
-+            glGetShaderiv(s, GL_INFO_LOG_LENGTH, &size);
-+            info = malloc(size);
- 
--         glGetShaderInfoLog(s, size, NULL, info);
--         av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
-+            glGetShaderInfoLog(s, size, NULL, info);
-+            av_log(avctx, AV_LOG_ERROR, "Failed to compile shader: %ssource:\n%s\n", info, source);
- 
--         return 0;
--      }
--   }
-+            return 0;
-+        }
-+    }
- 
--   return s;
-+    return s;
- }
- 
--static GLuint link_program(struct AVFormatContext * const s, GLint vs, GLint fs)
-+static GLuint link_program(struct AVFormatContext *const s, GLint vs, GLint fs)
- {
--   GLuint prog = glCreateProgram();
--
--   if (prog == 0) {
--      av_log(s, AV_LOG_ERROR, "Failed to create program\n");
--      return 0;
--   }
--
--   glAttachShader(prog, vs);
--   glAttachShader(prog, fs);
--   glLinkProgram(prog);
--
--   {
--      GLint ok;
--      glGetProgramiv(prog, GL_LINK_STATUS, &ok);
--      if (!ok) {
--         /* Some drivers return a size of 1 for an empty log.  This is the size
--          * of a log that contains only a terminating NUL character.
--          */
--         GLint size;
--         GLchar *info = NULL;
--         glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
--         if (size > 1) {
--            info = malloc(size);
--            glGetProgramInfoLog(prog, size, NULL, info);
--         }
-+    GLuint prog = glCreateProgram();
- 
--         av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
--                 (info != NULL) ? info : "<empty log>");
--         return 0;
--      }
--   }
-+    if (prog == 0) {
-+        av_log(s, AV_LOG_ERROR, "Failed to create program\n");
-+        return 0;
-+    }
-+
-+    glAttachShader(prog, vs);
-+    glAttachShader(prog, fs);
-+    glLinkProgram(prog);
-+
-+    {
-+        GLint ok;
-+        glGetProgramiv(prog, GL_LINK_STATUS, &ok);
-+        if (!ok) {
-+            /* Some drivers return a size of 1 for an empty log.  This is the size
-+             * of a log that contains only a terminating NUL character.
-+             */
-+            GLint size;
-+            GLchar *info = NULL;
-+            glGetProgramiv(prog, GL_INFO_LOG_LENGTH, &size);
-+            if (size > 1) {
-+                info = malloc(size);
-+                glGetProgramInfoLog(prog, size, NULL, info);
-+            }
- 
--   return prog;
-+            av_log(s, AV_LOG_ERROR, "Failed to link: %s\n",
-+                   (info != NULL) ? info : "<empty log>");
-+            return 0;
-+        }
-+    }
-+
-+    return prog;
- }
- 
- static int
--gl_setup(struct AVFormatContext * const s)
-+gl_setup(struct AVFormatContext *const s)
- {
--   const char *vs =
--      "attribute vec4 pos;\n"
--      "varying vec2 texcoord;\n"
--      "\n"
--      "void main() {\n"
--      "  gl_Position = pos;\n"
--      "  texcoord.x = (pos.x + 1.0) / 2.0;\n"
--      "  texcoord.y = (-pos.y + 1.0) / 2.0;\n"
--      "}\n";
--   const char *fs =
--      "#extension GL_OES_EGL_image_external : enable\n"
--      "precision mediump float;\n"
--      "uniform samplerExternalOES s;\n"
--      "varying vec2 texcoord;\n"
--      "void main() {\n"
--      "  gl_FragColor = texture2D(s, texcoord);\n"
--      "}\n";
--
--   GLuint vs_s;
--   GLuint fs_s;
--   GLuint prog;
--
--   if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
--       !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
--       !(prog = link_program(s, vs_s, fs_s)))
--      return -1;
--
--   glUseProgram(prog);
--
--   {
--      static const float verts[] = {
--         -1, -1,
--         1, -1,
--         1, 1,
--         -1, 1,
--      };
--      glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
--   }
--
--   glEnableVertexAttribArray(0);
--   return 0;
-+    const char *vs =
-+        "attribute vec4 pos;\n"
-+        "varying vec2 texcoord;\n"
-+        "\n"
-+        "void main() {\n"
-+        "  gl_Position = pos;\n"
-+        "  texcoord.x = (pos.x + 1.0) / 2.0;\n"
-+        "  texcoord.y = (-pos.y + 1.0) / 2.0;\n"
-+        "}\n";
-+    const char *fs =
-+        "#extension GL_OES_EGL_image_external : enable\n"
-+        "precision mediump float;\n"
-+        "uniform samplerExternalOES s;\n"
-+        "varying vec2 texcoord;\n"
-+        "void main() {\n"
-+        "  gl_FragColor = texture2D(s, texcoord);\n"
-+        "}\n";
-+
-+    GLuint vs_s;
-+    GLuint fs_s;
-+    GLuint prog;
-+
-+    if (!(vs_s = compile_shader(s, GL_VERTEX_SHADER, vs)) ||
-+        !(fs_s = compile_shader(s, GL_FRAGMENT_SHADER, fs)) ||
-+        !(prog = link_program(s, vs_s, fs_s)))
-+        return -1;
-+
-+    glUseProgram(prog);
-+
-+    {
-+        static const float verts[] = {
-+            -1, -1,
-+            1, -1,
-+            1,  1,
-+            -1,  1,
-+        };
-+        glVertexAttribPointer(0, 2, GL_FLOAT, GL_FALSE, 0, verts);
-+    }
-+
-+    glEnableVertexAttribArray(0);
-+    return 0;
- }
- 
- static int egl_vout_write_trailer(AVFormatContext *s)
-@@ -400,12 +398,12 @@ static int egl_vout_write_trailer(AVFormatContext *s)
- 
- static int egl_vout_write_header(AVFormatContext *s)
- {
--    const AVCodecParameters * const par = s->streams[0]->codecpar;
-+    const AVCodecParameters *const par = s->streams[0]->codecpar;
- 
- #if TRACE_ALL
-     av_log(s, AV_LOG_INFO, "%s\n", __func__);
- #endif
--    if (   s->nb_streams > 1
-+    if (s->nb_streams > 1
-         || par->codec_type != AVMEDIA_TYPE_VIDEO
-         || par->codec_id   != AV_CODEC_ID_WRAPPED_AVFRAME) {
-         av_log(s, AV_LOG_ERROR, "Only supports one wrapped avframe stream\n");
-@@ -416,10 +414,10 @@ static int egl_vout_write_header(AVFormatContext *s)
- }
- 
- 
--static int do_display(AVFormatContext * const s, egl_display_env_t * const de, AVFrame * const frame)
-+static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVFrame *const frame)
- {
--    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor*)frame->data[0];
--    egl_aux_t * da = NULL;
-+    const AVDRMFrameDescriptor *desc = (AVDRMFrameDescriptor *)frame->data[0];
-+    egl_aux_t *da = NULL;
-     unsigned int i;
- 
- #if TRACE_ALL
-@@ -440,26 +438,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
- 
-     if (da->texture == 0) {
-         EGLint attribs[50];
--        EGLint * a = attribs;
-+        EGLint *a = attribs;
-         int i, j;
-         static const EGLint anames[] = {
--           EGL_DMA_BUF_PLANE0_FD_EXT,
--           EGL_DMA_BUF_PLANE0_OFFSET_EXT,
--           EGL_DMA_BUF_PLANE0_PITCH_EXT,
--           EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
--           EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
--           EGL_DMA_BUF_PLANE1_FD_EXT,
--           EGL_DMA_BUF_PLANE1_OFFSET_EXT,
--           EGL_DMA_BUF_PLANE1_PITCH_EXT,
--           EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
--           EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
--           EGL_DMA_BUF_PLANE2_FD_EXT,
--           EGL_DMA_BUF_PLANE2_OFFSET_EXT,
--           EGL_DMA_BUF_PLANE2_PITCH_EXT,
--           EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
--           EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
-+            EGL_DMA_BUF_PLANE0_FD_EXT,
-+            EGL_DMA_BUF_PLANE0_OFFSET_EXT,
-+            EGL_DMA_BUF_PLANE0_PITCH_EXT,
-+            EGL_DMA_BUF_PLANE0_MODIFIER_LO_EXT,
-+            EGL_DMA_BUF_PLANE0_MODIFIER_HI_EXT,
-+            EGL_DMA_BUF_PLANE1_FD_EXT,
-+            EGL_DMA_BUF_PLANE1_OFFSET_EXT,
-+            EGL_DMA_BUF_PLANE1_PITCH_EXT,
-+            EGL_DMA_BUF_PLANE1_MODIFIER_LO_EXT,
-+            EGL_DMA_BUF_PLANE1_MODIFIER_HI_EXT,
-+            EGL_DMA_BUF_PLANE2_FD_EXT,
-+            EGL_DMA_BUF_PLANE2_OFFSET_EXT,
-+            EGL_DMA_BUF_PLANE2_PITCH_EXT,
-+            EGL_DMA_BUF_PLANE2_MODIFIER_LO_EXT,
-+            EGL_DMA_BUF_PLANE2_MODIFIER_HI_EXT,
-         };
--        const EGLint * b = anames;
-+        const EGLint *b = anames;
- 
-         *a++ = EGL_WIDTH;
-         *a++ = av_frame_cropped_width(frame);
-@@ -470,8 +468,8 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
- 
-         for (i = 0; i < desc->nb_layers; ++i) {
-             for (j = 0; j < desc->layers[i].nb_planes; ++j) {
--                const AVDRMPlaneDescriptor * const p = desc->layers[i].planes + j;
--                const AVDRMObjectDescriptor * const obj = desc->objects + p->object_index;
-+                const AVDRMPlaneDescriptor *const p = desc->layers[i].planes + j;
-+                const AVDRMObjectDescriptor *const obj = desc->objects + p->object_index;
-                 *a++ = *b++;
-                 *a++ = obj->fd;
-                 *a++ = *b++;
-@@ -479,13 +477,13 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
-                 *a++ = *b++;
-                 *a++ = p->pitch;
-                 if (obj->format_modifier == 0) {
--                   b += 2;
-+                    b += 2;
-                 }
-                 else {
--                   *a++ = *b++;
--                   *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
--                   *a++ = *b++;
--                   *a++ = (EGLint)(obj->format_modifier >> 32);
-+                    *a++ = *b++;
-+                    *a++ = (EGLint)(obj->format_modifier & 0xFFFFFFFF);
-+                    *a++ = *b++;
-+                    *a++ = (EGLint)(obj->format_modifier >> 32);
-                 }
-             }
-         }
-@@ -494,26 +492,26 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
- 
- #if TRACE_ALL
-         for (a = attribs, i = 0; *a != EGL_NONE; a += 2, ++i) {
--           av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
-+            av_log(s, AV_LOG_INFO, "[%2d] %4x: %d\n", i, a[0], a[1]);
-         }
- #endif
-         {
--           const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
--                                              EGL_NO_CONTEXT,
--                                              EGL_LINUX_DMA_BUF_EXT,
--                                              NULL, attribs);
--           if (!image) {
--              av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
--              return -1;
--           }
--
--           glGenTextures(1, &da->texture);
--           glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
--           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
--           glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
--           glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
--
--           eglDestroyImageKHR(de->setup.egl_dpy, image);
-+            const EGLImage image = eglCreateImageKHR(de->setup.egl_dpy,
-+                                                     EGL_NO_CONTEXT,
-+                                                     EGL_LINUX_DMA_BUF_EXT,
-+                                                     NULL, attribs);
-+            if (!image) {
-+                av_log(s, AV_LOG_ERROR, "Failed to import fd %d\n", desc->objects[0].fd);
-+                return -1;
-+            }
-+
-+            glGenTextures(1, &da->texture);
-+            glBindTexture(GL_TEXTURE_EXTERNAL_OES, da->texture);
-+            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
-+            glTexParameteri(GL_TEXTURE_EXTERNAL_OES, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
-+            glEGLImageTargetTexture2DOES(GL_TEXTURE_EXTERNAL_OES, image);
-+
-+            eglDestroyImageKHR(de->setup.egl_dpy, image);
-         }
- 
-         da->fd = desc->objects[0].fd;
-@@ -540,7 +538,7 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
-                (long long)modifiers[1],
-                (long long)modifiers[2],
-                (long long)modifiers[3]
--               );
-+              );
- #endif
-     }
- 
-@@ -558,55 +556,55 @@ static int do_display(AVFormatContext * const s, egl_display_env_t * const de, A
-     return 0;
- }
- 
--static void * display_thread(void * v)
-+static void* display_thread(void *v)
- {
--    AVFormatContext * const s = v;
--    egl_display_env_t * const de = s->priv_data;
-+    AVFormatContext *const s = v;
-+    egl_display_env_t *const de = s->priv_data;
- 
- #if TRACE_ALL
-     av_log(s, AV_LOG_INFO, "<<< %s\n", __func__);
- #endif
-     {
--       EGLint egl_major, egl_minor;
--
--       de->setup.dpy = XOpenDisplay(NULL);
--       if (!de->setup.dpy) {
--          av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
--          goto fail;
--       }
--
--       de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
--       if (!de->setup.egl_dpy) {
--          av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
--          goto fail;
--       }
--
--       if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
--           av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
--           goto fail;
--       }
--
--       av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
--
--       if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
--          av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
--          goto fail;
--       }
-+        EGLint egl_major, egl_minor;
-+
-+        de->setup.dpy = XOpenDisplay(NULL);
-+        if (!de->setup.dpy) {
-+            av_log(s, AV_LOG_ERROR, "Couldn't open X display\n");
-+            goto fail;
-+        }
-+
-+        de->setup.egl_dpy = eglGetDisplay(de->setup.dpy);
-+        if (!de->setup.egl_dpy) {
-+            av_log(s, AV_LOG_ERROR, "eglGetDisplay() failed\n");
-+            goto fail;
-+        }
-+
-+        if (!eglInitialize(de->setup.egl_dpy, &egl_major, &egl_minor)) {
-+            av_log(s, AV_LOG_ERROR, "Error: eglInitialize() failed\n");
-+            goto fail;
-+        }
-+
-+        av_log(s, AV_LOG_INFO, "EGL version %d.%d\n", egl_major, egl_minor);
-+
-+        if (!epoxy_has_egl_extension(de->setup.egl_dpy, "EGL_KHR_image_base")) {
-+            av_log(s, AV_LOG_ERROR, "Missing EGL KHR image extension\n");
-+            goto fail;
-+        }
-     }
- 
-     if (!de->window_width || !de->window_height) {
--       de->window_width = 1280;
--       de->window_height = 720;
-+        de->window_width = 1280;
-+        de->window_height = 720;
-     }
-     if (make_window(s, de, de->setup.dpy, de->setup.egl_dpy, "ffmpeg-vout",
-                     &de->setup.win, &de->setup.ctx, &de->setup.surf)) {
--       av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
--       goto fail;
-+        av_log(s, AV_LOG_ERROR, "%s: make_window failed\n", __func__);
-+        goto fail;
-     }
- 
-     if (gl_setup(s)) {
--       av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
--       goto fail;
-+        av_log(s, AV_LOG_ERROR, "%s: gl_setup failed\n", __func__);
-+        goto fail;
-     }
- 
- #if TRACE_ALL
-@@ -615,7 +613,7 @@ static void * display_thread(void * v)
-     sem_post(&de->display_start_sem);
- 
-     for (;;) {
--        AVFrame * frame;
-+        AVFrame *frame;
- 
-         while (sem_wait(&de->q_sem) != 0) {
-             av_assert0(errno == EINTR);
-@@ -653,9 +651,9 @@ fail:
- 
- static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
- {
--    const AVFrame * const src_frame = (AVFrame *)pkt->data;
--    AVFrame * frame;
--    egl_display_env_t * const de = s->priv_data;
-+    const AVFrame *const src_frame = (AVFrame *)pkt->data;
-+    AVFrame *frame;
-+    egl_display_env_t *const de = s->priv_data;
- 
- #if TRACE_ALL
-     av_log(s, AV_LOG_INFO, "%s\n", __func__);
-@@ -668,8 +666,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
-     else if (src_frame->format == AV_PIX_FMT_VAAPI) {
-         frame = av_frame_alloc();
-         frame->format = AV_PIX_FMT_DRM_PRIME;
--        if (av_hwframe_map(frame, src_frame, 0) != 0)
--        {
-+        if (av_hwframe_map(frame, src_frame, 0) != 0) {
-             av_log(s, AV_LOG_WARNING, "Failed to map frame (format=%d) to DRM_PRiME\n", src_frame->format);
-             av_frame_free(&frame);
-             return AVERROR(EINVAL);
-@@ -682,12 +679,12 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
- 
-     // Really hacky sync
-     while (de->show_all && de->q_next) {
--       usleep(3000);
-+        usleep(3000);
-     }
- 
-     pthread_mutex_lock(&de->q_lock);
-     {
--        AVFrame * const t = de->q_next;
-+        AVFrame *const t = de->q_next;
-         de->q_next = frame;
-         frame = t;
-     }
-@@ -702,7 +699,7 @@ static int egl_vout_write_packet(AVFormatContext *s, AVPacket *pkt)
- }
- 
- static int egl_vout_write_frame(AVFormatContext *s, int stream_index, AVFrame **ppframe,
--                          unsigned flags)
-+                                unsigned flags)
- {
-     av_log(s, AV_LOG_ERROR, "%s: NIF: idx=%d, flags=%#x\n", __func__, stream_index, flags);
-     return AVERROR_PATCHWELCOME;
-@@ -713,7 +710,7 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si
- #if TRACE_ALL
-     av_log(s, AV_LOG_INFO, "%s: %d\n", __func__, type);
- #endif
--    switch(type) {
-+    switch (type) {
-     case AV_APP_TO_DEV_WINDOW_REPAINT:
-         return 0;
-     default:
-@@ -723,14 +720,14 @@ static int egl_vout_control_message(AVFormatContext *s, int type, void *data, si
- }
- 
- // deinit is called if init fails so no need to clean up explicity here
--static int egl_vout_init(struct AVFormatContext * s)
-+static int egl_vout_init(struct AVFormatContext *s)
- {
--    egl_display_env_t * const de = s->priv_data;
-+    egl_display_env_t *const de = s->priv_data;
-     unsigned int i;
- 
-     av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
- 
--    de->setup = (struct egl_setup){0};
-+    de->setup = (struct egl_setup) { 0 };
- 
-     for (i = 0; i != 32; ++i) {
-         de->aux[i].fd = -1;
-@@ -744,8 +741,8 @@ static int egl_vout_init(struct AVFormatContext * s)
- 
-     sem_wait(&de->display_start_sem);
-     if (de->q_terminate) {
--       av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
--       return -1;
-+        av_log(s, AV_LOG_ERROR, "%s: Display startup failure\n", __func__);
-+        return -1;
-     }
- 
-     av_log(s, AV_LOG_DEBUG, ">>> %s\n", __func__);
-@@ -753,9 +750,9 @@ static int egl_vout_init(struct AVFormatContext * s)
-     return 0;
- }
- 
--static void egl_vout_deinit(struct AVFormatContext * s)
-+static void egl_vout_deinit(struct AVFormatContext *s)
- {
--    egl_display_env_t * const de = s->priv_data;
-+    egl_display_env_t *const de = s->priv_data;
- 
-     av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
- 
-@@ -773,11 +770,11 @@ static void egl_vout_deinit(struct AVFormatContext * s)
- 
- #define OFFSET(x) offsetof(egl_display_env_t, x)
- static const AVOption options[] = {
--   { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
--   { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, {.str = NULL}, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
--   { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
--   { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    {.i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
--   { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   {.i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_size",  "set window forced size", OFFSET(window_width), AV_OPT_TYPE_IMAGE_SIZE, { .str = NULL }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_x",     "set window x offset",    OFFSET(window_x),     AV_OPT_TYPE_INT,    { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "window_y",     "set window y offset",    OFFSET(window_y),     AV_OPT_TYPE_INT,    { .i64 = 0 }, -INT_MAX, INT_MAX, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "fullscreen",   "set fullscreen display", OFFSET(fullscreen),   AV_OPT_TYPE_BOOL,   { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-     { NULL }
- 
- };
-
-From 4d3a3973a07994b0a6ec35626e514fc40f439fe3 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 12 Dec 2022 16:49:43 +0000
-Subject: [PATCH 098/136] v4l2m2m: reporganise get_raw_format for loop logic
-
----
- libavcodec/v4l2_context.c | 16 +++++-----------
- 1 file changed, 5 insertions(+), 11 deletions(-)
-
-diff --git a/libavcodec/v4l2_context.c b/libavcodec/v4l2_context.c
-index 7031f3d340..79a31cf930 100644
---- a/libavcodec/v4l2_context.c
-+++ b/libavcodec/v4l2_context.c
-@@ -828,28 +828,22 @@ static int v4l2_get_raw_format(V4L2Context* ctx, enum AVPixelFormat *p)
-             return 0;
-     }
- 
--    for (;;) {
-+    for (;; ++fdesc.index) {
-         ret = ioctl(ctx_to_m2mctx(ctx)->fd, VIDIOC_ENUM_FMT, &fdesc);
-         if (ret)
-             return AVERROR(EINVAL);
- 
-         if (priv->pix_fmt != AV_PIX_FMT_NONE) {
--            if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt)) {
--                fdesc.index++;
-+            if (fdesc.pixelformat != ff_v4l2_format_avfmt_to_v4l2(priv->pix_fmt))
-                 continue;
--            }
-         }
- 
-         pixfmt = ff_v4l2_format_v4l2_to_avfmt(fdesc.pixelformat, AV_CODEC_ID_RAWVIDEO);
-         ret = v4l2_try_raw_format(ctx, pixfmt);
--        if (ret){
--            fdesc.index++;
--            continue;
-+        if (ret == 0) {
-+            *p = pixfmt;
-+            return 0;
-         }
--
--        *p = pixfmt;
--
--        return 0;
-     }
- 
-     return AVERROR(EINVAL);
-
-From 123c5ef429ec6bd7d1875d621df88bb2ad7af0bd Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 12 Dec 2022 17:49:12 +0000
-Subject: [PATCH 099/136] drm_vout: Set zpos on the plane we pick to ensure it
- is at the front
-
----
- libavdevice/drm_vout.c | 38 +++++++++++++++++++++++++++++++++-----
- 1 file changed, 33 insertions(+), 5 deletions(-)
-
-diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
-index cfb33ce7c3..9bd9e04421 100644
---- a/libavdevice/drm_vout.c
-+++ b/libavdevice/drm_vout.c
-@@ -115,9 +115,11 @@ static int find_plane(struct AVFormatContext * const avctx,
- {
-    drmModePlaneResPtr planes;
-    drmModePlanePtr plane;
-+   drmModeObjectPropertiesPtr props = NULL;
-+   drmModePropertyPtr prop = NULL;
-    unsigned int i;
-    unsigned int j;
--   int ret = 0;
-+   int ret = -1;
- 
-    planes = drmModeGetPlaneResources(drmfd);
-    if (!planes)
-@@ -154,11 +156,37 @@ static int find_plane(struct AVFormatContext * const avctx,
-       break;
-    }
- 
--   if (i == planes->count_planes)
--      ret = -1;
-+   if (i == planes->count_planes) {
-+       ret = -1;
-+       goto fail;
-+   }
- 
--   drmModeFreePlaneResources(planes);
--   return ret;
-+    props = drmModeObjectGetProperties(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE);
-+    if (!props)
-+        goto fail;
-+    for (i = 0; i != props->count_props; ++i) {
-+        if (prop)
-+            drmModeFreeProperty(prop);
-+        prop = drmModeGetProperty(drmfd, props->props[i]);
-+        if (!prop)
-+            goto fail;
-+        if (strcmp("zpos", prop->name) == 0) {
-+            if (drmModeObjectSetProperty(drmfd, *pplane_id, DRM_MODE_OBJECT_PLANE, props->props[i], prop->values[1]) == 0)
-+                av_log(avctx, AV_LOG_DEBUG, "ZPOS set to %d\n", (int)prop->values[1]);
-+            else
-+                av_log(avctx, AV_LOG_WARNING, "Failed to set ZPOS on DRM plane\n");
-+            break;
-+        }
-+    }
-+
-+    ret = 0;
-+fail:
-+    if (props)
-+        drmModeFreeObjectProperties(props);
-+    if (prop)
-+        drmModeFreeProperty(prop);
-+    drmModeFreePlaneResources(planes);
-+    return ret;
- }
- 
- static void da_uninit(drm_display_env_t * const de, drm_aux_t * da)
-
-From 0ee1c3b41774d05595376f8d25de2a901dbb12c7 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 12 Dec 2022 17:51:46 +0000
-Subject: [PATCH 100/136] drm_vout: Only set modifier flag and pass modifiers
- if there are some
-
----
- libavdevice/drm_vout.c | 17 ++++++++++++-----
- 1 file changed, 12 insertions(+), 5 deletions(-)
-
-diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
-index 9bd9e04421..a56adea866 100644
---- a/libavdevice/drm_vout.c
-+++ b/libavdevice/drm_vout.c
-@@ -34,6 +34,7 @@
- 
- #include <xf86drm.h>
- #include <xf86drmMode.h>
-+#include <drm_fourcc.h>
- 
- #define TRACE_ALL 0
- 
-@@ -249,6 +250,7 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A
-         uint32_t offsets[4] = {0};
-         uint64_t modifiers[4] = {0};
-         uint32_t bo_handles[4] = {0};
-+        int has_mods = 0;
-         int i, j, n;
- 
-         da->frame = frame;
-@@ -258,6 +260,9 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A
-                 av_log(s, AV_LOG_WARNING, "drmPrimeFDToHandle[%d](%d) failed: %s\n", i, desc->objects[i].fd, ERRSTR);
-                 return -1;
-             }
-+            if (desc->objects[i].format_modifier != DRM_FORMAT_MOD_LINEAR &&
-+                desc->objects[i].format_modifier != DRM_FORMAT_MOD_INVALID)
-+                has_mods = 1;
-         }
- 
-         n = 0;
-@@ -299,11 +304,13 @@ static int do_display(AVFormatContext * const s, drm_display_env_t * const de, A
- #endif
- 
-         if (drmModeAddFB2WithModifiers(de->drm_fd,
--                                         av_frame_cropped_width(frame),
--                                         av_frame_cropped_height(frame),
--                                         desc->layers[0].format, bo_handles,
--                                         pitches, offsets, modifiers,
--                                         &da->fb_handle, DRM_MODE_FB_MODIFIERS /** 0 if no mods */) != 0) {
-+                                       av_frame_cropped_width(frame),
-+                                       av_frame_cropped_height(frame),
-+                                       desc->layers[0].format, bo_handles,
-+                                       pitches, offsets,
-+                                       has_mods ? modifiers : NULL,
-+                                       &da->fb_handle,
-+                                       has_mods ? DRM_MODE_FB_MODIFIERS : 0) != 0) {
-             av_log(s, AV_LOG_WARNING, "drmModeAddFB2WithModifiers failed: %s\n", ERRSTR);
-             return -1;
-         }
-
-From 4534e6981c1718eaeec4c5f58cdf5592ee7f0329 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 12 Dec 2022 17:52:58 +0000
-Subject: [PATCH 101/136] drm_vout: Fix typo in error message
-
----
- libavdevice/drm_vout.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
-index a56adea866..351abf1d60 100644
---- a/libavdevice/drm_vout.c
-+++ b/libavdevice/drm_vout.c
-@@ -596,7 +596,7 @@ static int drm_vout_init(struct AVFormatContext * s)
-     sem_init(&de->q_sem_out, 0, 0);
-     if (pthread_create(&de->q_thread, NULL, display_thread, s)) {
-         rv = AVERROR(errno);
--        av_log(s, AV_LOG_ERROR, "Failed to creatye display thread: %s\n", av_err2str(rv));
-+        av_log(s, AV_LOG_ERROR, "Failed to create display thread: %s\n", av_err2str(rv));
-         goto fail_close;
-     }
- 
-
-From 0469d1fb132a0d55593611c56e83733efe58045b Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 12 Dec 2022 18:00:41 +0000
-Subject: [PATCH 102/136] drm_vout: Add option to name the drm_module to use
-
----
- libavdevice/drm_vout.c | 8 +++++---
- 1 file changed, 5 insertions(+), 3 deletions(-)
-
-diff --git a/libavdevice/drm_vout.c b/libavdevice/drm_vout.c
-index 351abf1d60..491e1dc608 100644
---- a/libavdevice/drm_vout.c
-+++ b/libavdevice/drm_vout.c
-@@ -70,7 +70,9 @@ typedef struct drm_display_env_s
-     uint32_t con_id;
-     struct drm_setup setup;
-     enum AVPixelFormat avfmt;
-+
-     int show_all;
-+    const char * drm_module;
- 
-     unsigned int ano;
-     drm_aux_t aux[AUX_SIZE];
-@@ -569,7 +571,6 @@ static int drm_vout_init(struct AVFormatContext * s)
- {
-     drm_display_env_t * const de = s->priv_data;
-     int rv;
--    const char * drm_module = DRM_MODULE;
- 
-     av_log(s, AV_LOG_DEBUG, "<<< %s\n", __func__);
- 
-@@ -578,10 +579,10 @@ static int drm_vout_init(struct AVFormatContext * s)
-     de->setup = (struct drm_setup){0};
-     de->q_terminate = 0;
- 
--    if ((de->drm_fd = drmOpen(drm_module, NULL)) < 0)
-+    if ((de->drm_fd = drmOpen(de->drm_module, NULL)) < 0)
-     {
-         rv = AVERROR(errno);
--        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", drm_module, av_err2str(rv));
-+        av_log(s, AV_LOG_ERROR, "Failed to drmOpen %s: %s\n", de->drm_module, av_err2str(rv));
-         return rv;
-     }
- 
-@@ -641,6 +642,7 @@ static void drm_vout_deinit(struct AVFormatContext * s)
- #define OFFSET(x) offsetof(drm_display_env_t, x)
- static const AVOption options[] = {
-     { "show_all", "show all frames", OFFSET(show_all), AV_OPT_TYPE_BOOL, { .i64 = 0 }, 0, 1, AV_OPT_FLAG_ENCODING_PARAM },
-+    { "drm_module", "drm_module name to use, default=" DRM_MODULE, OFFSET(drm_module), AV_OPT_TYPE_STRING, { .str = DRM_MODULE }, 0, 0, AV_OPT_FLAG_ENCODING_PARAM },
-     { NULL }
- };
- 
-
-From 61cb9fc3ce06e0ecaeeec3add143bc3a82956853 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 13:01:00 +0000
-Subject: [PATCH 103/136] dmabufs: Rework to allow for non-CMA backends
-
----
- libavcodec/v4l2_req_dmabufs.c | 161 ++++++++++++++++++++++++----------
- 1 file changed, 116 insertions(+), 45 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
-index c4bbed18c6..1c3a5e861f 100644
---- a/libavcodec/v4l2_req_dmabufs.c
-+++ b/libavcodec/v4l2_req_dmabufs.c
-@@ -1,3 +1,4 @@
-+#include <stdatomic.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <unistd.h>
-@@ -19,9 +20,21 @@
- 
- #define TRACE_ALLOC 0
- 
-+struct dmabufs_ctl;
-+struct dmabuf_h;
-+
-+struct dmabuf_fns {
-+    int (*buf_alloc)(struct dmabufs_ctl * dbsc, struct dmabuf_h * dh, size_t size);
-+    void (*buf_free)(struct dmabuf_h * dh);
-+    int (*ctl_new)(struct dmabufs_ctl * dbsc);
-+    void (*ctl_free)(struct dmabufs_ctl * dbsc);
-+};
-+
- struct dmabufs_ctl {
-     int fd;
-     size_t page_size;
-+    void * v;
-+    const struct dmabuf_fns * fns;
- };
- 
- struct dmabuf_h {
-@@ -29,6 +42,8 @@ struct dmabuf_h {
-     size_t size;
-     size_t len;
-     void * mapptr;
-+    void * v;
-+    const struct dmabuf_fns * fns;
- };
- 
- #if TRACE_ALLOC
-@@ -88,15 +103,8 @@ struct dmabuf_h * dmabuf_import(int fd, size_t size)
- struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * old, size_t size)
- {
-     struct dmabuf_h * dh;
--    struct dma_heap_allocation_data data = {
--        .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
--        .fd = 0,
--        .fd_flags = O_RDWR,
--        .heap_flags = 0
--    };
--
-     if (old != NULL) {
--        if (old->size == data.len) {
-+        if (old->size >= size) {
-             return old;
-         }
-         dmabuf_free(old);
-@@ -106,24 +114,16 @@ struct dmabuf_h * dmabuf_realloc(struct dmabufs_ctl * dbsc, struct dmabuf_h * ol
-         (dh = malloc(sizeof(*dh))) == NULL)
-         return NULL;
- 
--    while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
--        int err = errno;
--        request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
--                (uint64_t)data.len,
--                dbsc->fd,
--                err,
--                strerror(err));
--        if (err == EINTR)
--            continue;
--        goto fail;
--    }
--
-     *dh = (struct dmabuf_h){
--        .fd = data.fd,
--        .size = (size_t)data.len,
--        .mapptr = MAP_FAILED
-+        .fd = -1,
-+        .mapptr = MAP_FAILED,
-+        .fns = dbsc->fns
-     };
- 
-+    if (dh->fns->buf_alloc(dbsc, dh, size) != 0)
-+        goto fail;
-+
-+
- #if TRACE_ALLOC
-     ++total_bufs;
-     total_size += dh->size;
-@@ -220,8 +220,6 @@ void dmabuf_len_set(struct dmabuf_h * const dh, const size_t len)
-     dh->len = len;
- }
- 
--
--
- void dmabuf_free(struct dmabuf_h * dh)
- {
-     if (!dh)
-@@ -233,20 +231,63 @@ void dmabuf_free(struct dmabuf_h * dh)
-     request_log("%s: Free: %zd, total=%zd, bufs=%d\n", __func__, dh->size, total_size, total_bufs);
- #endif
- 
--    if (dh->mapptr != MAP_FAILED)
-+    dh->fns->buf_free(dh);
-+
-+    if (dh->mapptr != MAP_FAILED && dh->mapptr != NULL)
-         munmap(dh->mapptr, dh->size);
--    while (close(dh->fd) == -1 && errno == EINTR)
--        /* loop */;
-+    if (dh->fd != -1)
-+        while (close(dh->fd) == -1 && errno == EINTR)
-+            /* loop */;
-     free(dh);
- }
- 
--struct dmabufs_ctl * dmabufs_ctl_new(void)
-+static struct dmabufs_ctl * dmabufs_ctl_new2(const struct dmabuf_fns * const fns)
- {
--    struct dmabufs_ctl * dbsc = malloc(sizeof(*dbsc));
-+    struct dmabufs_ctl * dbsc = calloc(1, sizeof(*dbsc));
- 
-     if (!dbsc)
-         return NULL;
- 
-+    dbsc->fd = -1;
-+    dbsc->fns = fns;
-+    dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
-+
-+    if (fns->ctl_new(dbsc) != 0)
-+        goto fail;
-+
-+    return dbsc;
-+
-+fail:
-+    free(dbsc);
-+    return NULL;
-+}
-+
-+static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc)
-+{
-+    request_debug(NULL, "Free dmabuf ctl\n");
-+
-+    dbsc->fns->ctl_free(dbsc);
-+
-+    free(dbsc);
-+}
-+
-+void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
-+{
-+    struct dmabufs_ctl * const dbsc = *pDbsc;
-+
-+    if (!dbsc)
-+        return;
-+    *pDbsc = NULL;
-+
-+    dmabufs_ctl_free(dbsc);
-+}
-+
-+//-----------------------------------------------------------------------------
-+//
-+// Alloc dmabuf via CMA
-+
-+static int ctl_cma_new(struct dmabufs_ctl * dbsc)
-+{
-     while ((dbsc->fd = open(DMABUF_NAME1, O_RDWR)) == -1 &&
-            errno == EINTR)
-         /* Loop */;
-@@ -258,31 +299,61 @@ struct dmabufs_ctl * dmabufs_ctl_new(void)
-         if (dbsc->fd == -1) {
-             request_log("Unable to open either %s or %s\n",
-                     DMABUF_NAME1, DMABUF_NAME2);
--            goto fail;
-+            return -1;
-         }
-     }
-+    return 0;
-+}
- 
--    dbsc->page_size = (size_t)sysconf(_SC_PAGE_SIZE);
--
--    return dbsc;
-+static void ctl_cma_free(struct dmabufs_ctl * dbsc)
-+{
-+    if (dbsc->fd != -1)
-+        while (close(dbsc->fd) == -1 && errno == EINTR)
-+            /* loop */;
- 
--fail:
--    free(dbsc);
--    return NULL;
- }
- 
--void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
-+static int buf_cma_alloc(struct dmabufs_ctl * const dbsc, struct dmabuf_h * dh, size_t size)
- {
--    struct dmabufs_ctl * const dbsc = *pDbsc;
-+    struct dma_heap_allocation_data data = {
-+        .len = (size + dbsc->page_size - 1) & ~(dbsc->page_size - 1),
-+        .fd = 0,
-+        .fd_flags = O_RDWR,
-+        .heap_flags = 0
++    /* Source buffers are only as big as needed, since any over-read won't affect results */
++    LOCAL_ALIGNED_16(int16_t, src0, [64]);
++    LOCAL_ALIGNED_16(int16_t, src1, [64]);
++    /* Destination buffers have borders of one row above/below and 8 columns left/right to catch overflows */
++    LOCAL_ALIGNED_8(uint8_t, dst0, [10 * 24]);
++    LOCAL_ALIGNED_8(uint8_t, dst1, [10 * 24]);
++
++    AVCodecContext avctx = { 0 };
++    IDCTDSPContext h;
++
++    const test tests[] = {
++        IDCTDSP_TEST(add_pixels_clamped)
++        IDCTDSP_TEST(put_pixels_clamped)
++        IDCTDSP_TEST(put_signed_pixels_clamped)
 +    };
- 
--    if (!dbsc)
--        return;
--    *pDbsc = NULL;
-+    while (ioctl(dbsc->fd, DMA_HEAP_IOCTL_ALLOC, &data)) {
-+        int err = errno;
-+        request_log("Failed to alloc %" PRIu64 " from dma-heap(fd=%d): %d (%s)\n",
-+                (uint64_t)data.len,
-+                dbsc->fd,
-+                err,
-+                strerror(err));
-+        if (err == EINTR)
-+            continue;
-+        return -err;
-+    }
- 
--    while (close(dbsc->fd) == -1 && errno == EINTR)
--        /* loop */;
-+    dh->fd = data.fd;
-+    dh->size = (size_t)data.len;
-+    return 0;
-+}
- 
--    free(dbsc);
-+static void buf_cma_free(struct dmabuf_h * dh)
-+{
-+    // Nothing needed
- }
- 
-+static const struct dmabuf_fns dmabuf_cma_fns = {
-+    .buf_alloc  = buf_cma_alloc,
-+    .buf_free   = buf_cma_free,
-+    .ctl_new    = ctl_cma_new,
-+    .ctl_free   = ctl_cma_free,
-+};
 +
-+struct dmabufs_ctl * dmabufs_ctl_new(void)
-+{
-+    request_debug(NULL, "Dmabufs using CMA\n");;
-+    return dmabufs_ctl_new2(&dmabuf_cma_fns);
-+}
- 
-
-From 288807720443bbddf4c83c3589d1877c7fd418c3 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 13:07:58 +0000
-Subject: [PATCH 104/136] dmabufs: Use unref rather than deleet on cmabufs_ctl
-
----
- libavcodec/v4l2_req_dmabufs.c  | 12 +++++++++++-
- libavcodec/v4l2_req_dmabufs.h  |  3 ++-
- libavcodec/v4l2_request_hevc.c |  4 ++--
- 3 files changed, 15 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/v4l2_req_dmabufs.c b/libavcodec/v4l2_req_dmabufs.c
-index 1c3a5e861f..acc0366e76 100644
---- a/libavcodec/v4l2_req_dmabufs.c
-+++ b/libavcodec/v4l2_req_dmabufs.c
-@@ -31,6 +31,7 @@ struct dmabuf_fns {
- };
- 
- struct dmabufs_ctl {
-+    atomic_int ref_count;
-     int fd;
-     size_t page_size;
-     void * v;
-@@ -271,7 +272,7 @@ static void dmabufs_ctl_free(struct dmabufs_ctl * const dbsc)
-     free(dbsc);
- }
- 
--void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
-+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pDbsc)
- {
-     struct dmabufs_ctl * const dbsc = *pDbsc;
- 
-@@ -279,9 +280,18 @@ void dmabufs_ctl_delete(struct dmabufs_ctl ** const pDbsc)
-         return;
-     *pDbsc = NULL;
- 
-+    if (atomic_fetch_sub(&dbsc->ref_count, 1) != 0)
-+        return;
++    ff_idctdsp_init(&h, &avctx);
 +
-     dmabufs_ctl_free(dbsc);
- }
- 
-+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc)
-+{
-+    atomic_fetch_add(&dbsc->ref_count, 1);
-+    return dbsc;
-+}
-+
- //-----------------------------------------------------------------------------
- //
- // Alloc dmabuf via CMA
-diff --git a/libavcodec/v4l2_req_dmabufs.h b/libavcodec/v4l2_req_dmabufs.h
-index c1d3d8c8d7..381ba2708d 100644
---- a/libavcodec/v4l2_req_dmabufs.h
-+++ b/libavcodec/v4l2_req_dmabufs.h
-@@ -7,7 +7,8 @@ struct dmabufs_ctl;
- struct dmabuf_h;
- 
- struct dmabufs_ctl * dmabufs_ctl_new(void);
--void dmabufs_ctl_delete(struct dmabufs_ctl ** const pdbsc);
-+void dmabufs_ctl_unref(struct dmabufs_ctl ** const pdbsc);
-+struct dmabufs_ctl * dmabufs_ctl_ref(struct dmabufs_ctl * const dbsc);
- 
- // Need not preserve old contents
- // On NULL return old buffer is freed
-diff --git a/libavcodec/v4l2_request_hevc.c b/libavcodec/v4l2_request_hevc.c
-index 767ecb036a..db7ed13b6d 100644
---- a/libavcodec/v4l2_request_hevc.c
-+++ b/libavcodec/v4l2_request_hevc.c
-@@ -105,7 +105,7 @@ static int v4l2_request_hevc_uninit(AVCodecContext *avctx)
-     mediabufs_ctl_unref(&ctx->mbufs);
-     media_pool_delete(&ctx->mpool);
-     pollqueue_unref(&ctx->pq);
--    dmabufs_ctl_delete(&ctx->dbufs);
-+    dmabufs_ctl_unref(&ctx->dbufs);
-     devscan_delete(&ctx->devscan);
- 
-     decode_q_uninit(&ctx->decode_q);
-@@ -324,7 +324,7 @@ fail3:
- fail2:
-     pollqueue_unref(&ctx->pq);
- fail1:
--    dmabufs_ctl_delete(&ctx->dbufs);
-+    dmabufs_ctl_unref(&ctx->dbufs);
- fail0:
-     devscan_delete(&ctx->devscan);
-     return ret;
-
-From 9115f40c5f55873102312085f2e328d1a2101ae4 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 14:21:40 +0000
-Subject: [PATCH 105/136] egl_vout: Remove redundant & completely broken debug
-
----
- libavdevice/egl_vout.c | 25 -------------------------
- 1 file changed, 25 deletions(-)
-
-diff --git a/libavdevice/egl_vout.c b/libavdevice/egl_vout.c
-index a52cabb082..afc7afd13e 100644
---- a/libavdevice/egl_vout.c
-+++ b/libavdevice/egl_vout.c
-@@ -515,31 +515,6 @@ static int do_display(AVFormatContext *const s, egl_display_env_t *const de, AVF
-         }
- 
-         da->fd = desc->objects[0].fd;
--
--#if 0
--        av_log(s, AV_LOG_INFO, "%dx%d, fmt: %x, boh=%d,%d,%d,%d, pitch=%d,%d,%d,%d,"
--               " offset=%d,%d,%d,%d, mod=%llx,%llx,%llx,%llx\n",
--               av_frame_cropped_width(frame),
--               av_frame_cropped_height(frame),
--               desc->layers[0].format,
--               bo_plane_handles[0],
--               bo_plane_handles[1],
--               bo_plane_handles[2],
--               bo_plane_handles[3],
--               pitches[0],
--               pitches[1],
--               pitches[2],
--               pitches[3],
--               offsets[0],
--               offsets[1],
--               offsets[2],
--               offsets[3],
--               (long long)modifiers[0],
--               (long long)modifiers[1],
--               (long long)modifiers[2],
--               (long long)modifiers[3]
--              );
--#endif
-     }
- 
-     glClearColor(0.5, 0.5, 0.5, 0.5);
-
-From 34711d5a1429213b6f4cf8ad163e8e8d108626e7 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 16:12:12 +0000
-Subject: [PATCH 106/136] v4l2m2m: Use offset from querybuf rather than always
- 0
-
----
- libavcodec/v4l2_buffers.c | 4 +++-
- libavcodec/v4l2_buffers.h | 3 ++-
- 2 files changed, 5 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 9ef2f40e39..5ca58ea593 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -379,7 +379,7 @@ static uint8_t * v4l2_get_drm_frame(V4L2Buffer *avbuf)
- 
-     for (int i = 0; i < avbuf->num_planes; i++) {
-         layer->planes[i].object_index = i;
--        layer->planes[i].offset = 0;
-+        layer->planes[i].offset = avbuf->plane_info[i].offset;
-         layer->planes[i].pitch = avbuf->plane_info[i].bytesperline;
-     }
- 
-@@ -934,6 +934,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
- 
-         if (V4L2_TYPE_IS_MULTIPLANAR(ctx->type)) {
-             avbuf->plane_info[i].length = avbuf->buf.m.planes[i].length;
-+            avbuf->plane_info[i].offset = avbuf->buf.m.planes[i].data_offset;
- 
-             if (want_mmap)
-                 avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.m.planes[i].length,
-@@ -941,6 +942,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
-                                                buf_to_m2mctx(avbuf)->fd, avbuf->buf.m.planes[i].m.mem_offset);
-         } else {
-             avbuf->plane_info[i].length = avbuf->buf.length;
-+            avbuf->plane_info[i].offset = 0;
- 
-             if (want_mmap)
-                 avbuf->plane_info[i].mm_addr = mmap(NULL, avbuf->buf.length,
-diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
-index 1ac32c5989..d91d5d1dd0 100644
---- a/libavcodec/v4l2_buffers.h
-+++ b/libavcodec/v4l2_buffers.h
-@@ -66,7 +66,8 @@ typedef struct V4L2Buffer {
- 
-     /* keep track of the mmap address and mmap length */
-     struct V4L2Plane_info {
--        int bytesperline;
-+        size_t bytesperline;
-+        size_t offset;
-         void * mm_addr;
-         size_t length;
-     } plane_info[VIDEO_MAX_PLANES];
-
-From 15458be3fe79c14f4fdcc2ad786508d1b647c914 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 17:57:27 +0000
-Subject: [PATCH 107/136] v4l2m2m: Fix crash if init errors out before setting
- avctx
-
----
- libavcodec/v4l2_m2m.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index 1e30d15fd8..ac6bae0dc3 100644
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -278,7 +278,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
- 
-     av_log(s->avctx, AV_LOG_DEBUG, "V4L2 Codec end\n");
- 
--    if (av_codec_is_decoder(s->avctx->codec))
-+    if (s->avctx && av_codec_is_decoder(s->avctx->codec))
-         av_packet_unref(&s->buf_pkt);
- 
-     if (s->fd >= 0) {
-
-From 9f7f94c680b8aaedede9b3bcad37b645216cfcff Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 18:10:30 +0000
-Subject: [PATCH 108/136] v4l2_buffers: Add and use ctx_to_m2mctx + error debug
-
----
- libavcodec/v4l2_buffers.c | 22 +++++++++++++++-------
- 1 file changed, 15 insertions(+), 7 deletions(-)
-
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index 5ca58ea593..e28ef2d1e8 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -41,11 +41,16 @@
- #define USEC_PER_SEC 1000000
- static const AVRational v4l2_timebase = { 1, USEC_PER_SEC };
- 
-+static inline V4L2m2mContext *ctx_to_m2mctx(const V4L2Context *ctx)
-+{
-+    return V4L2_TYPE_IS_OUTPUT(ctx->type) ?
-+        container_of(ctx, V4L2m2mContext, output) :
-+        container_of(ctx, V4L2m2mContext, capture);
-+}
-+
- static inline V4L2m2mContext *buf_to_m2mctx(const V4L2Buffer * const buf)
- {
--    return V4L2_TYPE_IS_OUTPUT(buf->context->type) ?
--        container_of(buf->context, V4L2m2mContext, output) :
--        container_of(buf->context, V4L2m2mContext, capture);
-+    return ctx_to_m2mctx(buf->context);
- }
- 
- static inline AVCodecContext *logger(const V4L2Buffer * const buf)
-@@ -883,6 +888,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
-     int ret, i;
-     V4L2Buffer * const avbuf = av_mallocz(sizeof(*avbuf));
-     AVBufferRef * bufref;
-+    V4L2m2mContext * const s = ctx_to_m2mctx(ctx);
- 
-     *pbufref = NULL;
-     if (avbuf == NULL)
-@@ -910,7 +916,7 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
-         avbuf->buf.m.planes = avbuf->planes;
-     }
- 
--    ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_QUERYBUF, &avbuf->buf);
-+    ret = ioctl(s->fd, VIDIOC_QUERYBUF, &avbuf->buf);
-     if (ret < 0)
-         goto fail;
- 
-@@ -969,10 +975,12 @@ int ff_v4l2_buffer_initialize(AVBufferRef ** pbufref, int index, V4L2Context *ct
-     }
- 
-     if (!V4L2_TYPE_IS_OUTPUT(ctx->type)) {
--        if (buf_to_m2mctx(avbuf)->output_drm) {
-+        if (s->output_drm) {
-             ret = v4l2_buffer_export_drm(avbuf);
--            if (ret)
--                    goto fail;
-+            if (ret) {
-+                av_log(logger(avbuf), AV_LOG_ERROR, "Failed to get exported drm handles\n");
-+                goto fail;
-+            }
-         }
-     }
- 
-
-From 6b8bb2c41828351cd3a6f40be353696ae36450b7 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 18:53:22 +0000
-Subject: [PATCH 109/136] v4l2m2m: Add ability to use cma alloced dmabufs as
- well as v4l2 mmap
-
----
- libavcodec/Makefile       |  2 +-
- libavcodec/v4l2_buffers.c | 65 ++++++++++++++++++++++++++-------------
- libavcodec/v4l2_buffers.h |  2 ++
- libavcodec/v4l2_m2m.c     |  6 +++-
- libavcodec/v4l2_m2m.h     |  4 +++
- libavcodec/v4l2_m2m_dec.c | 16 ++++++++++
- 6 files changed, 71 insertions(+), 24 deletions(-)
-
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 11f183c9b9..8b1d669834 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -170,7 +170,7 @@ OBJS-$(CONFIG_VP3DSP)                  += vp3dsp.o
- OBJS-$(CONFIG_VP56DSP)                 += vp56dsp.o
- OBJS-$(CONFIG_VP8DSP)                  += vp8dsp.o
- OBJS-$(CONFIG_V4L2_M2M)                += v4l2_m2m.o v4l2_context.o v4l2_buffers.o v4l2_fmt.o\
--                                          weak_link.o
-+                                          weak_link.o v4l2_req_dmabufs.o
- OBJS-$(CONFIG_V4L2_REQUEST)            += v4l2_req_media.o v4l2_req_pollqueue.o v4l2_req_dmabufs.o\
- 					  v4l2_req_devscan.o weak_link.o
- OBJS-$(CONFIG_WMA_FREQS)               += wma_freqs.o
-diff --git a/libavcodec/v4l2_buffers.c b/libavcodec/v4l2_buffers.c
-index e28ef2d1e8..8d80d19788 100644
---- a/libavcodec/v4l2_buffers.c
-+++ b/libavcodec/v4l2_buffers.c
-@@ -36,6 +36,7 @@
- #include "v4l2_context.h"
- #include "v4l2_buffers.h"
- #include "v4l2_m2m.h"
-+#include "v4l2_req_dmabufs.h"
- #include "weak_link.h"
- 
- #define USEC_PER_SEC 1000000
-@@ -477,33 +478,46 @@ static void v4l2_free_bufref(void *opaque, uint8_t *data)
-     av_buffer_unref(&bufref);
- }
- 
-+static inline uint32_t ff_v4l2_buf_len(const struct v4l2_buffer * b, unsigned int i)
-+{
-+    return V4L2_TYPE_IS_MULTIPLANAR(b->type) ? b->m.planes[i].length : b->length;
-+}
-+
- static int v4l2_buffer_export_drm(V4L2Buffer* avbuf)
- {
--    struct v4l2_exportbuffer expbuf;
-     int i, ret;
-+    const V4L2m2mContext * const s = buf_to_m2mctx(avbuf);
- 
-     for (i = 0; i < avbuf->num_planes; i++) {
--        memset(&expbuf, 0, sizeof(expbuf));
--
--        expbuf.index = avbuf->buf.index;
--        expbuf.type = avbuf->buf.type;
--        expbuf.plane = i;
-+        int dma_fd = -1;
-+        const uint32_t blen = ff_v4l2_buf_len(&avbuf->buf, i);
-+
-+        if (s->db_ctl != NULL) {
-+            if ((avbuf->dmabuf[i] = dmabuf_alloc(s->db_ctl, blen)) == NULL)
-+                return AVERROR(ENOMEM);
-+            dma_fd = dmabuf_fd(avbuf->dmabuf[i]);
-+            if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type))
-+                avbuf->buf.m.planes[i].m.fd = dma_fd;
-+            else
-+                avbuf->buf.m.fd = dma_fd;
-+        }
-+        else {
-+            struct v4l2_exportbuffer expbuf;
-+            memset(&expbuf, 0, sizeof(expbuf));
- 
--        ret = ioctl(buf_to_m2mctx(avbuf)->fd, VIDIOC_EXPBUF, &expbuf);
--        if (ret < 0)
--            return AVERROR(errno);
-+            expbuf.index = avbuf->buf.index;
-+            expbuf.type = avbuf->buf.type;
-+            expbuf.plane = i;
- 
--        if (V4L2_TYPE_IS_MULTIPLANAR(avbuf->buf.type)) {
--            /* drm frame */
--            avbuf->drm_frame.objects[i].size = avbuf->buf.m.planes[i].length;
--            avbuf->drm_frame.objects[i].fd = expbuf.fd;
--            avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
--        } else {
--            /* drm frame */
--            avbuf->drm_frame.objects[0].size = avbuf->buf.length;
--            avbuf->drm_frame.objects[0].fd = expbuf.fd;
--            avbuf->drm_frame.objects[0].format_modifier = DRM_FORMAT_MOD_LINEAR;
-+            ret = ioctl(s->fd, VIDIOC_EXPBUF, &expbuf);
-+            if (ret < 0)
-+                return AVERROR(errno);
-+            dma_fd = expbuf.fd;
-         }
-+
-+        avbuf->drm_frame.objects[i].size = blen;
-+        avbuf->drm_frame.objects[i].fd = dma_fd;
-+        avbuf->drm_frame.objects[i].format_modifier = DRM_FORMAT_MOD_LINEAR;
-     }
- 
-     return 0;
-@@ -870,9 +884,16 @@ static void v4l2_buffer_buffer_free(void *opaque, uint8_t *data)
-             munmap(p->mm_addr, p->length);
-     }
- 
--    for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
--        if (avbuf->drm_frame.objects[i].fd != -1)
--            close(avbuf->drm_frame.objects[i].fd);
-+    if (avbuf->dmabuf[0] == NULL) {
-+        for (i = 0; i != FF_ARRAY_ELEMS(avbuf->drm_frame.objects); ++i) {
-+            if (avbuf->drm_frame.objects[i].fd != -1)
-+                close(avbuf->drm_frame.objects[i].fd);
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(const int16_t *, uint8_t *, ptrdiff_t) = *(void **)((intptr_t) &h + tests[t].offset);
++        if (check_func(func, "idctdsp.%s", tests[t].name)) {
++            declare_func_emms(AV_CPU_FLAG_MMX, void, const int16_t *, uint8_t *, ptrdiff_t);
++            RANDOMIZE_BUFFER16(src, 64);
++            RANDOMIZE_BUFFER8(dst, 10 * 24);
++            call_ref(src0, dst0 + 24 + 8, 24);
++            call_new(src1, dst1 + 24 + 8, 24);
++            if (memcmp(dst0, dst1, 10 * 24))
++                fail();
++            bench_new(src1, dst1 + 24 + 8, 24);
 +        }
 +    }
-+    else {
-+        for (i = 0; i != FF_ARRAY_ELEMS(avbuf->dmabuf); ++i) {
-+            dmabuf_free(avbuf->dmabuf[i]);
-+        }
-     }
- 
-     av_buffer_unref(&avbuf->ref_buf);
-diff --git a/libavcodec/v4l2_buffers.h b/libavcodec/v4l2_buffers.h
-index d91d5d1dd0..444ad94b14 100644
---- a/libavcodec/v4l2_buffers.h
-+++ b/libavcodec/v4l2_buffers.h
-@@ -46,6 +46,7 @@ enum V4L2Buffer_status {
-  */
- struct V4L2Context;
- struct ff_weak_link_client;
-+struct dmabuf_h;
- 
- typedef struct V4L2Buffer {
-     /* each buffer needs to have a reference to its context
-@@ -80,6 +81,7 @@ typedef struct V4L2Buffer {
- 
-     enum V4L2Buffer_status status;
- 
-+    struct dmabuf_h * dmabuf[VIDEO_MAX_PLANES]; // If externally alloced dmabufs - stash other info here
- } V4L2Buffer;
- 
- /**
-diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index ac6bae0dc3..f802687b1b 100644
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -34,6 +34,7 @@
- #include "v4l2_context.h"
- #include "v4l2_fmt.h"
- #include "v4l2_m2m.h"
-+#include "v4l2_req_dmabufs.h"
- 
- static void
- xlat_init(xlat_track_t * const x)
-@@ -75,7 +76,7 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
- 
-     s->capture.done = s->output.done = 0;
-     s->capture.name = "capture";
--    s->capture.buf_mem = V4L2_MEMORY_MMAP;
-+    s->capture.buf_mem = s->db_ctl != NULL ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
-     s->output.name = "output";
-     s->output.buf_mem = s->input_drm ? V4L2_MEMORY_DMABUF : V4L2_MEMORY_MMAP;
-     atomic_init(&s->refcount, 0);
-@@ -94,12 +95,14 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
-     if (v4l2_mplane_video(&cap)) {
-         s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE;
-         s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE;
-+        s->output.format.type = s->output.type;
-         return 0;
-     }
- 
-     if (v4l2_splane_video(&cap)) {
-         s->capture.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
-         s->output.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
-+        s->output.format.type = s->output.type;
-         return 0;
-     }
- 
-@@ -293,6 +296,7 @@ int ff_v4l2_m2m_codec_end(V4L2m2mPriv *priv)
- 
-     ff_v4l2_context_release(&s->output);
- 
-+    dmabufs_ctl_unref(&s->db_ctl);
-     close(s->fd);
-     s->fd = -1;
- 
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 26a7161042..0f41f94694 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -71,6 +71,8 @@ typedef struct xlat_track_s {
-     V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
- } xlat_track_t;
- 
-+struct dmabufs_ctl;
++}
 +
- typedef struct V4L2m2mContext {
-     char devname[PATH_MAX];
-     int fd;
-@@ -124,6 +126,7 @@ typedef struct V4L2m2mContext {
-     /* Quirks */
-     unsigned int quirks;
- 
-+    struct dmabufs_ctl * db_ctl;
- } V4L2m2mContext;
- 
- typedef struct V4L2m2mPriv {
-@@ -134,6 +137,7 @@ typedef struct V4L2m2mPriv {
- 
-     int num_output_buffers;
-     int num_capture_buffers;
-+    const char * dmabuf_alloc;
-     enum AVPixelFormat pix_fmt;
- } V4L2m2mPriv;
- 
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 47b2735f82..4d17057298 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -41,6 +41,7 @@
- #include "v4l2_context.h"
- #include "v4l2_m2m.h"
- #include "v4l2_fmt.h"
-+#include "v4l2_req_dmabufs.h"
- 
- // Pick 64 for max last count - that is >1sec at 60fps
- #define STATS_LAST_COUNT_MAX 64
-@@ -896,6 +897,20 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-         s->output_drm = 0;
-     }
- 
-+    s->db_ctl = NULL;
-+    if (priv->dmabuf_alloc != NULL && strcmp(priv->dmabuf_alloc, "v4l2") != 0) {
-+        if (strcmp(priv->dmabuf_alloc, "cma") == 0)
-+            s->db_ctl = dmabufs_ctl_new();
-+        else {
-+            av_log(avctx, AV_LOG_ERROR, "Unknown dmabuf alloc method: '%s'\n", priv->dmabuf_alloc);
-+            return AVERROR(EINVAL);
-+        }
-+        if (!s->db_ctl) {
-+            av_log(avctx, AV_LOG_ERROR, "Can't open dmabuf provider '%s'\n", priv->dmabuf_alloc);
-+            return AVERROR(ENOMEM);
-+        }
-+    }
-+
-     s->device_ref = av_hwdevice_ctx_alloc(AV_HWDEVICE_TYPE_DRM);
-     if (!s->device_ref) {
-         ret = AVERROR(ENOMEM);
-@@ -1000,6 +1015,7 @@ static const AVOption options[] = {
-     { "num_capture_buffers", "Number of buffers in the capture context",
-         OFFSET(num_capture_buffers), AV_OPT_TYPE_INT, {.i64 = 20}, 2, INT_MAX, FLAGS },
-     { "pixel_format", "Pixel format to be used by the decoder", OFFSET(pix_fmt), AV_OPT_TYPE_PIXEL_FMT, {.i64 = AV_PIX_FMT_NONE}, AV_PIX_FMT_NONE, AV_PIX_FMT_NB, FLAGS },
-+    { "dmabuf_alloc", "Dmabuf alloc method", OFFSET(dmabuf_alloc), AV_OPT_TYPE_STRING, {.str = NULL}, 0, 0, FLAGS },
-     { NULL},
- };
- 
-
-From 499bcdc4ed82c737ceab166a07b46e8ed8ccbc88 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Dec 2022 19:05:47 +0000
-Subject: [PATCH 110/136] testfilt: Skeleton of hw filter test code
-
----
- pi-util/testfilt.py | 83 +++++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 83 insertions(+)
- create mode 100755 pi-util/testfilt.py
-
-diff --git a/pi-util/testfilt.py b/pi-util/testfilt.py
-new file mode 100755
-index 0000000000..b322dac0c2
++void checkasm_check_idctdsp(void)
++{
++    check_add_put_clamped(); /* checks the add/put/put_signed pixels_clamped functions against the reference */
++    report("idctdsp");
++}
+diff --git a/tests/checkasm/rpi_sand.c b/tests/checkasm/rpi_sand.c
+new file mode 100644
+index 0000000000..0888714c4c
 --- /dev/null
-+++ b/pi-util/testfilt.py
-@@ -0,0 +1,83 @@
-+#!/usr/bin/env python3
++++ b/tests/checkasm/rpi_sand.c
+@@ -0,0 +1,118 @@
++/*
++ * Copyright (c) 2023 John Cox
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
 +
-+import string
-+import os
-+import subprocess
-+import re
-+import argparse
-+import sys
-+import csv
-+from stat import *
++#include <string.h>
++#include "checkasm.h"
++#include "libavutil/common.h"
++#include "libavutil/rpi_sand_fns.h"
 +
-+class validator:
-+    def __init__(self):
-+        self.ok = False
-+
-+    def isok(self):
-+        return self.ok
-+
-+    def setok(self):
-+        self.ok = True
-+
-+class valid_regex(validator):
-+    def __init__(self, regex):
-+        super().__init__()
-+        self.regex = re.compile(regex)
-+
-+    def scanline(self, line):
-+        if self.isok() or self.regex.search(line):
-+            self.setok()
-+
-+
-+def validate(validators, flog):
-+    for line in flog:
-+        for v in validators:
-+            v.scanline(line)
-+
-+    ok = True
-+    for v in validators:
-+        if not v.isok():
-+            ok = False
-+            # complain
-+            print("Test failed")
-+
-+    if ok:
-+        print("OK")
-+    return ok
-+
-+def runtest(name, ffmpeg, args, suffix, validators):
-+    log_root = os.path.join("/tmp", "testfilt", name)
-+    ofilename = os.path.join(log_root, name + suffix)
-+
-+    if not os.path.exists(log_root):
-+        os.makedirs(log_root)
-+
-+    try:
-+        os.remove(ofilename)
-+    except:
-+        pass
-+
-+    flog = open(os.path.join(log_root, name + ".log"), "wb")
-+    ffargs = [ffmpeg] + args + [ofilename]
-+
-+    subprocess.call(ffargs, stdout=flog, stderr=subprocess.STDOUT, text=False)
-+    flog.close
-+
-+    flog = open(os.path.join(log_root, name + ".log"), "rt")
-+    return validate(validators, flog)
-+
-+def sayok(log_root, flog):
-+    print("Woohoo")
-+    return True
-+
-+if __name__ == '__main__':
-+
-+    argp = argparse.ArgumentParser(description="FFmpeg filter tester")
-+    argp.add_argument("--ffmpeg", default="./ffmpeg", help="ffmpeg exec name")
-+    args = argp.parse_args()
-+
-+    runtest("ATest", args.ffmpeg, ["-v", "verbose", "-no_cvt_hw", "-an", "-c:v", "h264_v4l2m2m", "-i",
-+                                   "/home/johncox/server/TestMedia/Sony/jellyfish-10-mbps-hd-h264.mkv",
-+#                                    "/home/jc/rpi/streams/jellyfish-3-mbps-hd-h264.mkv",
-+                                   "-c:v", "h264_v4l2m2m", "-b:v", "2M"], ".mkv",
-+            [valid_regex(r'Output stream #0:0 \(video\): 900 frames encoded; 900 packets muxed')])
-
-From 50ac318a472fd98e1e58605316ea6a2e8cde0a04 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 5 Jan 2023 14:39:30 +0000
-Subject: [PATCH 111/136] pixfmt: Add a #define to indicate presence of SAND
- formats
-
----
- libavutil/pixfmt.h | 2 ++
- 1 file changed, 2 insertions(+)
-
-diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
-index 22f70007c3..5cc780e7d5 100644
---- a/libavutil/pixfmt.h
-+++ b/libavutil/pixfmt.h
-@@ -378,6 +378,8 @@ enum AVPixelFormat {
-     AV_PIX_FMT_Y210BE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, big-endian
-     AV_PIX_FMT_Y210LE,    ///< packed YUV 4:2:2 like YUYV422, 20bpp, data in the high bits, little-endian
- // RPI - not on ifdef so can be got at by calling progs
-+// #define so code that uses this can know it is there
-+#define AVUTIL_HAVE_PIX_FMT_SAND 1
-     AV_PIX_FMT_SAND128,    ///< 4:2:0  8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
-     AV_PIX_FMT_SAND64_10,  ///< 4:2:0 10-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-     AV_PIX_FMT_SAND64_16,  ///< 4:2:0 16-bit  64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-
-From 23a3132e094d449ea05657704c0cffc3f0762c28 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 11 Jan 2023 16:30:37 +0000
-Subject: [PATCH 112/136] v4l2_m2m_dec: Fix initial pkt send if no extradata
-
----
- libavcodec/v4l2_m2m_dec.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 4d17057298..9daf05adfe 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -240,7 +240,7 @@ copy_extradata(AVCodecContext * const avctx,
-     else
-         len = src_len < 0 ? AVERROR(EINVAL) : src_len;
- 
--    // Zero length is OK but we swant to stop - -ve is error val
-+    // Zero length is OK but we want to stop - -ve is error val
-     if (len <= 0)
-         return len;
- 
-@@ -525,7 +525,7 @@ static int try_enqueue_src(AVCodecContext * const avctx, V4L2m2mContext * const
- 
-     if (s->extdata_sent)
-         ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, NULL, 0);
--    else if (s->extdata_data)
-+    else
-         ret = ff_v4l2_context_enqueue_packet(&s->output, &s->buf_pkt, s->extdata_data, s->extdata_size);
- 
-     if (ret == AVERROR(EAGAIN)) {
-
-From f4f6b9f1af137153e574c704804033e83f2ed1a8 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 16 Jan 2023 16:05:09 +0000
-Subject: [PATCH 113/136] v4l2m2m_dec: Make capture timeout long once pending
- count > 31
-
-For some applications (ffmpeg command line) the current heuristic of adding
-a short timeout and preferring DQ over Q once we think we have buffers
-Qed in V4L2 is insufficient to prevent arbitrary buffer growth.
-Unfortunately the current method of guessing the number of Qed buffers isn't
-reliable enough to allow for a long timeout with only a few few buffers
-believed pending so only do so once the number of buffers believed pending
-exceeds plausible inaccuracies caused by buffer reordering.
-
-The limit could be optimised by codec or apparent latency but a simple
-number should reduce the  unexpected consequences.
----
- libavcodec/v4l2_m2m.h     |  3 ++-
- libavcodec/v4l2_m2m_dec.c | 18 ++++++++++++++----
- 2 files changed, 16 insertions(+), 5 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index 0f41f94694..ded1478a49 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -66,7 +66,7 @@ typedef struct pts_stats_s
- 
- typedef struct xlat_track_s {
-     unsigned int track_no;
--    int64_t last_pts;
-+    int64_t last_pts;    // Last valid PTS decoded
-     int64_t last_opaque;
-     V4L2m2mTrackEl track_els[FF_V4L2_M2M_TRACK_SIZE];
- } xlat_track_t;
-@@ -88,6 +88,7 @@ typedef struct V4L2m2mContext {
- 
-     /* null frame/packet received */
-     int draining;
-+    int running;
-     AVPacket buf_pkt;
- 
-     /* Reference to a frame. Only used during encoding */
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 9daf05adfe..c8ab883d7e 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -582,7 +582,7 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- 
-     do {
-         const int pending = xlat_pending(&s->xlat);
--        const int prefer_dq = (pending > 3);
-+        const int prefer_dq = (pending > 4);
-         const int last_src_rv = src_rv;
- 
-         av_log(avctx, AV_LOG_TRACE, "Pending=%d, src_rv=%d, req_pkt=%d\n", pending, src_rv, s->req_pkt);
-@@ -611,10 +611,14 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-         // (b) enqueue returned a status indicating that decode should be attempted
-         if (dst_rv != 0 && TRY_DQ(src_rv)) {
-             // Pick a timeout depending on state
-+            // The pending count isn't completely reliable so it is good enough
-+            // hint that we want a frame but not good enough to require it in
-+            // all cases; however if it has got > 31 that exceeds its margin of
-+            // error so require a frame to prevent ridiculous levels of latency
-             const int t =
-                 src_rv == NQ_Q_FULL ? -1 :
-                 src_rv == NQ_DRAINING ? 300 :
--                prefer_dq ? 5 : 0;
-+                prefer_dq ? (s->running && pending > 31 ? 100 : 5) : 0;
- 
-             // Dequeue frame will unref any previous contents of frame
-             // if it returns success so we don't need an explicit unref
-@@ -631,8 +635,13 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
-                 }
-             }
- 
--            if (dst_rv == 0)
-+            if (dst_rv == 0) {
-                 set_best_effort_pts(avctx, &s->pts_stat, frame);
-+                if (!s->running) {
-+                    s->running = 1;
-+                    av_log(avctx, AV_LOG_VERBOSE, "Decode running\n");
-+                }
-+            }
- 
-             if (dst_rv == AVERROR(EAGAIN) && src_rv == NQ_DRAINING) {
-                 av_log(avctx, AV_LOG_WARNING, "Timeout in drain - assume EOF");
-@@ -998,7 +1007,8 @@ static void v4l2_decode_flush(AVCodecContext *avctx)
- 
-     // resend extradata
-     s->extdata_sent = 0;
--    // clear EOS status vars
-+    // clear status vars
-+    s->running = 0;
-     s->draining = 0;
-     output->done = 0;
-     capture->done = 0;
-
-From 39f49cdaefa4483914f703c3f352c8894b3b81fd Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 6 Feb 2023 19:23:16 +0000
-Subject: [PATCH 114/136] Initial buffersink alloc callback code
-
-(cherry picked from commit dde8d3c8f3cc279b9b92ed4f10a2e3990f4aadeb)
----
- libavfilter/buffersink.c | 44 ++++++++++++++++++++++++++++++++++++++++
- libavfilter/buffersink.h |  3 +++
- 2 files changed, 47 insertions(+)
-
-diff --git a/libavfilter/buffersink.c b/libavfilter/buffersink.c
-index 306c283f77..d3c82aabf3 100644
---- a/libavfilter/buffersink.c
-+++ b/libavfilter/buffersink.c
-@@ -62,6 +62,11 @@ typedef struct BufferSinkContext {
-     int sample_rates_size;
- 
-     AVFrame *peeked_frame;
-+
-+    union {
-+        av_buffersink_alloc_video_frame * video;
-+    } alloc_cb;
-+    void * alloc_v;
- } BufferSinkContext;
- 
- #define NB_ITEMS(list) (list ## _size / sizeof(*list))
-@@ -154,6 +159,44 @@ int attribute_align_arg av_buffersink_get_samples(AVFilterContext *ctx,
-     return get_frame_internal(ctx, frame, 0, nb_samples);
- }
- 
-+static AVFrame * alloc_video_buffer(AVFilterLink *link, int w, int h)
-+{
-+    AVFilterContext * const ctx = link->dst;
-+    BufferSinkContext * const bs = ctx->priv;
-+    return bs->alloc_cb.video ? bs->alloc_cb.video(ctx, bs->alloc_v, w, h) :
-+        ff_default_get_video_buffer(link, w, h);
-+}
-+
-+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v)
-+{
-+    BufferSinkContext * const bs = ctx->priv;
-+    bs->alloc_cb.video = cb;
-+    bs->alloc_v = v;
-+    return 0;
-+}
-+
-+#if FF_API_BUFFERSINK_ALLOC
-+AVBufferSinkParams *av_buffersink_params_alloc(void)
-+{
-+    static const int pixel_fmts[] = { AV_PIX_FMT_NONE };
-+    AVBufferSinkParams *params = av_malloc(sizeof(AVBufferSinkParams));
-+    if (!params)
-+        return NULL;
-+
-+    params->pixel_fmts = pixel_fmts;
-+    return params;
-+}
-+
-+AVABufferSinkParams *av_abuffersink_params_alloc(void)
-+{
-+    AVABufferSinkParams *params = av_mallocz(sizeof(AVABufferSinkParams));
-+
-+    if (!params)
-+        return NULL;
-+    return params;
-+}
++#if ARCH_ARM
++#include "libavutil/arm/cpu.h"
++#include "libavutil/arm/rpi_sand_neon.h"
++#elif ARCH_AARCH64
++#include "libavutil/aarch64/cpu.h"
++#include "libavutil/aarch64/rpi_sand_neon.h"
 +#endif
 +
- static av_cold int common_init(AVFilterContext *ctx)
- {
-     BufferSinkContext *buf = ctx->priv;
-@@ -381,6 +424,7 @@ static const AVFilterPad avfilter_vsink_buffer_inputs[] = {
-     {
-         .name = "default",
-         .type = AVMEDIA_TYPE_VIDEO,
-+        .get_buffer = {.video = alloc_video_buffer},
-     },
- };
- 
-diff --git a/libavfilter/buffersink.h b/libavfilter/buffersink.h
-index 64e08de53e..09737d322f 100644
---- a/libavfilter/buffersink.h
-+++ b/libavfilter/buffersink.h
-@@ -166,6 +166,9 @@ int av_buffersink_get_frame(AVFilterContext *ctx, AVFrame *frame);
-  */
- int av_buffersink_get_samples(AVFilterContext *ctx, AVFrame *frame, int nb_samples);
- 
-+typedef AVFrame * av_buffersink_alloc_video_frame(AVFilterContext * ctx, void * v, int w, int h);
-+int av_buffersink_set_alloc_video_frame(AVFilterContext *ctx, av_buffersink_alloc_video_frame * cb, void * v);
-+
- /**
-  * @}
-  */
-
-From a63ae21e74ae48f1aedac53c18142b7596d041ad Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 30 Jan 2023 17:23:12 +0000
-Subject: [PATCH 115/136] v4l2_m2m_dec: Add a profile check
-
-Check the profile in avctx aginst what the v4l2 driver advertises. If
-the driver doesn't support the check then just accept anything.
-
-(cherry picked from commit 6dd83dead9ebce419fdea152db0c9f5e9a94e9ef)
----
- libavcodec/v4l2_m2m_dec.c | 125 ++++++++++++++++++++++++++++++++++++++
- 1 file changed, 125 insertions(+)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index c8ab883d7e..098adf4821 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -715,6 +715,127 @@ static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- }
- #endif
- 
-+static uint32_t
-+avprofile_to_v4l2(const enum AVCodecID codec_id, const int avprofile)
++static inline uint32_t pack30(unsigned int a, unsigned int b, unsigned int c)
 +{
-+    switch (codec_id) {
-+        case AV_CODEC_ID_H264:
-+            switch (avprofile) {
-+                case FF_PROFILE_H264_BASELINE:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_BASELINE;
-+                case FF_PROFILE_H264_CONSTRAINED_BASELINE:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_BASELINE;
-+                case FF_PROFILE_H264_MAIN:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_MAIN;
-+                case FF_PROFILE_H264_EXTENDED:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_EXTENDED;
-+                case FF_PROFILE_H264_HIGH:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH;
-+                case FF_PROFILE_H264_HIGH_10:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10;
-+                case FF_PROFILE_H264_HIGH_10_INTRA:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_10_INTRA;
-+                case FF_PROFILE_H264_MULTIVIEW_HIGH:
-+                case FF_PROFILE_H264_HIGH_422:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422;
-+                case FF_PROFILE_H264_HIGH_422_INTRA:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_422_INTRA;
-+                case FF_PROFILE_H264_STEREO_HIGH:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_STEREO_HIGH;
-+                case FF_PROFILE_H264_HIGH_444_PREDICTIVE:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_PREDICTIVE;
-+                case FF_PROFILE_H264_HIGH_444_INTRA:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_HIGH_444_INTRA;
-+                case FF_PROFILE_H264_CAVLC_444:
-+                    return V4L2_MPEG_VIDEO_H264_PROFILE_CAVLC_444_INTRA;
-+                case FF_PROFILE_H264_HIGH_444:
-+                default:
-+                    break;
-+//                    V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_BASELINE		= 12,
-+//                    V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH		= 13,
-+//                    V4L2_MPEG_VIDEO_H264_PROFILE_SCALABLE_HIGH_INTRA	= 14,
-+//                    V4L2_MPEG_VIDEO_H264_PROFILE_MULTIVIEW_HIGH		= 16,
-+//                    V4L2_MPEG_VIDEO_H264_PROFILE_CONSTRAINED_HIGH		= 17,
-+            }
-+            break;
-+        case AV_CODEC_ID_MPEG2VIDEO:
-+        case AV_CODEC_ID_MPEG4:
-+        case AV_CODEC_ID_VC1:
-+        case AV_CODEC_ID_VP8:
-+        case AV_CODEC_ID_VP9:
-+        case AV_CODEC_ID_AV1:
-+            // Most profiles are a simple number that matches the V4L2 enum
-+            return avprofile;
-+        default:
-+            break;
-+    }
-+    return ~(uint32_t)0;
++    return (a & 0x3ff) | ((b & 0x3ff) << 10) | ((c & 0x3ff) << 20);
 +}
 +
-+// This check mirrors Chrome's profile check by testing to see if the profile
-+// exists as a possible value for the V4L2 profile control
-+static int
-+check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
++void checkasm_check_rpi_sand(void)
 +{
-+    struct v4l2_queryctrl query_ctrl;
-+    struct v4l2_querymenu query_menu;
-+    uint32_t profile_id;
++    const unsigned int w = 1280;
++    const unsigned int h = 66;
++    const unsigned int stride1 = 128;
++    const unsigned int stride2 = h*3/2;
++    const unsigned int ssize = ((w+95)/96)*128*h*3/2;
++    const unsigned int ysize = ((w + 32) * (h + 32) * 2);
 +
-+    // An unset profile is almost certainly zero or -99 - do not reject
-+    if (avctx->profile <= 0) {
-+        av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n");
-+        return 0;
++    uint8_t * sbuf0 = malloc(ssize);
++    uint8_t * sbuf1 = malloc(ssize);
++    uint8_t * ybuf0 = malloc(ysize);
++    uint8_t * ybuf1 = malloc(ysize);
++    uint8_t * vbuf0 = malloc(ysize);
++    uint8_t * vbuf1 = malloc(ysize);
++    uint8_t * yframe0 = (w + 32) * 16 + ybuf0;
++    uint8_t * yframe1 = (w + 32) * 16 + ybuf1;
++    uint8_t * vframe0 = (w + 32) * 16 + vbuf0;
++    uint8_t * vframe1 = (w + 32) * 16 + vbuf1;
++    unsigned int i;
++
++    for (i = 0; i != ssize; i += 4)
++        *(uint32_t*)(sbuf0 + i) = rnd();
++    memcpy(sbuf1, sbuf0, ssize);
++
++    if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_y16 : av_rpi_sand30_to_planar_y16, "rpi_sand30_to_planar_y16")) {
++        declare_func(void, uint8_t * dst, const unsigned int dst_stride,
++                     const uint8_t * src,
++                     unsigned int stride1, unsigned int stride2,
++                     unsigned int _x, unsigned int y,
++                     unsigned int _w, unsigned int h);
++
++        memset(ybuf0, 0xbb, ysize);
++        memset(ybuf1, 0xbb, ysize);
++
++        call_ref(yframe0, (w + 32) * 2, sbuf0, stride1, stride2, 0, 0, w, h);
++        call_new(yframe1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h);
++
++        if (memcmp(sbuf0, sbuf1, ssize)
++            || memcmp(ybuf0, ybuf1, ysize))
++            fail();
++
++        bench_new(ybuf1, (w + 32) * 2, sbuf1, stride1, stride2, 0, 0, w, h);
 +    }
 +
-+    memset(&query_ctrl, 0, sizeof(query_ctrl));
-+    switch (avctx->codec_id) {
-+        case AV_CODEC_ID_MPEG2VIDEO:
-+            profile_id = V4L2_CID_MPEG_VIDEO_MPEG2_PROFILE;
-+            break;
-+        case AV_CODEC_ID_MPEG4:
-+            profile_id = V4L2_CID_MPEG_VIDEO_MPEG4_PROFILE;
-+            break;
-+        case AV_CODEC_ID_H264:
-+            profile_id = V4L2_CID_MPEG_VIDEO_H264_PROFILE;
-+            break;
-+        case AV_CODEC_ID_VP8:
-+            profile_id = V4L2_CID_MPEG_VIDEO_VP8_PROFILE;
-+            break;
-+        case AV_CODEC_ID_VP9:
-+            profile_id = V4L2_CID_MPEG_VIDEO_VP9_PROFILE;
-+            break;
-+#ifdef V4L2_CID_MPEG_VIDEO_AV1_PROFILE
-+        case AV_CODEC_ID_AV1:
-+            profile_id = V4L2_CID_MPEG_VIDEO_AV1_PROFILE;
-+            break;
-+#endif
-+        default:
-+            av_log(avctx, AV_LOG_VERBOSE, "Can't map profile for codec id %d; profile check skipped\n", avctx->codec_id);
-+            return 0;
++    if (check_func(have_neon(av_get_cpu_flags()) ? ff_rpi_sand30_lines_to_planar_c16 : av_rpi_sand30_to_planar_c16, "rpi_sand30_to_planar_c16")) {
++        declare_func(void, uint8_t * u_dst, const unsigned int u_stride,
++                     uint8_t * v_dst, const unsigned int v_stride,
++                     const uint8_t * src,
++                     unsigned int stride1, unsigned int stride2,
++                     unsigned int _x, unsigned int y,
++                     unsigned int _w, unsigned int h);
++
++        memset(ybuf0, 0xbb, ysize);
++        memset(ybuf1, 0xbb, ysize);
++        memset(vbuf0, 0xbb, ysize);
++        memset(vbuf1, 0xbb, ysize);
++
++        call_ref(yframe0, (w + 32), vframe0, (w + 32), sbuf0, stride1, stride2, 0, 0, w/2, h/2);
++        call_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2);
++
++        if (memcmp(sbuf0, sbuf1, ssize)
++            || memcmp(ybuf0, ybuf1, ysize)
++            || memcmp(vbuf0, vbuf1, ysize))
++            fail();
++
++        bench_new(yframe1, (w + 32), vframe1, (w + 32), sbuf1, stride1, stride2, 0, 0, w/2, h/2);
 +    }
 +
-+    query_ctrl = (struct v4l2_queryctrl){.id = profile_id};
-+    if (ioctl(s->fd, VIDIOC_QUERYCTRL, &query_ctrl) != 0) {
-+        av_log(avctx, AV_LOG_VERBOSE, "Query profile ctrl (%#x) not supported: assume OK\n", query_ctrl.id);
++
++    report("sand30");
++
++    free(sbuf0);
++    free(sbuf1);
++    free(ybuf0);
++    free(ybuf1);
++    free(vbuf0);
++    free(vbuf1);
++}
++
+diff --git a/tests/checkasm/vc1dsp.c b/tests/checkasm/vc1dsp.c
+new file mode 100644
+index 0000000000..52628d15e4
+--- /dev/null
++++ b/tests/checkasm/vc1dsp.c
+@@ -0,0 +1,452 @@
++/*
++ * Copyright (c) 2022 Ben Avison
++ *
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++
++#include "checkasm.h"
++
++#include "libavcodec/vc1dsp.h"
++
++#include "libavutil/common.h"
++#include "libavutil/internal.h"
++#include "libavutil/intreadwrite.h"
++#include "libavutil/mem_internal.h"
++
++#define VC1DSP_TEST(func) { #func, offsetof(VC1DSPContext, func) },
++#define VC1DSP_SIZED_TEST(func, width, height) { #func, offsetof(VC1DSPContext, func), width, height },
++
++typedef struct {
++    const char *name;
++    size_t offset;
++    int width;
++    int height;
++} test;
++
++typedef struct matrix {
++    size_t width;
++    size_t height;
++    float d[];
++} matrix;
++
++static const matrix T8 = { 8, 8, {
++        12,  12,  12,  12,  12,  12,  12,  12,
++        16,  15,   9,   4,  -4,  -9, -15, -16,
++        16,   6,  -6, -16, -16,  -6,   6,  16,
++        15,  -4, -16,  -9,   9,  16,   4, -15,
++        12, -12, -12,  12,  12, -12, -12,  12,
++         9, -16,   4,  15, -15,  -4,  16,  -9,
++         6, -16,  16,  -6,  -6,  16, -16,   6,
++         4,  -9,  15, -16,  16, -15,   9,  -4
++} };
++
++static const matrix T4 = { 4, 4, {
++        17,  17,  17,  17,
++        22,  10, -10, -22,
++        17, -17, -17,  17,
++        10, -22,  22, -10
++} };
++
++static const matrix T8t = { 8, 8, {
++        12,  16,  16,  15,  12,   9,   6,   4,
++        12,  15,   6,  -4, -12, -16, -16,  -9,
++        12,   9,  -6, -16, -12,   4,  16,  15,
++        12,   4, -16,  -9,  12,  15,  -6, -16,
++        12,  -4, -16,   9,  12, -15,  -6,  16,
++        12,  -9,  -6,  16, -12,  -4,  16, -15,
++        12, -15,   6,   4, -12,  16, -16,   9,
++        12, -16,  16, -15,  12,  -9,   6,  -4
++} };
++
++static const matrix T4t = { 4, 4, {
++        17,  22,  17,  10,
++        17,  10, -17, -22,
++        17, -10, -17,  22,
++        17, -22,  17, -10
++} };
++
++static matrix *new_matrix(size_t width, size_t height)
++{
++    matrix *out = av_mallocz(sizeof (matrix) + height * width * sizeof (float));
++    if (out == NULL) {
++        fprintf(stderr, "Memory allocation failure\n");
++        exit(EXIT_FAILURE);
 +    }
-+    else {
-+        av_log(avctx, AV_LOG_DEBUG, "%s: Control supported: %#x\n", __func__, query_ctrl.id);
++    out->width = width;
++    out->height = height;
++    return out;
++}
 +
-+        query_menu = (struct v4l2_querymenu){
-+            .id = query_ctrl.id,
-+            .index = avprofile_to_v4l2(avctx->codec_id, avctx->profile),
-+        };
-+
-+        if (query_menu.index > query_ctrl.maximum ||
-+            query_menu.index < query_ctrl.minimum ||
-+            ioctl(s->fd, VIDIOC_QUERYMENU, &query_menu) != 0) {
-+            return AVERROR(ENOENT);
-+        }
++static matrix *multiply(const matrix *a, const matrix *b)
++{
++    matrix *out;
++    if (a->width != b->height) {
++        fprintf(stderr, "Incompatible multiplication\n");
++        exit(EXIT_FAILURE);
 +    }
-+
-+    return 0;
-+};
-+
- static int
- check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
- {
-@@ -955,6 +1076,10 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-     if ((ret = check_size(avctx, s)) != 0)
-         return ret;
- 
-+    if ((ret = check_profile(avctx, s)) != 0) {
-+        av_log(avctx, AV_LOG_WARNING, "Profile %d not supported by decode\n", avctx->profile);
-+        return ret;
-+    }
-     return 0;
- }
- 
-
-From f734a6ead04a8381fccfae53066866a02a9516d2 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 1 Feb 2023 17:24:39 +0000
-Subject: [PATCH 116/136] v4l2_m2m_dec: Add extradata parse for h264 & hevc
-
-If we have extradata we can extract profile & level and potentailly
-other useful info from it. Use the codec parser to get it if the decoder
-is configured.
-
-(cherry picked from commit 6d431e79adeb246c2ed8cebce9011d81175a3906)
----
- libavcodec/v4l2_m2m_dec.c | 84 ++++++++++++++++++++++++++++++++++++++-
- 1 file changed, 83 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 098adf4821..e64bc707d3 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -21,6 +21,8 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#include "config.h"
-+
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
- 
-@@ -43,6 +45,13 @@
- #include "v4l2_fmt.h"
- #include "v4l2_req_dmabufs.h"
- 
-+#if CONFIG_H264_DECODER
-+#include "h264_parse.h"
-+#endif
-+#if CONFIG_HEVC_DECODER
-+#include "hevc_parse.h"
-+#endif
-+
- // Pick 64 for max last count - that is >1sec at 60fps
- #define STATS_LAST_COUNT_MAX 64
- #define STATS_INTERVAL_MAX (1 << 30)
-@@ -956,6 +965,78 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx)
-     return size + (1 << 16);
- }
- 
-+static void
-+parse_extradata(AVCodecContext *avctx)
-+{
-+    if (!avctx->extradata || !avctx->extradata_size)
-+        return;
-+
-+    switch (avctx->codec_id) {
-+#if CONFIG_H264_DECODER
-+        case AV_CODEC_ID_H264:
-+        {
-+            H264ParamSets ps = {{NULL}};
-+            int is_avc = 0;
-+            int nal_length_size = 0;
-+            int ret;
-+
-+            ret = ff_h264_decode_extradata(avctx->extradata, avctx->extradata_size,
-+                                           &ps, &is_avc, &nal_length_size,
-+                                           avctx->err_recognition, avctx);
-+            if (ret > 0) {
-+                const SPS * sps = NULL;
-+                unsigned int i;
-+                for (i = 0; i != MAX_SPS_COUNT; ++i) {
-+                    if (ps.sps_list[i]) {
-+                        sps = (const SPS *)ps.sps_list[i]->data;
-+                        break;
-+                    }
-+                }
-+                if (sps) {
-+                    avctx->profile = ff_h264_get_profile(sps);
-+                    avctx->level = sps->level_idc;
-+                }
-+            }
-+            ff_h264_ps_uninit(&ps);
-+            break;
++    out = new_matrix(b->width, a->height);
++    for (int j = 0; j < out->height; ++j)
++        for (int i = 0; i < out->width; ++i) {
++            float sum = 0;
++            for (int k = 0; k < a->width; ++k)
++                sum += a->d[j * a->width + k] * b->d[k * b->width + i];
++            out->d[j * out->width + i] = sum;
 +        }
-+#endif
-+#if CONFIG_HEVC_DECODER
-+        case AV_CODEC_ID_HEVC:
-+        {
-+            HEVCParamSets ps = {{NULL}};
-+            HEVCSEI sei = {{{{0}}}};
-+            int is_nalff = 0;
-+            int nal_length_size = 0;
-+            int ret;
-+
-+            ret = ff_hevc_decode_extradata(avctx->extradata, avctx->extradata_size,
-+                                           &ps, &sei, &is_nalff, &nal_length_size,
-+                                           avctx->err_recognition, 0, avctx);
-+            if (ret > 0) {
-+                const HEVCSPS * sps = NULL;
-+                unsigned int i;
-+                for (i = 0; i != HEVC_MAX_SPS_COUNT; ++i) {
-+                    if (ps.sps_list[i]) {
-+                        sps = (const HEVCSPS *)ps.sps_list[i]->data;
-+                        break;
-+                    }
-+                }
-+                if (sps) {
-+                    avctx->profile = sps->ptl.general_ptl.profile_idc;
-+                    avctx->level   = sps->ptl.general_ptl.level_idc;
-+                }
-+            }
-+            ff_hevc_ps_uninit(&ps);
-+            ff_hevc_reset_sei(&sei);
-+            break;
-+        }
-+#endif
-+        default:
-+            break;
-+    }
++    return out;
 +}
 +
- static av_cold int v4l2_decode_init(AVCodecContext *avctx)
- {
-     V4L2Context *capture, *output;
-@@ -976,7 +1057,8 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-         avctx->ticks_per_frame = 2;
-     }
- 
--    av_log(avctx, AV_LOG_INFO, "level=%d\n", avctx->level);
-+    parse_extradata(avctx);
-+
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-         return ret;
-
-From e28421e397743a94f5e37327ad234f59b6ae613d Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 20 Mar 2023 18:12:51 +0000
-Subject: [PATCH 117/136] clean_usr_libs: Now wipes the include files too
-
-When swapping ffmpeg versions obsolete makefiles could confuse
-configure utilities.
----
- pi-util/clean_usr_libs.sh | 16 ++++++++++++++++
- 1 file changed, 16 insertions(+)
-
-diff --git a/pi-util/clean_usr_libs.sh b/pi-util/clean_usr_libs.sh
-index b3b2d5509d..01bd6a6a22 100755
---- a/pi-util/clean_usr_libs.sh
-+++ b/pi-util/clean_usr_libs.sh
-@@ -1,4 +1,20 @@
- set -e
-+U=/usr/include/arm-linux-gnueabihf
-+rm -rf $U/libavcodec
-+rm -rf $U/libavdevice
-+rm -rf $U/libavfilter
-+rm -rf $U/libavformat
-+rm -rf $U/libavutil
-+rm -rf $U/libswresample
-+rm -rf $U/libswscale
-+U=/usr/include/aarch64-linux-gnu
-+rm -rf $U/libavcodec
-+rm -rf $U/libavdevice
-+rm -rf $U/libavfilter
-+rm -rf $U/libavformat
-+rm -rf $U/libavutil
-+rm -rf $U/libswresample
-+rm -rf $U/libswscale
- U=/usr/lib/arm-linux-gnueabihf
- rm -f $U/libavcodec.*
- rm -f $U/libavdevice.*
-
-From dcabd30310b88b45359609bac27d5d0f9bbc6dc1 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 20 Mar 2023 18:15:08 +0000
-Subject: [PATCH 118/136] vulkan: Add missing decode extension defines
-
-When building on bookworm the video decode extension names
-were missing. This adds them. I expect this patch will be
-obsolete shortly but it solves a current problem.
----
- libavutil/hwcontext_vulkan.c | 8 ++++++++
- 1 file changed, 8 insertions(+)
-
-diff --git a/libavutil/hwcontext_vulkan.c b/libavutil/hwcontext_vulkan.c
-index 2a9b5f4aac..11e7945f18 100644
---- a/libavutil/hwcontext_vulkan.c
-+++ b/libavutil/hwcontext_vulkan.c
-@@ -57,6 +57,14 @@
- #define CHECK_CU(x) FF_CUDA_CHECK_DL(cuda_cu, cu, x)
- #endif
- 
-+// Sometimes missing definitions
-+#ifndef VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME
-+#define VK_EXT_VIDEO_DECODE_H264_EXTENSION_NAME "VK_EXT_video_decode_h264"
-+#endif
-+#ifndef VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME
-+#define VK_EXT_VIDEO_DECODE_H265_EXTENSION_NAME "VK_EXT_video_decode_h265"
-+#endif
-+
- typedef struct VulkanQueueCtx {
-     VkFence fence;
-     VkQueue queue;
-
-From 0231c208843a5badc799590eb5b9de907d1c26b2 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 21 Mar 2023 14:20:05 +0000
-Subject: [PATCH 119/136] v4l2_m2m_dec: Fix config file for finding if decoder
- enabled
-
-Fixes parsing of extradata for profile testing. 5.x changed where that
-info is defined.
----
- libavcodec/v4l2_m2m_dec.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index e64bc707d3..91136f03da 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -21,7 +21,7 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
--#include "config.h"
-+#include "config_components.h"
- 
- #include <linux/videodev2.h>
- #include <sys/ioctl.h>
-
-From 822baefed69372b3380144ab44226e2c6ad3e298 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 21 Mar 2023 14:23:20 +0000
-Subject: [PATCH 120/136] v4l2_m2m_dec: Display profile given if skipped in
- debug
-
----
- libavcodec/v4l2_m2m_dec.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 91136f03da..d124c7b1fc 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -792,7 +792,7 @@ check_profile(AVCodecContext *const avctx, V4L2m2mContext *const s)
- 
-     // An unset profile is almost certainly zero or -99 - do not reject
-     if (avctx->profile <= 0) {
--        av_log(avctx, AV_LOG_VERBOSE, "Profile <= 0 - check skipped\n");
-+        av_log(avctx, AV_LOG_VERBOSE, "Profile %d <= 0 - check skipped\n", avctx->profile);
-         return 0;
-     }
- 
-
-From 6859fc2a8791c0fcc25851b77fed15a691ceb332 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 22 Mar 2023 16:08:08 +0000
-Subject: [PATCH 121/136] conf_native: Fix for 64-bit kernel with 32-bit
- userspace
-
-(cherry picked from commit 5bb1e09cea95b4215c6904b9b1a726e83bc5d327)
----
- pi-util/conf_native.sh | 32 +++++++++++++++++++++-----------
- 1 file changed, 21 insertions(+), 11 deletions(-)
-
-diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
-index 082d9b5832..0a7d230f1b 100755
---- a/pi-util/conf_native.sh
-+++ b/pi-util/conf_native.sh
-@@ -33,18 +33,28 @@ RPI_LIBDIRS=
- RPI_DEFINES=
- RPI_EXTRALIBS=
- 
--if [ "$MC" == "arm64" ]; then
--  echo "M/C aarch64"
--  A=aarch64-linux-gnu
--  B=arm64
--elif [ "$MC" == "armhf" ]; then
--  echo "M/C armv7"
--  A=arm-linux-gnueabihf
--  B=armv7
--  MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
--  RPI_DEFINES=-mfpu=neon-vfpv4
-+# uname -m gives kernel type which may not have the same
-+# 32/64bitness as userspace :-( getconf shoudl provide the answer
-+# but use uname to check we are on the right processor
-+MC=`uname -m`
-+LB=`getconf LONG_BIT`
-+if [ "$MC" == "armv7l" ] || [ "$MC" == "aarch64" ]; then
-+  if [ "$LB" == "32" ]; then
-+    echo "M/C armv7"
-+    A=arm-linux-gnueabihf
-+    B=armv7
-+    MCOPTS="--arch=armv6t2 --cpu=cortex-a7"
-+    RPI_DEFINES=-mfpu=neon-vfpv4
-+  elif [ "$LB" == "64" ]; then
-+    echo "M/C aarch64"
-+    A=aarch64-linux-gnu
-+    B=arm64
-+  else
-+    echo "Unknown LONG_BIT name: $LB"
-+    exit 1
-+  fi
- else
--  echo Unexpected architecture $MC
-+  echo "Unknown machine name: $MC"
-   exit 1
- fi
- 
-
-From c35f074854a922c0c025159ddddd1abfc562a3d2 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 20 Apr 2023 11:48:25 +0000
-Subject: [PATCH 122/136] conf_native: Add install prefix variation
-
-(cherry picked from commit 73c3019b534cb8f4b4e4c21995653f6ce440086d)
----
- pi-util/BUILD.txt      | 32 ++++++++++++++++++++------------
- pi-util/conf_native.sh | 14 ++++++++++++--
- 2 files changed, 32 insertions(+), 14 deletions(-)
-
-diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
-index b050971f63..2b62d660c0 100644
---- a/pi-util/BUILD.txt
-+++ b/pi-util/BUILD.txt
-@@ -24,6 +24,8 @@ There are a few choices here
-          paths being confused and therefore running the wrong code,  Shared
-          is what is needed, in most cases, when building for use by other
-          programs.
-+ --usr   Set install dir to /usr (i.e. system default) rather than in
-+         <builddir>/install
- 
- So for a static build
- ---------------------
-@@ -37,23 +39,29 @@ You can now run ffmpeg directly from where it was built
- For a shared build
- ------------------
- 
--$ pi-util/conf_native.sh
--
--You will normally want an install target if shared. Note that the script has
--set this up to be generated in out/<builddir>/install, you don't have to worry
--about overwriting your system libs.
-+There are two choices here
- 
-+$ pi-util/conf_native.sh
- $ make -j8 -C out/<builddir> install
- 
--You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
--built or install the image on the system - you have to be careful to get rid
--of all other ffmpeg libs or confusion may result.  There is a little script
--that wipes all other versions - obviously use with care!
-+This sets the install prefix to <builddir>/install and is probably what you
-+want if you don't want to overwrite the system files.
- 
--$ sudo pi-util/clean_usr_libs.sh
-+You can now set LD_LIBRARY_PATH appropriately and run ffmpeg from where it was
-+built. You can copy the contents of <build dir>/install to /usr and that mostly
-+works. The only downside is that paths in pkgconfig end up being set to the
-+install directory in your build directory which may be less than ideal when
-+building other packages.
- 
--Then simply copying from the install to /usr works
-+The alternative if you just want to replace the system libs is:
- 
--$ sudo cp -r out/<builddir>/install/* /usr
-+$ pi-util/conf_native.sh --usr
-+$ make -j8 -C out/<builddir>
-+$ sudo pi-util/clean_usr_libs.sh
-+$ sudo make -j8 -C out/<builddir> install
- 
-+The clean_usr_libs.sh step wipes any existing libs & includes (for all
-+architectures) from the system which helps avoid confusion when running other
-+progs as you can be sure you're not running old code which is unfortunately
-+easy to do otherwise.
- 
-diff --git a/pi-util/conf_native.sh b/pi-util/conf_native.sh
-index 0a7d230f1b..f0ed159594 100755
---- a/pi-util/conf_native.sh
-+++ b/pi-util/conf_native.sh
-@@ -9,6 +9,7 @@ RPI_KEEPS=""
- 
- NOSHARED=
- MMAL=
-+USR_PREFIX=
- 
- while [ "$1" != "" ] ; do
-     case $1 in
-@@ -18,8 +19,14 @@ while [ "$1" != "" ] ; do
- 	--mmal)
- 	    MMAL=1
- 	    ;;
-+	--usr)
-+	    USR_PREFIX=/usr
-+	    ;;
- 	*)
--	    echo "Usage $0: [--noshared] [--mmal]"
-+	    echo "Usage $0: [--noshared] [--mmal] [--usr]"
-+	    echo "  noshared  Build static libs and executable - good for testing"
-+	    echo "  mmal      Build mmal decoders"
-+	    echo "  usr       Set install prefix to /usr [default=<build-dir>/install]"
- 	    exit 1
- 	    ;;
-     esac
-@@ -82,7 +89,9 @@ else
-   OUT=$BUILDBASE/$B-$C-$V-shared-rel
- fi
- 
--USR_PREFIX=$OUT/install
-+if [ -z "$USR_PREFIX" ]; then  # --usr not given: install under the build dir
-+  USR_PREFIX=$OUT/install
-+fi
- LIB_PREFIX=$USR_PREFIX/lib/$A
- INC_PREFIX=$USR_PREFIX/include/$A
- 
-@@ -113,6 +122,7 @@ $FFSRC/configure \
-  --extra-libs="$RPI_EXTRALIBS"\
-  --extra-version="rpi"
- 
-+echo "Configured into $OUT"
- 
- # gcc option for getting asm listing
- # -Wa,-ahls
-
-From 91ea652a95370a428f1353932b2a55dae7158acc Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 19 Apr 2023 10:47:58 +0000
-Subject: [PATCH 123/136] swscale: Add explicit bgr24->yv12 conversion
-
-(cherry picked from commit 9a22d429f46a038321c66a0cd54737177641b434)
----
- libswscale/rgb2rgb.c          |  5 +++++
- libswscale/rgb2rgb.h          |  7 +++++++
- libswscale/rgb2rgb_template.c | 36 ++++++++++++++++++++++++++++++-----
- libswscale/swscale_unscaled.c | 22 +++++++++++++++++++++
- 4 files changed, 65 insertions(+), 5 deletions(-)
-
-diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
-index e98fdac8ea..84bb56e60e 100644
---- a/libswscale/rgb2rgb.c
-+++ b/libswscale/rgb2rgb.c
-@@ -83,6 +83,11 @@ void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
-                        int width, int height,
-                        int lumStride, int chromStride, int srcStride,
-                        int32_t *rgb2yuv);
-+void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
-+                       uint8_t *udst, uint8_t *vdst,
-+                       int width, int height,
-+                       int lumStride, int chromStride, int srcStride,
-+                       int32_t *rgb2yuv);
- void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
-                  int srcStride, int dstStride);
- void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
-diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
-index f3951d523e..0028ab345f 100644
---- a/libswscale/rgb2rgb.h
-+++ b/libswscale/rgb2rgb.h
-@@ -79,6 +79,9 @@ void    rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
- void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-                       uint8_t *vdst, int width, int height, int lumStride,
-                       int chromStride, int srcStride, int32_t *rgb2yuv);
-+void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                      uint8_t *vdst, int width, int height, int lumStride,
-+                      int chromStride, int srcStride, int32_t *rgb2yuv);
- 
- /**
-  * Height should be a multiple of 2 and width should be a multiple of 16.
-@@ -128,6 +131,10 @@ extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-                               int width, int height,
-                               int lumStride, int chromStride, int srcStride,
-                               int32_t *rgb2yuv);
-+extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-+                              int width, int height,
-+                              int lumStride, int chromStride, int srcStride,
-+                              int32_t *rgb2yuv);
- extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
-                         int srcStride, int dstStride);
- 
-diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
-index 42c69801ba..e2437826dd 100644
---- a/libswscale/rgb2rgb_template.c
-+++ b/libswscale/rgb2rgb_template.c
-@@ -646,13 +646,14 @@ static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
-  * others are ignored in the C version.
-  * FIXME: Write HQ version.
-  */
--void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-                    uint8_t *vdst, int width, int height, int lumStride,
--                   int chromStride, int srcStride, int32_t *rgb2yuv)
-+                   int chromStride, int srcStride, int32_t *rgb2yuv,
-+                   const uint8_t x[9])
- {
--    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
--    int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
--    int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
-+    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
-+    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
-+    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
-     int y;
-     const int chromWidth = width >> 1;
- 
-@@ -707,6 +708,30 @@ void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-     }
- }
- 
-+void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv)
++static void normalise(matrix *a)
 +{
-+    static const uint8_t x[9] = {
-+        RY_IDX, GY_IDX, BY_IDX,
-+        RU_IDX, GU_IDX, BU_IDX,
-+        RV_IDX, GV_IDX, BV_IDX,
-+    };
-+    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
-+}
-+
-+void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv)
-+{
-+    static const uint8_t x[9] = {
-+         BY_IDX, GY_IDX, RY_IDX,
-+         BU_IDX, GU_IDX, RU_IDX,
-+         BV_IDX, GV_IDX, RV_IDX,
-+    };
-+    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
-+}
-+
- static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
-                               uint8_t *dest, int width, int height,
-                               int src1Stride, int src2Stride, int dstStride)
-@@ -980,6 +1005,7 @@ static av_cold void rgb2rgb_init_c(void)
-     yuy2toyv12         = yuy2toyv12_c;
-     planar2x           = planar2x_c;
-     ff_rgb24toyv12     = ff_rgb24toyv12_c;
-+    ff_bgr24toyv12     = ff_bgr24toyv12_c;
-     interleaveBytes    = interleaveBytes_c;
-     deinterleaveBytes  = deinterleaveBytes_c;
-     vu9_to_vu12        = vu9_to_vu12_c;
-diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
-index 9af2e7ecc3..9047030ae4 100644
---- a/libswscale/swscale_unscaled.c
-+++ b/libswscale/swscale_unscaled.c
-@@ -1654,6 +1654,23 @@ static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-     return srcSliceH;
- }
- 
-+static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-+                              int srcStride[], int srcSliceY, int srcSliceH,
-+                              uint8_t *dst[], int dstStride[])
-+{
-+    ff_bgr24toyv12(
-+        src[0],
-+        dst[0] +  srcSliceY       * dstStride[0],
-+        dst[1] + (srcSliceY >> 1) * dstStride[1],
-+        dst[2] + (srcSliceY >> 1) * dstStride[2],
-+        c->srcW, srcSliceH,
-+        dstStride[0], dstStride[1], srcStride[0],
-+        c->input_rgb2yuv_table);
-+    if (dst[3])
-+        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
-+    return srcSliceH;
-+}
-+
- static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-                              int srcStride[], int srcSliceY, int srcSliceH,
-                              uint8_t *dst[], int dstStride[])
-@@ -2037,6 +2054,11 @@ void ff_get_unscaled_swscale(SwsContext *c)
-         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
-         !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-         c->convert_unscaled = bgr24ToYv12Wrapper;
-+    /* rgb24toYV12 */
-+    if (srcFormat == AV_PIX_FMT_RGB24 &&
-+        (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
-+        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        c->convert_unscaled = rgb24ToYv12Wrapper;
- 
-     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
-     if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
-
-From 207ea47b2153b276b53cd5a87528dbc532a9f551 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 20 Apr 2023 11:26:10 +0000
-Subject: [PATCH 124/136] swscale: Add unscaled XRGB->YUV420P functions
-
-(cherry picked from commit 04cc32ee3f390de513ad8c6156c0c66b2c60abc8)
----
- libswscale/rgb2rgb.c          |  20 ++++++
- libswscale/rgb2rgb.h          |  16 +++++
- libswscale/rgb2rgb_template.c | 123 ++++++++++++++++++++++++++++++----
- libswscale/swscale_unscaled.c |  89 ++++++++++++++++++++++++
- 4 files changed, 236 insertions(+), 12 deletions(-)
-
-diff --git a/libswscale/rgb2rgb.c b/libswscale/rgb2rgb.c
-index 84bb56e60e..c3b9079d2b 100644
---- a/libswscale/rgb2rgb.c
-+++ b/libswscale/rgb2rgb.c
-@@ -88,6 +88,26 @@ void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst,
-                        int width, int height,
-                        int lumStride, int chromStride, int srcStride,
-                        int32_t *rgb2yuv);
-+void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst,
-+					  uint8_t *udst, uint8_t *vdst,
-+					  int width, int height,
-+					  int lumStride, int chromStride, int srcStride,
-+					  int32_t *rgb2yuv);
-+void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst,
-+					  uint8_t *udst, uint8_t *vdst,
-+					  int width, int height,
-+					  int lumStride, int chromStride, int srcStride,
-+					  int32_t *rgb2yuv);
-+void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst,
-+					  uint8_t *udst, uint8_t *vdst,
-+					  int width, int height,
-+					  int lumStride, int chromStride, int srcStride,
-+					  int32_t *rgb2yuv);
-+void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst,
-+					  uint8_t *udst, uint8_t *vdst,
-+					  int width, int height,
-+					  int lumStride, int chromStride, int srcStride,
-+					  int32_t *rgb2yuv);
- void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
-                  int srcStride, int dstStride);
- void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
-diff --git a/libswscale/rgb2rgb.h b/libswscale/rgb2rgb.h
-index 0028ab345f..a0dd3ffb79 100644
---- a/libswscale/rgb2rgb.h
-+++ b/libswscale/rgb2rgb.h
-@@ -135,6 +135,22 @@ extern void (*ff_bgr24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-                               int width, int height,
-                               int lumStride, int chromStride, int srcStride,
-                               int32_t *rgb2yuv);
-+extern void (*ff_rgbxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-+                             int width, int height,
-+                             int lumStride, int chromStride, int srcStride,
-+                             int32_t *rgb2yuv);
-+extern void (*ff_bgrxtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-+                             int width, int height,
-+                             int lumStride, int chromStride, int srcStride,
-+                             int32_t *rgb2yuv);
-+extern void (*ff_xrgbtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-+                             int width, int height,
-+                             int lumStride, int chromStride, int srcStride,
-+                             int32_t *rgb2yuv);
-+extern void (*ff_xbgrtoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
-+                             int width, int height,
-+                             int lumStride, int chromStride, int srcStride,
-+                             int32_t *rgb2yuv);
- extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
-                         int srcStride, int dstStride);
- 
-diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
-index e2437826dd..703de90690 100644
---- a/libswscale/rgb2rgb_template.c
-+++ b/libswscale/rgb2rgb_template.c
-@@ -708,30 +708,125 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-     }
- }
- 
-+static const uint8_t x_rgb[9] = {
-+    RY_IDX, GY_IDX, BY_IDX,
-+    RU_IDX, GU_IDX, BU_IDX,
-+    RV_IDX, GV_IDX, BV_IDX,
-+};
-+
-+static const uint8_t x_bgr[9] = {
-+     BY_IDX, GY_IDX, RY_IDX,
-+     BU_IDX, GU_IDX, RU_IDX,
-+     BV_IDX, GV_IDX, RV_IDX,
-+};
-+
- void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-                    uint8_t *vdst, int width, int height, int lumStride,
-                    int chromStride, int srcStride, int32_t *rgb2yuv)
- {
--    static const uint8_t x[9] = {
--        RY_IDX, GY_IDX, BY_IDX,
--        RU_IDX, GU_IDX, BU_IDX,
--        RV_IDX, GV_IDX, BV_IDX,
--    };
--    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
-+    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
- }
- 
- void ff_bgr24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-                    uint8_t *vdst, int width, int height, int lumStride,
-                    int chromStride, int srcStride, int32_t *rgb2yuv)
- {
--    static const uint8_t x[9] = {
--         BY_IDX, GY_IDX, RY_IDX,
--         BU_IDX, GU_IDX, RU_IDX,
--         BV_IDX, GV_IDX, RV_IDX,
--    };
--    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x);
-+    rgb24toyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
- }
- 
-+static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv,
-+                   const uint8_t x[9])
-+{
-+    int32_t ry = rgb2yuv[x[0]], gy = rgb2yuv[x[1]], by = rgb2yuv[x[2]];
-+    int32_t ru = rgb2yuv[x[3]], gu = rgb2yuv[x[4]], bu = rgb2yuv[x[5]];
-+    int32_t rv = rgb2yuv[x[6]], gv = rgb2yuv[x[7]], bv = rgb2yuv[x[8]];
-+    int y;
-+    const int chromWidth = width >> 1;
-+
-+    for (y = 0; y < height; y += 2) {
-+        int i;
-+        for (i = 0; i < chromWidth; i++) {
-+            unsigned int b = src[8 * i + 2];
-+            unsigned int g = src[8 * i + 1];
-+            unsigned int r = src[8 * i + 0];
-+
-+            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
-+            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
-+            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
-+
-+            udst[i]     = U;
-+            vdst[i]     = V;
-+            ydst[2 * i] = Y;
-+
-+            b = src[8 * i + 6];
-+            g = src[8 * i + 5];
-+            r = src[8 * i + 4];
-+
-+            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-+            ydst[2 * i + 1] = Y;
-+        }
-+        ydst += lumStride;
-+        src  += srcStride;
-+
-+        if (y+1 == height)
-+            break;
-+
-+        for (i = 0; i < chromWidth; i++) {
-+            unsigned int b = src[8 * i + 2];
-+            unsigned int g = src[8 * i + 1];
-+            unsigned int r = src[8 * i + 0];
-+
-+            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-+
-+            ydst[2 * i] = Y;
-+
-+            b = src[8 * i + 6];
-+            g = src[8 * i + 5];
-+            r = src[8 * i + 4];
-+
-+            Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-+            ydst[2 * i + 1] = Y;
-+        }
-+        udst += chromStride;
-+        vdst += chromStride;
-+        ydst += lumStride;
-+        src  += srcStride;
-+    }
-+}
-+
-+static void ff_rgbxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv)
-+{
-+    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
-+}
-+
-+static void ff_bgrxtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv)
-+{
-+    rgbxtoyv12_x(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
-+}
-+
-+// As the general code does no SIMD-like ops simply adding 1 to the src address
-+// will fix the ignored alpha position
-+static void ff_xrgbtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv)
-+{
-+    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_rgb);
-+}
-+
-+static void ff_xbgrtoyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv)
-+{
-+    rgbxtoyv12_x(src + 1, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv, x_bgr);
-+}
-+
-+
- static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
-                               uint8_t *dest, int width, int height,
-                               int src1Stride, int src2Stride, int dstStride)
-@@ -1006,6 +1101,10 @@ static av_cold void rgb2rgb_init_c(void)
-     planar2x           = planar2x_c;
-     ff_rgb24toyv12     = ff_rgb24toyv12_c;
-     ff_bgr24toyv12     = ff_bgr24toyv12_c;
-+    ff_rgbxtoyv12      = ff_rgbxtoyv12_c;
-+    ff_bgrxtoyv12      = ff_bgrxtoyv12_c;
-+    ff_xrgbtoyv12      = ff_xrgbtoyv12_c;
-+    ff_xbgrtoyv12      = ff_xbgrtoyv12_c;
-     interleaveBytes    = interleaveBytes_c;
-     deinterleaveBytes  = deinterleaveBytes_c;
-     vu9_to_vu12        = vu9_to_vu12_c;
-diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
-index 9047030ae4..053c06adf5 100644
---- a/libswscale/swscale_unscaled.c
-+++ b/libswscale/swscale_unscaled.c
-@@ -1671,6 +1671,74 @@ static int rgb24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-     return srcSliceH;
- }
- 
-+static int bgrxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-+                             int srcStride[], int srcSliceY, int srcSliceH,
-+                             uint8_t *dst[], int dstStride[])
-+{
-+    ff_bgrxtoyv12(
-+        src[0],
-+        dst[0] +  srcSliceY       * dstStride[0],
-+        dst[1] + (srcSliceY >> 1) * dstStride[1],
-+        dst[2] + (srcSliceY >> 1) * dstStride[2],
-+        c->srcW, srcSliceH,
-+        dstStride[0], dstStride[1], srcStride[0],
-+        c->input_rgb2yuv_table);
-+    if (dst[3])
-+        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
-+    return srcSliceH;
-+}
-+
-+static int rgbxToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-+                             int srcStride[], int srcSliceY, int srcSliceH,
-+                             uint8_t *dst[], int dstStride[])
-+{
-+    ff_rgbxtoyv12(
-+        src[0],
-+        dst[0] +  srcSliceY       * dstStride[0],
-+        dst[1] + (srcSliceY >> 1) * dstStride[1],
-+        dst[2] + (srcSliceY >> 1) * dstStride[2],
-+        c->srcW, srcSliceH,
-+        dstStride[0], dstStride[1], srcStride[0],
-+        c->input_rgb2yuv_table);
-+    if (dst[3])
-+        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
-+    return srcSliceH;
-+}
-+
-+static int xbgrToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-+                             int srcStride[], int srcSliceY, int srcSliceH,
-+                             uint8_t *dst[], int dstStride[])
-+{
-+    ff_xbgrtoyv12(
-+        src[0],
-+        dst[0] +  srcSliceY       * dstStride[0],
-+        dst[1] + (srcSliceY >> 1) * dstStride[1],
-+        dst[2] + (srcSliceY >> 1) * dstStride[2],
-+        c->srcW, srcSliceH,
-+        dstStride[0], dstStride[1], srcStride[0],
-+        c->input_rgb2yuv_table);
-+    if (dst[3])
-+        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
-+    return srcSliceH;
-+}
-+
-+static int xrgbToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-+                             int srcStride[], int srcSliceY, int srcSliceH,
-+                             uint8_t *dst[], int dstStride[])
-+{
-+    ff_xrgbtoyv12(
-+        src[0],
-+        dst[0] +  srcSliceY       * dstStride[0],
-+        dst[1] + (srcSliceY >> 1) * dstStride[1],
-+        dst[2] + (srcSliceY >> 1) * dstStride[2],
-+        c->srcW, srcSliceH,
-+        dstStride[0], dstStride[1], srcStride[0],
-+        c->input_rgb2yuv_table);
-+    if (dst[3])
-+        fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
-+    return srcSliceH;
-+}
-+
- static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
-                              int srcStride[], int srcSliceY, int srcSliceH,
-                              uint8_t *dst[], int dstStride[])
-@@ -2060,6 +2128,27 @@ void ff_get_unscaled_swscale(SwsContext *c)
-         !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-         c->convert_unscaled = rgb24ToYv12Wrapper;
- 
-+    /* bgrxtoYV12 - even width only: the converter works on width >> 1 pixel pairs */
-+    if (((srcFormat == AV_PIX_FMT_BGRA && dstFormat == AV_PIX_FMT_YUV420P) ||
-+         (srcFormat == AV_PIX_FMT_BGR0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
-+        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        c->convert_unscaled = bgrxToYv12Wrapper;
-+    /* rgbx24toYV12 */
-+    if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
-+         (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
-+        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        c->convert_unscaled = rgbxToYv12Wrapper;
-+    /* xbgrtoYV12 */
-+    if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
-+         (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
-+        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        c->convert_unscaled = xbgrToYv12Wrapper;
-+    /* xrgb24toYV12 */
-+    if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
-+         (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
-+        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        c->convert_unscaled = xrgbToYv12Wrapper;
-+
-     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
-     if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
-         && (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
-
-From b5672a2d361ec4f064ae116a3452282996cc87a0 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 20 Apr 2023 11:35:44 +0000
-Subject: [PATCH 125/136] swscale: Add aarch64 unscaled RGB24->YUV420P
-
-(cherry picked from commit 0cf416312095ce5bea3d2f7e9b14736d4b3ed160)
----
- libswscale/aarch64/rgb2rgb.c      |  40 +++++++
- libswscale/aarch64/rgb2rgb_neon.S | 181 ++++++++++++++++++++++++++++++
- 2 files changed, 221 insertions(+)
-
-diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
-index a9bf6ff9e0..6d3e0000dc 100644
---- a/libswscale/aarch64/rgb2rgb.c
-+++ b/libswscale/aarch64/rgb2rgb.c
-@@ -30,6 +30,44 @@
- void ff_interleave_bytes_neon(const uint8_t *src1, const uint8_t *src2,
-                               uint8_t *dest, int width, int height,
-                               int src1Stride, int src2Stride, int dstStride);
-+void ff_bgr24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv);
-+void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv);
-+
-+// RGB to YUV asm fns process 16 pixels at once so ensure that the output
-+// will fit into the stride. ARM64 should cope with unaligned SIMD r/w so
-+// don't test for that
-+// Fall back to C if we cannot use asm
-+
-+static inline int chkw(const int width, const int lumStride, const int chromStride)
-+{
-+    const int aw = FFALIGN(width, 16);
-+    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
-+}
-+
-+static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *rgb2yuv)
-+{
-+    if (chkw(width, lumStride, chromStride))
-+        ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
-+    else
-+        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
-+}
-+
-+static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-+                   uint8_t *vdst, int width, int height, int lumStride,
-+                   int chromStride, int srcStride, int32_t *bgr2yuv)
-+{
-+    if (chkw(width, lumStride, chromStride))
-+        ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
-+    else
-+        ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
-+}
-+
- 
- av_cold void rgb2rgb_init_aarch64(void)
- {
-@@ -37,5 +75,7 @@ av_cold void rgb2rgb_init_aarch64(void)
- 
-     if (have_neon(cpu_flags)) {
-         interleaveBytes = ff_interleave_bytes_neon;
-+        ff_rgb24toyv12 = rgb24toyv12_check;
-+        ff_bgr24toyv12 = bgr24toyv12_check;
-     }
- }
-diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
-index d81110ec57..8cf40b65f5 100644
---- a/libswscale/aarch64/rgb2rgb_neon.S
-+++ b/libswscale/aarch64/rgb2rgb_neon.S
-@@ -77,3 +77,184 @@ function ff_interleave_bytes_neon, export=1
- 0:
-         ret
- endfunc
-+
-+// void ff_rgb24toyv12_aarch64(
-+//              const uint8_t *src,             // x0
-+//              uint8_t *ydst,                  // x1
-+//              uint8_t *udst,                  // x2
-+//              uint8_t *vdst,                  // x3
-+//              int width,                      // w4
-+//              int height,                     // w5
-+//              int lumStride,                  // w6
-+//              int chromStride,                // w7
-+//              int srcStr,                     // [sp, #0]
-+//              int32_t *rgb2yuv);              // [sp, #8]
-+
-+function ff_rgb24toyv12_aarch64, export=1
-+        ldr             x15, [sp, #8]
-+        ld1             {v3.s}[2], [x15], #4
-+        ld1             {v3.s}[1], [x15], #4
-+        ld1             {v3.s}[0], [x15], #4
-+        ld1             {v4.s}[2], [x15], #4
-+        ld1             {v4.s}[1], [x15], #4
-+        ld1             {v4.s}[0], [x15], #4
-+        ld1             {v5.s}[2], [x15], #4
-+        ld1             {v5.s}[1], [x15], #4
-+        ld1             {v5.s}[0], [x15]
-+        b               99f
-+endfunc
-+
-+// void ff_bgr24toyv12_aarch64(
-+//              const uint8_t *src,             // x0
-+//              uint8_t *ydst,                  // x1
-+//              uint8_t *udst,                  // x2
-+//              uint8_t *vdst,                  // x3
-+//              int width,                      // w4
-+//              int height,                     // w5
-+//              int lumStride,                  // w6
-+//              int chromStride,                // w7
-+//              int srcStr,                     // [sp, #0]
-+//              int32_t *rgb2yuv);              // [sp, #8]
-+
-+function ff_bgr24toyv12_aarch64, export=1
-+        ldr             x15, [sp, #8]
-+        ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
-+        ld3             {v3.s, v4.s, v5.s}[1], [x15], #12
-+        ld3             {v3.s, v4.s, v5.s}[2], [x15]
-+99:
-+        ldr             w14, [sp, #0]
-+        movi            v18.8b, #128
-+        uxtl            v17.8h, v18.8b
-+
-+        // Even line - YUV
-+1:
-+        mov             x10, x0
-+        mov             x11, x1
-+        mov             x12, x2
-+        mov             x13, x3
-+        mov             w9,  w4
-+
-+0:
-+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
-+
-+        uxtl2           v20.8h, v0.16b
-+        uxtl2           v21.8h, v1.16b
-+        uxtl2           v22.8h, v2.16b
-+
-+        uxtl            v0.8h, v0.8b
-+        uxtl            v1.8h, v1.8b
-+        uxtl            v2.8h, v2.8b
-+        // Y0
-+        smull           v6.4s, v0.4h, v3.h[0]
-+        smull2          v7.4s, v0.8h, v3.h[0]
-+        smlal           v6.4s, v1.4h, v4.h[0]
-+        smlal2          v7.4s, v1.8h, v4.h[0]
-+        smlal           v6.4s, v2.4h, v5.h[0]
-+        smlal2          v7.4s, v2.8h, v5.h[0]
-+        shrn            v6.4h, v6.4s, #12
-+        shrn2           v6.8h, v7.4s, #12
-+        add             v6.8h, v6.8h, v17.8h     // +128 (>> 3 = 16)
-+        uqrshrn         v16.8b, v6.8h, #3
-+        // Y1
-+        smull           v6.4s, v20.4h, v3.h[0]
-+        smull2          v7.4s, v20.8h, v3.h[0]
-+        smlal           v6.4s, v21.4h, v4.h[0]
-+        smlal2          v7.4s, v21.8h, v4.h[0]
-+        smlal           v6.4s, v22.4h, v5.h[0]
-+        smlal2          v7.4s, v22.8h, v5.h[0]
-+        shrn            v6.4h, v6.4s, #12
-+        shrn2           v6.8h, v7.4s, #12
-+        add             v6.8h, v6.8h, v17.8h
-+        uqrshrn2        v16.16b, v6.8h, #3
-+        // Y0/Y1
-+        st1             {v16.16b}, [x11], #16
-+
-+        uzp1            v0.8h, v0.8h, v20.8h
-+        uzp1            v1.8h, v1.8h, v21.8h
-+        uzp1            v2.8h, v2.8h, v22.8h
-+
-+        // U
-+        // Vector subscript *2 as we loaded into S but are only using H
-+        smull           v6.4s, v0.4h, v3.h[2]
-+        smull2          v7.4s, v0.8h, v3.h[2]
-+        smlal           v6.4s, v1.4h, v4.h[2]
-+        smlal2          v7.4s, v1.8h, v4.h[2]
-+        smlal           v6.4s, v2.4h, v5.h[2]
-+        smlal2          v7.4s, v2.8h, v5.h[2]
-+        shrn            v6.4h, v6.4s, #14
-+        shrn2           v6.8h, v7.4s, #14
-+        sqrshrn         v6.8b, v6.8h, #1
-+        add             v6.8b, v6.8b, v18.8b     // +128
-+        st1             {v6.8b}, [x12], #8
-+
-+        // V
-+        smull           v6.4s, v0.4h, v3.h[4]
-+        smull2          v7.4s, v0.8h, v3.h[4]
-+        smlal           v6.4s, v1.4h, v4.h[4]
-+        smlal2          v7.4s, v1.8h, v4.h[4]
-+        smlal           v6.4s, v2.4h, v5.h[4]
-+        smlal2          v7.4s, v2.8h, v5.h[4]
-+        shrn            v6.4h, v6.4s, #14
-+        shrn2           v6.8h, v7.4s, #14
-+        sqrshrn         v6.8b, v6.8h, #1
-+        add             v6.8b, v6.8b, v18.8b     // +128
-+        st1             {v6.8b}, [x13], #8
-+
-+        subs            w9, w9, #16
-+        b.gt            0b
-+
-+        // Odd line - Y only
-+
-+        add             x0, x0, w14, SXTX
-+        add             x1, x1, w6, SXTX
-+        mov             x10, x0
-+        mov             x11, x1
-+        mov             w9,  w4
-+
-+0:
-+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
-+
-+        uxtl2           v20.8h, v0.16b
-+        uxtl2           v21.8h, v1.16b
-+        uxtl2           v22.8h, v2.16b
-+
-+        uxtl            v0.8h, v0.8b
-+        uxtl            v1.8h, v1.8b
-+        uxtl            v2.8h, v2.8b
-+        // Y0
-+        smull           v6.4s, v0.4h, v3.h[0]
-+        smull2          v7.4s, v0.8h, v3.h[0]
-+        smlal           v6.4s, v1.4h, v4.h[0]
-+        smlal2          v7.4s, v1.8h, v4.h[0]
-+        smlal           v6.4s, v2.4h, v5.h[0]
-+        smlal2          v7.4s, v2.8h, v5.h[0]
-+        shrn            v6.4h, v6.4s, #12
-+        shrn2           v6.8h, v7.4s, #12
-+        add             v6.8h, v6.8h, v17.8h
-+        uqrshrn         v16.8b, v6.8h, #3
-+        // Y1
-+        smull           v6.4s, v20.4h, v3.h[0]
-+        smull2          v7.4s, v20.8h, v3.h[0]
-+        smlal           v6.4s, v21.4h, v4.h[0]
-+        smlal2          v7.4s, v21.8h, v4.h[0]
-+        smlal           v6.4s, v22.4h, v5.h[0]
-+        smlal2          v7.4s, v22.8h, v5.h[0]
-+        shrn            v6.4h, v6.4s, #12
-+        shrn2           v6.8h, v7.4s, #12
-+        add             v6.8h, v6.8h, v17.8h
-+        uqrshrn2        v16.16b, v6.8h, #3
-+        // Y0/Y1
-+        st1             {v16.16b}, [x11], #16
-+
-+        subs            w9, w9, #16
-+        b.gt            0b
-+
-+        add             x0, x0, w14, SXTX
-+        add             x1, x1, w6, SXTX
-+        add             x2, x2, w7, SXTX
-+        add             x3, x3, w7, SXTX
-+        subs            w5, w5, #2
-+        b.gt            1b
-+
-+        ret
-+endfunc
-
-From f62603136ee2eaf781519bd70e445b03f80960da Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 27 Apr 2023 13:03:52 +0000
-Subject: [PATCH 126/136] rgb2rgb: Fix rgb24->yuv420p with arbitrary wxh
-
-(cherry picked from commit 58771fdf0218dc670d8a343824f540e2f6e8785d)
----
- libswscale/aarch64/rgb2rgb.c      |   5 +-
- libswscale/aarch64/rgb2rgb_neon.S | 440 ++++++++++++++++++++++++------
- 2 files changed, 355 insertions(+), 90 deletions(-)
-
-diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
-index 6d3e0000dc..f10c4ef2de 100644
---- a/libswscale/aarch64/rgb2rgb.c
-+++ b/libswscale/aarch64/rgb2rgb.c
-@@ -44,8 +44,9 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
- 
- static inline int chkw(const int width, const int lumStride, const int chromStride)
- {
--    const int aw = FFALIGN(width, 16);
--    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
-+//    const int aw = FFALIGN(width, 16);
-+//    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
-+    return 1;
- }
- 
- static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
-index 8cf40b65f5..978ab443ea 100644
---- a/libswscale/aarch64/rgb2rgb_neon.S
-+++ b/libswscale/aarch64/rgb2rgb_neon.S
-@@ -116,6 +116,25 @@ endfunc
- //              int srcStr,                     // [sp, #0]
- //              int32_t *rgb2yuv);              // [sp, #8]
- 
-+// regs
-+// v0-2         Src bytes - reused as chroma src
-+// v3-5         Coeffs (packed very inefficiently - could be squashed)
-+// v6           128b
-+// v7           128h
-+// v8-15        Reserved
-+// v16-18       Lo Src expanded as H
-+// v19          -
-+// v20-22       Hi Src expanded as H
-+// v23          -
-+// v24          U out
-+// v25          U tmp
-+// v26          Y out
-+// v27-29       Y tmp
-+// v30          V out
-+// v31          V tmp
-+
-+// Assumes Little Endian in tail stores & conversion matrix
-+
- function ff_bgr24toyv12_aarch64, export=1
-         ldr             x15, [sp, #8]
-         ld3             {v3.s, v4.s, v5.s}[0], [x15], #12
-@@ -123,138 +142,383 @@ function ff_bgr24toyv12_aarch64, export=1
-         ld3             {v3.s, v4.s, v5.s}[2], [x15]
- 99:
-         ldr             w14, [sp, #0]
--        movi            v18.8b, #128
--        uxtl            v17.8h, v18.8b
--
--        // Even line - YUV
-+        movi            v7.8b, #128
-+        uxtl            v6.8h, v7.8b
-+        // Ensure if nothing to do then we do nothing
-+        cmp             w4, #0
-+        b.le            90f
-+        cmp             w5, #0
-+        b.le            90f
-+        // If w % 16 != 0 then -16 so we do main loop 1 fewer times with
-+        // the remainder done in the tail
-+        tst             w4, #15
-+        b.eq            1f
-+        sub             w4, w4, #16
- 1:
-+
-+// -------------------- Even line body - YUV
-+11:
-+        subs            w9,  w4, #0
-         mov             x10, x0
-         mov             x11, x1
-         mov             x12, x2
-         mov             x13, x3
--        mov             w9,  w4
-+        b.lt            12f
- 
--0:
-         ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
-+        subs            w9, w9, #16
-+        b.le            13f
-+
-+10:
-+        uxtl            v16.8h, v0.8b
-+        uxtl            v17.8h, v1.8b
-+        uxtl            v18.8h, v2.8b
- 
-         uxtl2           v20.8h, v0.16b
-         uxtl2           v21.8h, v1.16b
-         uxtl2           v22.8h, v2.16b
- 
--        uxtl            v0.8h, v0.8b
--        uxtl            v1.8h, v1.8b
--        uxtl            v2.8h, v2.8b
-+        bic             v0.8h, #0xff, LSL #8
-+        bic             v1.8h, #0xff, LSL #8
-+        bic             v2.8h, #0xff, LSL #8
-+
-+        // Testing shows it is faster to stack the smull/smlal ops together
-+        // rather than interleave them between channels and indeed even the
-+        // shift/add sections seem happier not interleaved
-+
-         // Y0
--        smull           v6.4s, v0.4h, v3.h[0]
--        smull2          v7.4s, v0.8h, v3.h[0]
--        smlal           v6.4s, v1.4h, v4.h[0]
--        smlal2          v7.4s, v1.8h, v4.h[0]
--        smlal           v6.4s, v2.4h, v5.h[0]
--        smlal2          v7.4s, v2.8h, v5.h[0]
--        shrn            v6.4h, v6.4s, #12
--        shrn2           v6.8h, v7.4s, #12
--        add             v6.8h, v6.8h, v17.8h     // +128 (>> 3 = 16)
--        uqrshrn         v16.8b, v6.8h, #3
-+        smull           v26.4s, v16.4h, v3.h[0]
-+        smlal           v26.4s, v17.4h, v4.h[0]
-+        smlal           v26.4s, v18.4h, v5.h[0]
-+        smull2          v27.4s, v16.8h, v3.h[0]
-+        smlal2          v27.4s, v17.8h, v4.h[0]
-+        smlal2          v27.4s, v18.8h, v5.h[0]
-         // Y1
--        smull           v6.4s, v20.4h, v3.h[0]
--        smull2          v7.4s, v20.8h, v3.h[0]
--        smlal           v6.4s, v21.4h, v4.h[0]
--        smlal2          v7.4s, v21.8h, v4.h[0]
--        smlal           v6.4s, v22.4h, v5.h[0]
--        smlal2          v7.4s, v22.8h, v5.h[0]
--        shrn            v6.4h, v6.4s, #12
--        shrn2           v6.8h, v7.4s, #12
--        add             v6.8h, v6.8h, v17.8h
--        uqrshrn2        v16.16b, v6.8h, #3
-+        smull           v28.4s, v20.4h, v3.h[0]
-+        smlal           v28.4s, v21.4h, v4.h[0]
-+        smlal           v28.4s, v22.4h, v5.h[0]
-+        smull2          v29.4s, v20.8h, v3.h[0]
-+        smlal2          v29.4s, v21.8h, v4.h[0]
-+        smlal2          v29.4s, v22.8h, v5.h[0]
-+        shrn            v26.4h, v26.4s, #12
-+        shrn2           v26.8h, v27.4s, #12
-+        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
-+        uqrshrn         v26.8b, v26.8h, #3
-+        shrn            v28.4h, v28.4s, #12
-+        shrn2           v28.8h, v29.4s, #12
-+        add             v28.8h, v28.8h, v6.8h
-+        uqrshrn2        v26.16b, v28.8h, #3
-         // Y0/Y1
--        st1             {v16.16b}, [x11], #16
--
--        uzp1            v0.8h, v0.8h, v20.8h
--        uzp1            v1.8h, v1.8h, v21.8h
--        uzp1            v2.8h, v2.8h, v22.8h
- 
-         // U
-         // Vector subscript *2 as we loaded into S but are only using H
--        smull           v6.4s, v0.4h, v3.h[2]
--        smull2          v7.4s, v0.8h, v3.h[2]
--        smlal           v6.4s, v1.4h, v4.h[2]
--        smlal2          v7.4s, v1.8h, v4.h[2]
--        smlal           v6.4s, v2.4h, v5.h[2]
--        smlal2          v7.4s, v2.8h, v5.h[2]
--        shrn            v6.4h, v6.4s, #14
--        shrn2           v6.8h, v7.4s, #14
--        sqrshrn         v6.8b, v6.8h, #1
--        add             v6.8b, v6.8b, v18.8b     // +128
--        st1             {v6.8b}, [x12], #8
-+        smull           v24.4s, v0.4h, v3.h[2]
-+        smlal           v24.4s, v1.4h, v4.h[2]
-+        smlal           v24.4s, v2.4h, v5.h[2]
-+        smull2          v25.4s, v0.8h, v3.h[2]
-+        smlal2          v25.4s, v1.8h, v4.h[2]
-+        smlal2          v25.4s, v2.8h, v5.h[2]
- 
-         // V
--        smull           v6.4s, v0.4h, v3.h[4]
--        smull2          v7.4s, v0.8h, v3.h[4]
--        smlal           v6.4s, v1.4h, v4.h[4]
--        smlal2          v7.4s, v1.8h, v4.h[4]
--        smlal           v6.4s, v2.4h, v5.h[4]
--        smlal2          v7.4s, v2.8h, v5.h[4]
--        shrn            v6.4h, v6.4s, #14
--        shrn2           v6.8h, v7.4s, #14
--        sqrshrn         v6.8b, v6.8h, #1
--        add             v6.8b, v6.8b, v18.8b     // +128
--        st1             {v6.8b}, [x13], #8
-+        smull           v30.4s, v0.4h, v3.h[4]
-+        smlal           v30.4s, v1.4h, v4.h[4]
-+        smlal           v30.4s, v2.4h, v5.h[4]
-+        smull2          v31.4s, v0.8h, v3.h[4]
-+        smlal2          v31.4s, v1.8h, v4.h[4]
-+        smlal2          v31.4s, v2.8h, v5.h[4]
-+
-+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
-+
-+        shrn            v24.4h, v24.4s, #14
-+        shrn2           v24.8h, v25.4s, #14
-+        sqrshrn         v24.8b, v24.8h, #1
-+        add             v24.8b, v24.8b, v7.8b     // +128
-+        shrn            v30.4h, v30.4s, #14
-+        shrn2           v30.8h, v31.4s, #14
-+        sqrshrn         v30.8b, v30.8h, #1
-+        add             v30.8b, v30.8b, v7.8b     // +128
- 
-         subs            w9, w9, #16
--        b.gt            0b
- 
--        // Odd line - Y only
-+        st1             {v26.16b}, [x11], #16
-+        st1             {v24.8b}, [x12], #8
-+        st1             {v30.8b}, [x13], #8
-+
-+        b.gt            10b
-+
-+// -------------------- Even line tail - YUV
-+// If width % 16 == 0 then simply runs once with preloaded RGB
-+// If other then deals with preload & then does remaining tail
-+
-+13:
-+        // Body is simple copy of main loop body minus preload
-+
-+        uxtl            v16.8h, v0.8b
-+        uxtl            v17.8h, v1.8b
-+        uxtl            v18.8h, v2.8b
-+
-+        uxtl2           v20.8h, v0.16b
-+        uxtl2           v21.8h, v1.16b
-+        uxtl2           v22.8h, v2.16b
-+
-+        bic             v0.8h, #0xff, LSL #8
-+        bic             v1.8h, #0xff, LSL #8
-+        bic             v2.8h, #0xff, LSL #8
-+
-+        // Y0
-+        smull           v26.4s, v16.4h, v3.h[0]
-+        smlal           v26.4s, v17.4h, v4.h[0]
-+        smlal           v26.4s, v18.4h, v5.h[0]
-+        smull2          v27.4s, v16.8h, v3.h[0]
-+        smlal2          v27.4s, v17.8h, v4.h[0]
-+        smlal2          v27.4s, v18.8h, v5.h[0]
-+        // Y1
-+        smull           v28.4s, v20.4h, v3.h[0]
-+        smlal           v28.4s, v21.4h, v4.h[0]
-+        smlal           v28.4s, v22.4h, v5.h[0]
-+        smull2          v29.4s, v20.8h, v3.h[0]
-+        smlal2          v29.4s, v21.8h, v4.h[0]
-+        smlal2          v29.4s, v22.8h, v5.h[0]
-+        shrn            v26.4h, v26.4s, #12
-+        shrn2           v26.8h, v27.4s, #12
-+        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
-+        uqrshrn         v26.8b, v26.8h, #3
-+        shrn            v28.4h, v28.4s, #12
-+        shrn2           v28.8h, v29.4s, #12
-+        add             v28.8h, v28.8h, v6.8h
-+        uqrshrn2        v26.16b, v28.8h, #3
-+        // Y0/Y1
-+
-+        // U
-+        // Vector subscript *2 as we loaded into S but are only using H
-+        smull           v24.4s, v0.4h, v3.h[2]
-+        smlal           v24.4s, v1.4h, v4.h[2]
-+        smlal           v24.4s, v2.4h, v5.h[2]
-+        smull2          v25.4s, v0.8h, v3.h[2]
-+        smlal2          v25.4s, v1.8h, v4.h[2]
-+        smlal2          v25.4s, v2.8h, v5.h[2]
- 
-+        // V
-+        smull           v30.4s, v0.4h, v3.h[4]
-+        smlal           v30.4s, v1.4h, v4.h[4]
-+        smlal           v30.4s, v2.4h, v5.h[4]
-+        smull2          v31.4s, v0.8h, v3.h[4]
-+        smlal2          v31.4s, v1.8h, v4.h[4]
-+        smlal2          v31.4s, v2.8h, v5.h[4]
-+
-+        cmp             w9, #-16
-+
-+        shrn            v24.4h, v24.4s, #14
-+        shrn2           v24.8h, v25.4s, #14
-+        sqrshrn         v24.8b, v24.8h, #1
-+        add             v24.8b, v24.8b, v7.8b     // +128
-+        shrn            v30.4h, v30.4s, #14
-+        shrn2           v30.8h, v31.4s, #14
-+        sqrshrn         v30.8b, v30.8h, #1
-+        add             v30.8b, v30.8b, v7.8b     // +128
-+
-+        // Here:
-+        // w9 == 0      width % 16 == 0, tail done
-+        // w9 > -16     1st tail done (16 pels), remainder still to go
-+        // w9 == -16    shouldn't happen
-+        // w9 > -32     2nd tail done
-+        // w9 <= -32    shouldn't happen
-+
-+        b.lt            2f
-+        st1             {v26.16b}, [x11], #16
-+        st1             {v24.8b}, [x12], #8
-+        st1             {v30.8b}, [x13], #8
-+        cbz             w9, 3f
-+
-+12:
-+        sub             w9, w9, #16
-+
-+        tbz             w9, #3, 1f
-+        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
-+1:      tbz             w9, #2, 1f
-+        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
-+1:      tbz             w9, #1, 1f
-+        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
-+1:      tbz             w9, #0, 13b
-+        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
-+        b               13b
-+
-+2:
-+        tbz             w9, #3, 1f
-+        st1             {v26.8b},    [x11], #8
-+        st1             {v24.s}[0],  [x12], #4
-+        st1             {v30.s}[0],  [x13], #4
-+1:      tbz             w9, #2, 1f
-+        st1             {v26.s}[2],  [x11], #4
-+        st1             {v24.h}[2],  [x12], #2
-+        st1             {v30.h}[2],  [x13], #2
-+1:      tbz             w9, #1, 1f
-+        st1             {v26.h}[6],  [x11], #2
-+        st1             {v24.b}[6],  [x12], #1
-+        st1             {v30.b}[6],  [x13], #1
-+1:      tbz             w9, #0, 1f
-+        st1             {v26.b}[14], [x11]
-+        st1             {v24.b}[7],  [x12]
-+        st1             {v30.b}[7],  [x13]
-+1:
-+3:
-+
-+// -------------------- Odd line body - Y only
-+
-+        subs            w5, w5, #1
-+        b.eq            90f
-+
-+        subs            w9,  w4, #0
-         add             x0, x0, w14, SXTX
-         add             x1, x1, w6, SXTX
-         mov             x10, x0
-         mov             x11, x1
--        mov             w9,  w4
-+        b.lt            12f
- 
--0:
-         ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
-+        subs            w9, w9, #16
-+        b.le            13f
-+
-+10:
-+        uxtl            v16.8h, v0.8b
-+        uxtl            v17.8h, v1.8b
-+        uxtl            v18.8h, v2.8b
- 
-         uxtl2           v20.8h, v0.16b
-         uxtl2           v21.8h, v1.16b
-         uxtl2           v22.8h, v2.16b
- 
--        uxtl            v0.8h, v0.8b
--        uxtl            v1.8h, v1.8b
--        uxtl            v2.8h, v2.8b
-+        // Testing shows it is faster to stack the smull/smlal ops together
-+        // rather than interleave them between channels and indeed even the
-+        // shift/add sections seem happier not interleaved
-+
-         // Y0
--        smull           v6.4s, v0.4h, v3.h[0]
--        smull2          v7.4s, v0.8h, v3.h[0]
--        smlal           v6.4s, v1.4h, v4.h[0]
--        smlal2          v7.4s, v1.8h, v4.h[0]
--        smlal           v6.4s, v2.4h, v5.h[0]
--        smlal2          v7.4s, v2.8h, v5.h[0]
--        shrn            v6.4h, v6.4s, #12
--        shrn2           v6.8h, v7.4s, #12
--        add             v6.8h, v6.8h, v17.8h
--        uqrshrn         v16.8b, v6.8h, #3
-+        smull           v26.4s, v16.4h, v3.h[0]
-+        smlal           v26.4s, v17.4h, v4.h[0]
-+        smlal           v26.4s, v18.4h, v5.h[0]
-+        smull2          v27.4s, v16.8h, v3.h[0]
-+        smlal2          v27.4s, v17.8h, v4.h[0]
-+        smlal2          v27.4s, v18.8h, v5.h[0]
-         // Y1
--        smull           v6.4s, v20.4h, v3.h[0]
--        smull2          v7.4s, v20.8h, v3.h[0]
--        smlal           v6.4s, v21.4h, v4.h[0]
--        smlal2          v7.4s, v21.8h, v4.h[0]
--        smlal           v6.4s, v22.4h, v5.h[0]
--        smlal2          v7.4s, v22.8h, v5.h[0]
--        shrn            v6.4h, v6.4s, #12
--        shrn2           v6.8h, v7.4s, #12
--        add             v6.8h, v6.8h, v17.8h
--        uqrshrn2        v16.16b, v6.8h, #3
-+        smull           v28.4s, v20.4h, v3.h[0]
-+        smlal           v28.4s, v21.4h, v4.h[0]
-+        smlal           v28.4s, v22.4h, v5.h[0]
-+        smull2          v29.4s, v20.8h, v3.h[0]
-+        smlal2          v29.4s, v21.8h, v4.h[0]
-+        smlal2          v29.4s, v22.8h, v5.h[0]
-+
-+        ld3             {v0.16b, v1.16b, v2.16b}, [x10], #48
-+
-+        shrn            v26.4h, v26.4s, #12
-+        shrn2           v26.8h, v27.4s, #12
-+        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
-+        uqrshrn         v26.8b, v26.8h, #3
-+        shrn            v28.4h, v28.4s, #12
-+        shrn2           v28.8h, v29.4s, #12
-+        add             v28.8h, v28.8h, v6.8h
-+        uqrshrn2        v26.16b, v28.8h, #3
-         // Y0/Y1
--        st1             {v16.16b}, [x11], #16
- 
-         subs            w9, w9, #16
--        b.gt            0b
-+
-+        st1             {v26.16b}, [x11], #16
-+
-+        b.gt            10b
-+
-+// -------------------- Odd line tail - Y
-+// If width % 16 == 0 then simply runs once with preloaded RGB
-+// If other then deals with preload & then does remaining tail
-+
-+13:
-+        // Body is simple copy of main loop body minus preload
-+
-+        uxtl            v16.8h, v0.8b
-+        uxtl            v17.8h, v1.8b
-+        uxtl            v18.8h, v2.8b
-+
-+        uxtl2           v20.8h, v0.16b
-+        uxtl2           v21.8h, v1.16b
-+        uxtl2           v22.8h, v2.16b
-+
-+        // Y0
-+        smull           v26.4s, v16.4h, v3.h[0]
-+        smlal           v26.4s, v17.4h, v4.h[0]
-+        smlal           v26.4s, v18.4h, v5.h[0]
-+        smull2          v27.4s, v16.8h, v3.h[0]
-+        smlal2          v27.4s, v17.8h, v4.h[0]
-+        smlal2          v27.4s, v18.8h, v5.h[0]
-+        // Y1
-+        smull           v28.4s, v20.4h, v3.h[0]
-+        smlal           v28.4s, v21.4h, v4.h[0]
-+        smlal           v28.4s, v22.4h, v5.h[0]
-+        smull2          v29.4s, v20.8h, v3.h[0]
-+        smlal2          v29.4s, v21.8h, v4.h[0]
-+        smlal2          v29.4s, v22.8h, v5.h[0]
-+
-+        cmp             w9, #-16
-+
-+        shrn            v26.4h, v26.4s, #12
-+        shrn2           v26.8h, v27.4s, #12
-+        add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
-+        uqrshrn         v26.8b, v26.8h, #3
-+        shrn            v28.4h, v28.4s, #12
-+        shrn2           v28.8h, v29.4s, #12
-+        add             v28.8h, v28.8h, v6.8h
-+        uqrshrn2        v26.16b, v28.8h, #3
-+        // Y0/Y1
-+
-+        // Here:
-+        // w9 == 0      width % 16 == 0, tail done
-+        // w9 > -16     1st tail done (16 pels), remainder still to go
-+        // w9 == -16    shouldn't happen
-+        // w9 > -32     2nd tail done
-+        // w9 <= -32    shouldn't happen
-+
-+        b.lt            2f
-+        st1             {v26.16b}, [x11], #16
-+        cbz             w9, 3f
-+
-+12:
-+        sub             w9, w9, #16
-+
-+        tbz             w9, #3, 1f
-+        ld3             {v0.8b, v1.8b, v2.8b},  [x10], #24
-+1:      tbz             w9, #2, 1f
-+        ld3             {v0.b, v1.b, v2.b}[8],  [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[9],  [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[10], [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[11], [x10], #3
-+1:      tbz             w9, #1, 1f
-+        ld3             {v0.b, v1.b, v2.b}[12], [x10], #3
-+        ld3             {v0.b, v1.b, v2.b}[13], [x10], #3
-+1:      tbz             w9, #0, 13b
-+        ld3             {v0.b, v1.b, v2.b}[14], [x10], #3
-+        b               13b
-+
-+2:
-+        tbz             w9, #3, 1f
-+        st1             {v26.8b},    [x11], #8
-+1:      tbz             w9, #2, 1f
-+        st1             {v26.s}[2],  [x11], #4
-+1:      tbz             w9, #1, 1f
-+        st1             {v26.h}[6],  [x11], #2
-+1:      tbz             w9, #0, 1f
-+        st1             {v26.b}[14], [x11]
-+1:
-+3:
-+
-+// ------------------- Loop to start
- 
-         add             x0, x0, w14, SXTX
-         add             x1, x1, w6, SXTX
-         add             x2, x2, w7, SXTX
-         add             x3, x3, w7, SXTX
--        subs            w5, w5, #2
--        b.gt            1b
--
-+        subs            w5, w5, #1
-+        b.gt            11b
-+90:
-         ret
- endfunc
-
-From cf020c89ac47620c4a5390d0333e9ea70fbfa7b8 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 26 Apr 2023 15:36:07 +0000
-Subject: [PATCH 127/136] rgb2rgb: Use asm unconditionally
-
-(cherry picked from commit 7c216c0804836b31c0ea093bb1dde5ab387724b1)
----
- libswscale/aarch64/rgb2rgb.c | 37 ++----------------------------------
- 1 file changed, 2 insertions(+), 35 deletions(-)
-
-diff --git a/libswscale/aarch64/rgb2rgb.c b/libswscale/aarch64/rgb2rgb.c
-index f10c4ef2de..6a0e2dcc09 100644
---- a/libswscale/aarch64/rgb2rgb.c
-+++ b/libswscale/aarch64/rgb2rgb.c
-@@ -37,46 +37,13 @@ void ff_rgb24toyv12_aarch64(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-                    uint8_t *vdst, int width, int height, int lumStride,
-                    int chromStride, int srcStride, int32_t *rgb2yuv);
- 
--// RGB to YUV asm fns process 16 pixels at once so ensure that the output
--// will fit into the stride. ARM64 should cope with unaligned SIMD r/w so
--// don't test for that
--// Fall back to C if we cannot use asm
--
--static inline int chkw(const int width, const int lumStride, const int chromStride)
--{
--//    const int aw = FFALIGN(width, 16);
--//    return aw <= FFABS(lumStride) && aw <= FFABS(chromStride) * 2;
--    return 1;
--}
--
--static void rgb24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
--                   uint8_t *vdst, int width, int height, int lumStride,
--                   int chromStride, int srcStride, int32_t *rgb2yuv)
--{
--    if (chkw(width, lumStride, chromStride))
--        ff_rgb24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
--    else
--        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, rgb2yuv);
--}
--
--static void bgr24toyv12_check(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
--                   uint8_t *vdst, int width, int height, int lumStride,
--                   int chromStride, int srcStride, int32_t *bgr2yuv)
--{
--    if (chkw(width, lumStride, chromStride))
--        ff_bgr24toyv12_aarch64(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
--    else
--        ff_bgr24toyv12_c(src, ydst, udst, vdst, width, height, lumStride, chromStride, srcStride, bgr2yuv);
--}
--
--
- av_cold void rgb2rgb_init_aarch64(void)
- {
-     int cpu_flags = av_get_cpu_flags();
- 
-     if (have_neon(cpu_flags)) {
-         interleaveBytes = ff_interleave_bytes_neon;
--        ff_rgb24toyv12 = rgb24toyv12_check;
--        ff_bgr24toyv12 = bgr24toyv12_check;
-+        ff_rgb24toyv12 = ff_rgb24toyv12_aarch64;
-+        ff_bgr24toyv12 = ff_bgr24toyv12_aarch64;
-     }
- }
-
-From 1895fdcaf403f403736ab52d1cb69dce7c964b66 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 27 Apr 2023 13:01:43 +0000
-Subject: [PATCH 128/136] tests/swscale: Add options for width and height on
- the command line
-
-(cherry picked from commit eb8a09779688fc05bf204fdfcd063b04cda07271)
----
- libswscale/tests/swscale.c | 84 ++++++++++++++++++++++++++------------
- 1 file changed, 59 insertions(+), 25 deletions(-)
-
-diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
-index 6c38041ddb..4cf41d9f64 100644
---- a/libswscale/tests/swscale.c
-+++ b/libswscale/tests/swscale.c
-@@ -355,56 +355,71 @@ static int fileTest(const uint8_t * const ref[4], int refStride[4],
-     return 0;
- }
- 
--#define W 96
--#define H 96
--
- int main(int argc, char **argv)
- {
-+    unsigned int W = 96;
-+    unsigned int H = 96;
-+    unsigned int W2;
-+    unsigned int H2;
-+    unsigned int S;
-     enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
-     enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
--    uint8_t *rgb_data   = av_malloc(W * H * 4);
--    const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
--    int rgb_stride[4]   = { 4 * W, 0, 0, 0 };
--    uint8_t *data       = av_malloc(4 * W * H);
--    const uint8_t * const src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
--    int stride[4]       = { W, W, W, W };
-     int x, y;
-     struct SwsContext *sws;
-     AVLFG rand;
-     int res = -1;
-     int i;
-     FILE *fp = NULL;
--
--    if (!rgb_data || !data)
--        return -1;
-+    uint8_t *rgb_data;
-+    uint8_t * rgb_src[4] = { NULL };
-+    int rgb_stride[4]   = { 0 };
-+    uint8_t *data;
-+    uint8_t * src[4] = { NULL };
-+    int stride[4]       = { 0 };
- 
-     for (i = 1; i < argc; i += 2) {
-+        const char * const arg2 = argv[i+1];
-+
-         if (argv[i][0] != '-' || i + 1 == argc)
-             goto bad_option;
-         if (!strcmp(argv[i], "-ref")) {
--            fp = fopen(argv[i + 1], "r");
-+            fp = fopen(arg2, "r");
-             if (!fp) {
--                fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
-+                fprintf(stderr, "could not open '%s'\n", arg2);
-                 goto error;
-             }
-         } else if (!strcmp(argv[i], "-cpuflags")) {
-             unsigned flags = av_get_cpu_flags();
--            int ret = av_parse_cpu_caps(&flags, argv[i + 1]);
-+            int ret = av_parse_cpu_caps(&flags, arg2);
-             if (ret < 0) {
--                fprintf(stderr, "invalid cpu flags %s\n", argv[i + 1]);
-+                fprintf(stderr, "invalid cpu flags %s\n", arg2);
-                 return ret;
-             }
-             av_force_cpu_flags(flags);
-         } else if (!strcmp(argv[i], "-src")) {
--            srcFormat = av_get_pix_fmt(argv[i + 1]);
-+            srcFormat = av_get_pix_fmt(arg2);
-             if (srcFormat == AV_PIX_FMT_NONE) {
--                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
-+                fprintf(stderr, "invalid pixel format %s\n", arg2);
-                 return -1;
-             }
-         } else if (!strcmp(argv[i], "-dst")) {
--            dstFormat = av_get_pix_fmt(argv[i + 1]);
-+            dstFormat = av_get_pix_fmt(arg2);
-             if (dstFormat == AV_PIX_FMT_NONE) {
--                fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
-+                fprintf(stderr, "invalid pixel format %s\n", arg2);
-+                return -1;
-+            }
-+        } else if (!strcmp(argv[i], "-w")) {
-+            char * p = NULL;
-+            W = strtoul(arg2, &p, 0);
-+            if (!W || *p) {
-+                fprintf(stderr, "bad width %s\n", arg2);
-+                return -1;
-+            }
-+        } else if (!strcmp(argv[i], "-h")) {
-+            char * p = NULL;
-+            H = strtoul(arg2, &p, 0);
-+            if (!H || *p) {
-+                fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p);
-                 return -1;
-             }
-         } else {
-@@ -414,15 +429,34 @@ bad_option:
-         }
-     }
- 
--    sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
-+    S = (W + 15) & ~15;
-+    rgb_data   = av_mallocz(S * H * 4);
-+    rgb_src[0] = rgb_data;
-+    rgb_stride[0]   = 4 * S;
-+    data       = av_mallocz(4 * S * H);
-+    src[0] = data;
-+    src[1] = data + S * H;
-+    src[2] = data + S * H * 2;
-+    src[3] = data + S * H * 3;
-+    stride[0] = S;
-+    stride[1] = S;
-+    stride[2] = S;
-+    stride[3] = S;
-+    H2 = H < 96 ? 8 : H / 12;
-+    W2 = W < 96 ? 8 : W / 12;
-+
-+    if (!rgb_data || !data)
-+        return -1;
-+
-+    sws = sws_getContext(W2, H2, AV_PIX_FMT_RGB32, W, H,
-                          AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
- 
-     av_lfg_init(&rand, 1);
- 
-     for (y = 0; y < H; y++)
-         for (x = 0; x < W * 4; x++)
--            rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
--    res = sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, (uint8_t * const *) src, stride);
-+            rgb_data[ x + y * 4 * S] = av_lfg_get(&rand);
-+    res = sws_scale(sws, (const uint8_t * const *)rgb_src, rgb_stride, 0, H2, (uint8_t * const *) src, stride);
-     if (res < 0 || res != H) {
-         res = -1;
-         goto error;
-@@ -431,10 +465,10 @@ bad_option:
-     av_free(rgb_data);
- 
-     if(fp) {
--        res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
-+        res = fileTest((const uint8_t * const *)src, stride, W, H, fp, srcFormat, dstFormat);
-         fclose(fp);
-     } else {
--        selfTest(src, stride, W, H, srcFormat, dstFormat);
-+        selfTest((const uint8_t * const *)src, stride, W, H, srcFormat, dstFormat);
-         res = 0;
-     }
- error:
-
-From 94e48653a6bd1b8438887b486927e87b56651455 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 26 Apr 2023 16:31:23 +0000
-Subject: [PATCH 129/136] tests/swscale: Add a timing option
-
--t <n>   Where n is the number of time to loop the scale op.
-         Often useful to do it 10 times or so for better resolution
-
-(cherry picked from commit 50cd60a23a66254f911376602d07b30fcafbde96)
----
- libswscale/tests/swscale.c | 32 ++++++++++++++++++++++++++++++--
- 1 file changed, 30 insertions(+), 2 deletions(-)
-
-diff --git a/libswscale/tests/swscale.c b/libswscale/tests/swscale.c
-index 4cf41d9f64..12776ffec7 100644
---- a/libswscale/tests/swscale.c
-+++ b/libswscale/tests/swscale.c
-@@ -23,6 +23,7 @@
- #include <string.h>
- #include <inttypes.h>
- #include <stdarg.h>
-+#include <time.h>
- 
- #undef HAVE_AV_CONFIG_H
- #include "libavutil/cpu.h"
-@@ -78,6 +79,15 @@ struct Results {
-     uint32_t crc;
- };
- 
-+static int time_rep = 0;
-+
-+static uint64_t utime(void)
-+{
-+    struct timespec ts;
-+    clock_gettime(CLOCK_MONOTONIC, &ts);
-+    return ts.tv_nsec / 1000 + (uint64_t)ts.tv_sec * 1000000;
-+}
-+
- // test by ref -> src -> dst -> out & compare out against ref
- // ref & out are YV12
- static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
-@@ -174,7 +184,7 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
-         goto end;
-     }
- 
--    printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
-+    printf(" %s %4dx%4d -> %s %4dx%4d flags=%2d",
-            desc_src->name, srcW, srcH,
-            desc_dst->name, dstW, dstH,
-            flags);
-@@ -182,6 +192,17 @@ static int doTest(const uint8_t * const ref[4], int refStride[4], int w, int h,
- 
-     sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
- 
-+    if (time_rep != 0)
-+    {
-+        const uint64_t now = utime();
-+        uint64_t done;
-+        for (i = 1; i != time_rep; ++i) {
-+            sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
-+        }
-+        done = utime();
-+        printf(" T=%7"PRId64"us ", done-now);
-+    }
-+
-     for (i = 0; i < 4 && dstStride[i]; i++)
-         crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
-                      dstStride[i] * dstH);
-@@ -419,7 +440,14 @@ int main(int argc, char **argv)
-             char * p = NULL;
-             H = strtoul(arg2, &p, 0);
-             if (!H || *p) {
--                fprintf(stderr, "bad height '%s' (H=%d, *p=%d)\n", arg2, H, *p);
-+                fprintf(stderr, "bad height '%s'\n", arg2);
-+                return -1;
-+            }
-+        } else if (!strcmp(argv[i], "-t")) {
-+            char * p = NULL;
-+            time_rep = (int)strtol(arg2, &p, 0);
-+            if (*p) {
-+                fprintf(stderr, "bad time repetitions '%s'\n", arg2);
-                 return -1;
-             }
-         } else {
-
-From 406806d0b9d9cb113deb0d083a28cbccabab6825 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 20 Apr 2023 13:40:36 +0000
-Subject: [PATCH 130/136] swscale: RGB->YUV420 fix C template to allow odd
- widths
-
-(cherry picked from commit 08b2023e7b5292df0adc6593e4d20087f9cef5c8)
----
- libswscale/rgb2rgb_template.c | 44 +++++++++++++++++++++++++++++++++++
- libswscale/swscale_unscaled.c | 11 ++++-----
- 2 files changed, 49 insertions(+), 6 deletions(-)
-
-diff --git a/libswscale/rgb2rgb_template.c b/libswscale/rgb2rgb_template.c
-index 703de90690..e711589e1e 100644
---- a/libswscale/rgb2rgb_template.c
-+++ b/libswscale/rgb2rgb_template.c
-@@ -679,6 +679,19 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-             ydst[2 * i + 1] = Y;
-         }
-+        if ((width & 1) != 0) {
-+            unsigned int b = src[6 * i + 0];
-+            unsigned int g = src[6 * i + 1];
-+            unsigned int r = src[6 * i + 2];
-+
-+            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
-+            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
-+            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
-+
-+            udst[i]     = U;
-+            vdst[i]     = V;
-+            ydst[2 * i] = Y;
-+        }
-         ydst += lumStride;
-         src  += srcStride;
- 
-@@ -701,6 +714,15 @@ static void rgb24toyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-             ydst[2 * i + 1] = Y;
-         }
-+        if ((width & 1) != 0) {
-+            unsigned int b = src[6 * i + 0];
-+            unsigned int g = src[6 * i + 1];
-+            unsigned int r = src[6 * i + 2];
-+
-+            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-+
-+            ydst[2 * i] = Y;
-+        }
-         udst += chromStride;
-         vdst += chromStride;
-         ydst += lumStride;
-@@ -767,6 +789,19 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-             ydst[2 * i + 1] = Y;
-         }
-+        if ((width & 1) != 0) {
-+            unsigned int b = src[8 * i + 2];
-+            unsigned int g = src[8 * i + 1];
-+            unsigned int r = src[8 * i + 0];
-+
-+            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) +  16;
-+            unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
-+            unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
-+
-+            udst[i]     = U;
-+            vdst[i]     = V;
-+            ydst[2 * i] = Y;
-+        }
-         ydst += lumStride;
-         src  += srcStride;
- 
-@@ -789,6 +824,15 @@ static void rgbxtoyv12_x(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
-             Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-             ydst[2 * i + 1] = Y;
-         }
-+        if ((width & 1) != 0) {
-+            unsigned int b = src[8 * i + 2];
-+            unsigned int g = src[8 * i + 1];
-+            unsigned int r = src[8 * i + 0];
-+
-+            unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
-+
-+            ydst[2 * i] = Y;
-+        }
-         udst += chromStride;
-         vdst += chromStride;
-         ydst += lumStride;
-diff --git a/libswscale/swscale_unscaled.c b/libswscale/swscale_unscaled.c
-index 053c06adf5..52469b2e4a 100644
---- a/libswscale/swscale_unscaled.c
-+++ b/libswscale/swscale_unscaled.c
-@@ -2062,7 +2062,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
-     const enum AVPixelFormat dstFormat = c->dstFormat;
-     const int flags = c->flags;
-     const int dstH = c->dstH;
--    const int dstW = c->dstW;
-     int needsDither;
- 
-     needsDither = isAnyRGB(dstFormat) &&
-@@ -2120,12 +2119,12 @@ void ff_get_unscaled_swscale(SwsContext *c)
-     /* bgr24toYV12 */
-     if (srcFormat == AV_PIX_FMT_BGR24 &&
-         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
--        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        !(flags & SWS_ACCURATE_RND))
-         c->convert_unscaled = bgr24ToYv12Wrapper;
-     /* rgb24toYV12 */
-     if (srcFormat == AV_PIX_FMT_RGB24 &&
-         (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
--        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        !(flags & SWS_ACCURATE_RND))
-         c->convert_unscaled = rgb24ToYv12Wrapper;
- 
-     /* bgrxtoYV12 */
-@@ -2136,17 +2135,17 @@ void ff_get_unscaled_swscale(SwsContext *c)
-     /* rgbx24toYV12 */
-     if (((srcFormat == AV_PIX_FMT_RGBA && dstFormat == AV_PIX_FMT_YUV420P) ||
-          (srcFormat == AV_PIX_FMT_RGB0 && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
--        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        !(flags & SWS_ACCURATE_RND))
-         c->convert_unscaled = rgbxToYv12Wrapper;
-     /* xbgrtoYV12 */
-     if (((srcFormat == AV_PIX_FMT_ABGR && dstFormat == AV_PIX_FMT_YUV420P) ||
-          (srcFormat == AV_PIX_FMT_0BGR && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
--        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        !(flags & SWS_ACCURATE_RND))
-         c->convert_unscaled = xbgrToYv12Wrapper;
-     /* xrgb24toYV12 */
-     if (((srcFormat == AV_PIX_FMT_ARGB && dstFormat == AV_PIX_FMT_YUV420P) ||
-          (srcFormat == AV_PIX_FMT_0RGB && (dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))) &&
--        !(flags & SWS_ACCURATE_RND) && !(dstW&1))
-+        !(flags & SWS_ACCURATE_RND))
-         c->convert_unscaled = xrgbToYv12Wrapper;
- 
-     /* RGB/BGR -> RGB/BGR (no dither needed forms) */
-
-From 68c6482d9473ce774e87cac2455a8c7b3e2d99b4 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Thu, 4 May 2023 14:26:14 +0000
-Subject: [PATCH 131/136] rtpenc: Add code to send H264 new extradata in
- sidedata
-
-Fixes issue with pi V4L2 H264 encode which cannot create extradata
-at init time.
-
-(cherry picked from commit 4f852b4b093f841b64b4934a6f1720e98e4e0f2c)
----
- libavformat/rtpenc.c | 18 ++++++++++++++++++
- 1 file changed, 18 insertions(+)
-
-diff --git a/libavformat/rtpenc.c b/libavformat/rtpenc.c
-index a8d296a154..f67dc2a15a 100644
---- a/libavformat/rtpenc.c
-+++ b/libavformat/rtpenc.c
-@@ -19,6 +19,7 @@
-  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-  */
- 
-+#include "avc.h"
- #include "avformat.h"
- #include "mpegts.h"
- #include "internal.h"
-@@ -585,8 +586,25 @@ static int rtp_write_packet(AVFormatContext *s1, AVPacket *pkt)
-         ff_rtp_send_vc2hq(s1, pkt->data, size, st->codecpar->field_order != AV_FIELD_PROGRESSIVE ? 1 : 0);
-         break;
-     case AV_CODEC_ID_H264:
-+    {
-+        uint8_t *side_data;
-+        int side_data_size = 0;
-+
-+        side_data = av_packet_get_side_data(pkt, AV_PKT_DATA_NEW_EXTRADATA,
-+                                            &side_data_size);
-+
-+        if (side_data_size != 0) {
-+            int ps_size = side_data_size;
-+            uint8_t * ps_buf = NULL;
-+
-+            ff_avc_write_annexb_extradata(side_data, &ps_buf, &ps_size);
-+            av_log(s1, AV_LOG_TRACE, "H264: write side data=%d\n", ps_size);
-+            ff_rtp_send_h264_hevc(s1, ps_buf ? ps_buf : side_data, ps_size);
-+            av_free(ps_buf);
-+        }
-         ff_rtp_send_h264_hevc(s1, pkt->data, size);
-         break;
-+    }
-     case AV_CODEC_ID_H261:
-         ff_rtp_send_h261(s1, pkt->data, size);
-         break;
-
-From 5240cc7fc3abed8af5f178c5461ca9fe11a7d5e4 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Mon, 5 Jun 2023 08:34:38 +0000
-Subject: [PATCH 132/136] rgb2rgb: Fix luma narrow+saturation instruction
-
-(cherry picked from commit 9cdac1c08ad5c0aea28907d1d3fd0bdda387955a)
----
- libswscale/aarch64/rgb2rgb_neon.S | 16 ++++++++--------
- 1 file changed, 8 insertions(+), 8 deletions(-)
-
-diff --git a/libswscale/aarch64/rgb2rgb_neon.S b/libswscale/aarch64/rgb2rgb_neon.S
-index 978ab443ea..476ca723a0 100644
---- a/libswscale/aarch64/rgb2rgb_neon.S
-+++ b/libswscale/aarch64/rgb2rgb_neon.S
-@@ -203,11 +203,11 @@ function ff_bgr24toyv12_aarch64, export=1
-         shrn            v26.4h, v26.4s, #12
-         shrn2           v26.8h, v27.4s, #12
-         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
--        uqrshrn         v26.8b, v26.8h, #3
-+        sqrshrun        v26.8b, v26.8h, #3
-         shrn            v28.4h, v28.4s, #12
-         shrn2           v28.8h, v29.4s, #12
-         add             v28.8h, v28.8h, v6.8h
--        uqrshrn2        v26.16b, v28.8h, #3
-+        sqrshrun2       v26.16b, v28.8h, #3
-         // Y0/Y1
- 
-         // U
-@@ -282,11 +282,11 @@ function ff_bgr24toyv12_aarch64, export=1
-         shrn            v26.4h, v26.4s, #12
-         shrn2           v26.8h, v27.4s, #12
-         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
--        uqrshrn         v26.8b, v26.8h, #3
-+        sqrshrun        v26.8b, v26.8h, #3
-         shrn            v28.4h, v28.4s, #12
-         shrn2           v28.8h, v29.4s, #12
-         add             v28.8h, v28.8h, v6.8h
--        uqrshrn2        v26.16b, v28.8h, #3
-+        sqrshrun2       v26.16b, v28.8h, #3
-         // Y0/Y1
- 
-         // U
-@@ -416,11 +416,11 @@ function ff_bgr24toyv12_aarch64, export=1
-         shrn            v26.4h, v26.4s, #12
-         shrn2           v26.8h, v27.4s, #12
-         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
--        uqrshrn         v26.8b, v26.8h, #3
-+        sqrshrun        v26.8b, v26.8h, #3
-         shrn            v28.4h, v28.4s, #12
-         shrn2           v28.8h, v29.4s, #12
-         add             v28.8h, v28.8h, v6.8h
--        uqrshrn2        v26.16b, v28.8h, #3
-+        sqrshrun2       v26.16b, v28.8h, #3
-         // Y0/Y1
- 
-         subs            w9, w9, #16
-@@ -464,11 +464,11 @@ function ff_bgr24toyv12_aarch64, export=1
-         shrn            v26.4h, v26.4s, #12
-         shrn2           v26.8h, v27.4s, #12
-         add             v26.8h, v26.8h, v6.8h     // +128 (>> 3 = 16)
--        uqrshrn         v26.8b, v26.8h, #3
-+        sqrshrun        v26.8b, v26.8h, #3
-         shrn            v28.4h, v28.4s, #12
-         shrn2           v28.8h, v29.4s, #12
-         add             v28.8h, v28.8h, v6.8h
--        uqrshrn2        v26.16b, v28.8h, #3
-+        sqrshrun2       v26.16b, v28.8h, #3
-         // Y0/Y1
- 
-         // Here:
-
-From 9474d9d227f2af488d5d2bd614c5c707479ca3c3 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Sun, 4 Jun 2023 13:37:59 +0000
-Subject: [PATCH 133/136] v4l2_m2m_dec: Tweak pending count to use dts &
- reorder size
-
-(cherry picked from commit ca438b382c90f9a5f58f4708205e6ac25395db2a)
----
- libavcodec/v4l2_m2m.h     |  1 +
- libavcodec/v4l2_m2m_dec.c | 53 +++++++++++++++++++++++++++++++--------
- 2 files changed, 43 insertions(+), 11 deletions(-)
-
-diff --git a/libavcodec/v4l2_m2m.h b/libavcodec/v4l2_m2m.h
-index ded1478a49..a506e69d67 100644
---- a/libavcodec/v4l2_m2m.h
-+++ b/libavcodec/v4l2_m2m.h
-@@ -115,6 +115,7 @@ typedef struct V4L2m2mContext {
- 
-     /* req pkt */
-     int req_pkt;
-+    int reorder_size;
- 
-     /* Ext data sent */
-     int extdata_sent;
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index d124c7b1fc..13af62e819 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -121,13 +121,18 @@ log_dump(void * logctx, int lvl, const void * const data, const size_t len)
- }
- #endif
- 
--static int64_t pts_stats_guess(const pts_stats_t * const stats)
-+static unsigned int pts_stats_interval(const pts_stats_t * const stats)
-+{
-+    return stats->last_interval;
-+}
-+
-+static int64_t pts_stats_guess(const pts_stats_t * const stats, const int fail_bad_guess)
- {
-     if (stats->last_count <= 1)
-         return stats->last_pts;
-     if (stats->last_pts == AV_NOPTS_VALUE ||
--            stats->last_interval == 0 ||
--            stats->last_count >= STATS_LAST_COUNT_MAX)
-+            fail_bad_guess && (stats->last_interval == 0 ||
-+                               stats->last_count >= STATS_LAST_COUNT_MAX))
-         return AV_NOPTS_VALUE;
-     return stats->last_pts + (int64_t)(stats->last_count - 1) * (int64_t)stats->last_interval;
- }
-@@ -345,7 +350,7 @@ set_best_effort_pts(AVCodecContext *const avctx,
- {
-     pts_stats_add(ps, frame->pts);
- 
--    frame->best_effort_timestamp = pts_stats_guess(ps);
-+    frame->best_effort_timestamp = pts_stats_guess(ps, 1);
-     // If we can't guess from just PTS - try DTS
-     if (frame->best_effort_timestamp == AV_NOPTS_VALUE)
-         frame->best_effort_timestamp = frame->pkt_dts;
-@@ -380,15 +385,25 @@ xlat_init(xlat_track_t * const x)
- }
- 
- static int
--xlat_pending(const xlat_track_t * const x)
-+xlat_pending(const V4L2m2mContext * const s)
- {
-+    const xlat_track_t *const x = &s->xlat;
-     unsigned int n = x->track_no % FF_V4L2_M2M_TRACK_SIZE;
-     int i;
--    const int64_t now = x->last_pts;
-+    const int64_t now = pts_stats_guess(&s->pts_stat, 0);
-+    int64_t first_dts = AV_NOPTS_VALUE;
-+    int no_dts_count = 0;
-+    unsigned int interval = pts_stats_interval(&s->pts_stat);
- 
-     for (i = 0; i < FF_V4L2_M2M_TRACK_SIZE; ++i, n = (n - 1) & (FF_V4L2_M2M_TRACK_SIZE - 1)) {
-         const V4L2m2mTrackEl * const t = x->track_els + n;
- 
-+        if (first_dts == AV_NOPTS_VALUE)
-+            if (t->dts == AV_NOPTS_VALUE)
-+                ++no_dts_count;
++    for (int j = 0; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i;
++            *p *= 64;
++            if (a->height == 4)
++                *p /= (const unsigned[]) { 289, 292, 289, 292 } [j];
 +            else
-+                first_dts = t->dts;
-+
-         // Discard only set on never-set or flushed entries
-         // So if we get here we've never successfully decoded a frame so allow
-         // more frames into the buffer before stalling
-@@ -408,6 +423,18 @@ xlat_pending(const xlat_track_t * const x)
-             break;
-     }
- 
-+    if (first_dts != AV_NOPTS_VALUE && now != AV_NOPTS_VALUE && interval != 0 && s->reorder_size != 0) {
-+        const int iframes = (first_dts - now) / (int)interval;
-+        const int t = iframes - s->reorder_size + no_dts_count;
-+
-+//        av_log(s->avctx, AV_LOG_DEBUG, "Last:%"PRId64", Now:%"PRId64", First:%"PRId64", delta=%"PRId64", frames=%d, nodts=%d\n",
-+//               x->last_dts, now, first_dts, first_dts - now, iframes, no_dts_count);
-+
-+        if (iframes > 0 && iframes < 64 && t < i) {
-+            return t;
++                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [j];
++            if (a->width == 4)
++                *p /= (const unsigned[]) { 289, 292, 289, 292 } [i];
++            else
++                *p /= (const unsigned[]) { 288, 289, 292, 289, 288, 289, 292, 289 } [i];
 +        }
-+    }
-+
-     return i;
- }
- 
-@@ -585,12 +612,12 @@ static int qbuf_wait(AVCodecContext * const avctx, V4L2Context * const ctx)
- static int v4l2_receive_frame(AVCodecContext *avctx, AVFrame *frame)
- {
-     V4L2m2mContext *const s = ((V4L2m2mPriv*)avctx->priv_data)->context;
--    int src_rv = NQ_OK;
-+    int src_rv = -1;
-     int dst_rv = 1;  // Non-zero (done), non-negative (error) number
-     unsigned int i = 0;
- 
-     do {
--        const int pending = xlat_pending(&s->xlat);
-+        const int pending = xlat_pending(s);
-         const int prefer_dq = (pending > 4);
-         const int last_src_rv = src_rv;
- 
-@@ -966,8 +993,10 @@ static uint32_t max_coded_size(const AVCodecContext * const avctx)
- }
- 
- static void
--parse_extradata(AVCodecContext *avctx)
-+parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
- {
-+    s->reorder_size = 0;
-+
-     if (!avctx->extradata || !avctx->extradata_size)
-         return;
- 
-@@ -996,6 +1025,7 @@ parse_extradata(AVCodecContext *avctx)
-                     avctx->profile = ff_h264_get_profile(sps);
-                     avctx->level = sps->level_idc;
-                 }
-+                s->reorder_size = sps->num_reorder_frames;
-             }
-             ff_h264_ps_uninit(&ps);
-             break;
-@@ -1025,6 +1055,7 @@ parse_extradata(AVCodecContext *avctx)
-                 if (sps) {
-                     avctx->profile = sps->ptl.general_ptl.profile_idc;
-                     avctx->level   = sps->ptl.general_ptl.level_idc;
-+                    s->reorder_size = sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering;
-                 }
-             }
-             ff_hevc_ps_uninit(&ps);
-@@ -1057,12 +1088,12 @@ static av_cold int v4l2_decode_init(AVCodecContext *avctx)
-         avctx->ticks_per_frame = 2;
-     }
- 
--    parse_extradata(avctx);
--
-     ret = ff_v4l2_m2m_create_context(priv, &s);
-     if (ret < 0)
-         return ret;
- 
-+    parse_extradata(avctx, s);
-+
-     xlat_init(&s->xlat);
-     pts_stats_init(&s->pts_stat, avctx, "decoder");
- 
-
-From 2145b9c9177f0fe9569ce39e2d4eb629caf8bd47 Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Wed, 7 Jun 2023 11:14:52 +0000
-Subject: [PATCH 134/136] v4l2_m2m: Add encode size check
-
-Previously an out of bounds size would fail whilst trying to copy the
-buffer with an unhelpful message. This produces a better error at init
-time.
-
-(cherry picked from commit 0b61c4617e26f043d28d44c8767f7b9fd4882f97)
----
- libavcodec/v4l2_m2m.c | 43 +++++++++++++++++++++++++++++++++++++++++++
- 1 file changed, 43 insertions(+)
-
-diff --git a/libavcodec/v4l2_m2m.c b/libavcodec/v4l2_m2m.c
-index f802687b1b..28d9ed4988 100644
---- a/libavcodec/v4l2_m2m.c
-+++ b/libavcodec/v4l2_m2m.c
-@@ -109,6 +109,44 @@ static int v4l2_prepare_contexts(V4L2m2mContext *s, int probe)
-     return AVERROR(EINVAL);
- }
- 
-+static int check_size(AVCodecContext * const avctx, V4L2m2mContext * const s)
-+{
-+    struct v4l2_format fmt = {.type = s->output.type};
-+    int rv;
-+    uint32_t pixfmt = ff_v4l2_format_avfmt_to_v4l2(avctx->pix_fmt);
-+    unsigned int w;
-+    unsigned int h;
-+
-+    if (V4L2_TYPE_IS_MULTIPLANAR(fmt.type)) {
-+        fmt.fmt.pix_mp.pixelformat = pixfmt;
-+        fmt.fmt.pix_mp.width = avctx->width;
-+        fmt.fmt.pix_mp.height = avctx->height;
-+    }
-+    else {
-+        fmt.fmt.pix.pixelformat = pixfmt;
-+        fmt.fmt.pix.width = avctx->width;
-+        fmt.fmt.pix.height = avctx->height;
-+    }
-+
-+    rv = ioctl(s->fd, VIDIOC_TRY_FMT, &fmt);
-+
-+    if (rv != 0) {
-+        rv = AVERROR(errno);
-+        av_log(avctx, AV_LOG_ERROR, "%s: Tryfmt failed: %s\n", __func__, av_err2str(rv));
-+        return rv;
-+    }
-+
-+    w = ff_v4l2_get_format_width(&fmt);
-+    h = ff_v4l2_get_format_height(&fmt);
-+
-+    if (w < avctx->width || h < avctx->height) {
-+        av_log(avctx, AV_LOG_WARNING, "%s: Size check failed: asked for %dx%d, got: %dx%d\n", __func__, avctx->width, avctx->height, w, h);
-+        return AVERROR(EINVAL);
-+    }
-+
-+    return 0;
 +}
 +
- static int v4l2_probe_driver(V4L2m2mContext *s)
- {
-     void *log_ctx = s->avctx;
-@@ -128,6 +166,11 @@ static int v4l2_probe_driver(V4L2m2mContext *s)
-         goto done;
-     }
- 
-+    // If being given frames (encode) check that V4L2 can cope with the size
-+    if (s->output.av_codec_id == AV_CODEC_ID_RAWVIDEO &&
-+        (ret = check_size(s->avctx, s)) != 0)
-+        goto done;
++static void divide_and_round_nearest(matrix *a, float by) /* in place: every element becomes rintf(element / by) */
++{
++    for (int j = 0; j < a->height; ++j)
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i; /* element at row j, column i; rows are a->width floats apart */
++            *p = rintf(*p / by); /* rintf rounds per the current rounding mode (round-to-nearest unless changed) */
++        }
++}
 +
-     ret = ff_v4l2_context_get_format(&s->capture, 1);
-     if (ret) {
-         av_log(log_ctx, AV_LOG_DEBUG, "v4l2 capture format not supported\n");
-
-From 805985ea191c98885a74dbf994b1ca11551cd81e Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Fri, 9 Jun 2023 10:28:12 +0000
-Subject: [PATCH 135/136] vf_bwdif: Add attributes to ask for vectorization
-
-(cherry picked from commit 281250290ba5c2dcd8676e9a261050e65c10bcb7)
----
- libavfilter/vf_bwdif.c | 29 +++++++++++++++--------------
- 1 file changed, 15 insertions(+), 14 deletions(-)
-
-diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
-index 65c617ebb3..09e68523bb 100644
---- a/libavfilter/vf_bwdif.c
-+++ b/libavfilter/vf_bwdif.c
-@@ -74,10 +74,10 @@ typedef struct ThreadData {
-         int temporal_diff1 =(FFABS(prev[mrefs] - c) + FFABS(prev[prefs] - e)) >> 1; \
-         int temporal_diff2 =(FFABS(next[mrefs] - c) + FFABS(next[prefs] - e)) >> 1; \
-         int diff = FFMAX3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2); \
-- \
-+ {/*\
-         if (!diff) { \
-             dst[0] = d; \
--        } else {
-+        } else {*/
- 
- #define SPAT_CHECK() \
-             int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c; \
-@@ -89,15 +89,16 @@ typedef struct ThreadData {
-             diff = FFMAX3(diff, min, -max);
- 
- #define FILTER_LINE() \
-+            int i1, i2; \
-             SPAT_CHECK() \
--            if (FFABS(c - e) > temporal_diff0) { \
--                interpol = (((coef_hf[0] * (prev2[0] + next2[0]) \
-+            /*if (FFABS(c - e) > temporal_diff0)*/ { \
-+                i1 = (((coef_hf[0] * (prev2[0] + next2[0]) \
-                     - coef_hf[1] * (prev2[mrefs2] + next2[mrefs2] + prev2[prefs2] + next2[prefs2]) \
-                     + coef_hf[2] * (prev2[mrefs4] + next2[mrefs4] + prev2[prefs4] + next2[prefs4])) >> 2) \
-                     + coef_lf[0] * (c + e) - coef_lf[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
--            } else { \
--                interpol = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
--            }
-+            } /*else*/ { \
-+                i2 = (coef_sp[0] * (c + e) - coef_sp[1] * (cur[mrefs3] + cur[prefs3])) >> 13; \
-+            }interpol = FFABS(c - e) > temporal_diff0 ? i1:i2;\
- 
- #define FILTER_EDGE() \
-             if (spat) { \
-@@ -111,7 +112,7 @@ typedef struct ThreadData {
-             else if (interpol < d - diff) \
-                 interpol = d - diff; \
-  \
--            dst[0] = av_clip(interpol, 0, clip_max); \
-+            dst[0] = !diff ? d : av_clip(interpol, 0, clip_max); \
-         } \
-  \
-         dst++; \
-@@ -122,7 +123,7 @@ typedef struct ThreadData {
-         next2++; \
-     }
- 
--static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
-+static void __attribute__((optimize("tree-vectorize"))) filter_intra(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
-                          int prefs3, int mrefs3, int parity, int clip_max)
- {
-     uint8_t *dst = dst1;
-@@ -132,7 +133,7 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
-     FILTER_INTRA()
- }
- 
--static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
-+static void __attribute__((optimize("tree-vectorize"))) filter_line_c(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
-                           int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                           int prefs3, int mrefs3, int prefs4, int mrefs4,
-                           int parity, int clip_max)
-@@ -150,7 +151,7 @@ static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
-     FILTER2()
- }
- 
--static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
-+static void __attribute__((optimize("tree-vectorize"))) filter_edge(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
-                         int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                         int parity, int clip_max, int spat)
- {
-@@ -167,7 +168,7 @@ static void filter_edge(void *dst1, void *prev1, void *cur1, void *next1,
-     FILTER2()
- }
- 
--static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mrefs,
-+static void __attribute__((optimize("tree-vectorize"))) filter_intra_16bit(void *restrict dst1, void *restrict cur1, int w, int prefs, int mrefs,
-                                int prefs3, int mrefs3, int parity, int clip_max)
- {
-     uint16_t *dst = dst1;
-@@ -177,7 +178,7 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre
-     FILTER_INTRA()
- }
- 
--static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
-+static void __attribute__((optimize("tree-vectorize"))) filter_line_c_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
-                                 int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                                 int prefs3, int mrefs3, int prefs4, int mrefs4,
-                                 int parity, int clip_max)
-@@ -195,7 +196,7 @@ static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1
-     FILTER2()
- }
- 
--static void filter_edge_16bit(void *dst1, void *prev1, void *cur1, void *next1,
-+static void __attribute__((optimize("tree-vectorize"))) filter_edge_16bit(void *restrict dst1, void *restrict prev1, void *restrict cur1, void *restrict next1,
-                               int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                               int parity, int clip_max, int spat)
- {
-
-From f4012f09da1c57a0aa5db01f9096992d0c385f7b Mon Sep 17 00:00:00 2001
-From: John Cox <jc@kynesim.co.uk>
-Date: Tue, 13 Jun 2023 13:07:55 +0000
-Subject: [PATCH 136/136] v4l2m2m_dec: Fix h264 reorder size if no sps
- initially
-
-(cherry picked from commit 8832f7924bf47cbca0de251d7b406917f958ebf4)
----
- libavcodec/v4l2_m2m_dec.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/libavcodec/v4l2_m2m_dec.c b/libavcodec/v4l2_m2m_dec.c
-index 13af62e819..11c83b2d66 100644
---- a/libavcodec/v4l2_m2m_dec.c
-+++ b/libavcodec/v4l2_m2m_dec.c
-@@ -1024,8 +1024,8 @@ parse_extradata(AVCodecContext * const avctx, V4L2m2mContext * const s)
-                 if (sps) {
-                     avctx->profile = ff_h264_get_profile(sps);
-                     avctx->level = sps->level_idc;
-+                    s->reorder_size = sps->num_reorder_frames;
-                 }
--                s->reorder_size = sps->num_reorder_frames;
-             }
-             ff_h264_ps_uninit(&ps);
-             break;
++static void tweak(matrix *a) /* in place: add 1 to every element in rows 4 and above */
++{
++    for (int j = 4; j < a->height; ++j) /* rows 0..3 are untouched, so this is a no-op when a->height <= 4 */
++        for (int i = 0; i < a->width; ++i) {
++            float *p = a->d + j * a->width + i; /* element at row j, column i */
++            *p += 1;
++        }
++}
++
++/* The VC-1 spec places restrictions on the values permitted at three
++ * different stages:
++ * - D: the input coefficients in frequency domain
++ * - E: the intermediate coefficients, inverse-transformed only horizontally
++ * - R: the fully inverse-transformed coefficients
++ *
++ * To fully cater for the ranges specified requires various intermediate
++ * values to be held to 17-bit precision; yet these conditions do not appear
++ * to be utilised in real-world streams. At least some assembly
++ * implementations have chosen to restrict these values to 16-bit precision,
++ * to accelerate the decoding of real-world streams at the cost of strict
++ * adherence to the spec. To avoid our test marking these as failures,
++ * reduce our random inputs.
++ */
++#define ATTENUATION 4
++
++static matrix *generate_inverse_quantized_transform_coefficients(size_t width, size_t height) /* returns D; every other matrix built here is freed before returning */
++{
++    matrix *raw, *tmp, *D, *E, *R; /* D, E, R follow the naming in the comment above ATTENUATION */
++    raw = new_matrix(width, height);
++    for (int i = 0; i < width * height; ++i)
++        raw->d[i] = (int) (rnd() % (1024/ATTENUATION)) - 512/ATTENUATION; /* random integer, offset so the range straddles zero */
++    tmp = multiply(height == 8 ? &T8 : &T4, raw); /* left factor chosen by height */
++    D = multiply(tmp, width == 8 ? &T8t : &T4t); /* right factor chosen by width; T8t/T4t presumably the transposes - confirm */
++    normalise(D);
++    divide_and_round_nearest(D, 1); /* by == 1, so this only rounds each entry to an integer value */
++    for (int i = 0; i < width * height; ++i) { /* range check on D */
++        if (D->d[i] < -2048/ATTENUATION || D->d[i] > 2048/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            return generate_inverse_quantized_transform_coefficients(width, height); /* retry by recursion with fresh random data */
++        }
++    }
++    E = multiply(D, width == 8 ? &T8 : &T4); /* E is built from D only to be range-checked */
++    divide_and_round_nearest(E, 8);
++    for (int i = 0; i < width * height; ++i) /* range check on E */
++        if (E->d[i] < -4096/ATTENUATION || E->d[i] > 4096/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            av_free(E);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    R = multiply(height == 8 ? &T8t : &T4t, E); /* R is built from E only to be range-checked */
++    tweak(R); /* adds 1 to rows 4 and above before the division below */
++    divide_and_round_nearest(R, 128);
++    for (int i = 0; i < width * height; ++i) /* range check on R */
++        if (R->d[i] < -512/ATTENUATION || R->d[i] > 512/ATTENUATION-1) {
++            /* Rare, so simply try again */
++            av_free(raw);
++            av_free(tmp);
++            av_free(D);
++            av_free(E);
++            av_free(R);
++            return generate_inverse_quantized_transform_coefficients(width, height);
++        }
++    av_free(raw);
++    av_free(tmp);
++    av_free(E);
++    av_free(R);
++    return D; /* the only matrix not freed here; ownership passes to the caller */
++}
++
++#define RANDOMIZE_BUFFER16(name, size)        \
++    do {                                      \
++        int i;                                \
++        for (i = 0; i < size; ++i) {          \
++            uint16_t r = rnd();               \
++            AV_WN16A(name##0 + i, r);         \
++            AV_WN16A(name##1 + i, r);         \
++        }                                     \
++    } while (0)
++
++#define RANDOMIZE_BUFFER8(name, size)         \
++    do {                                      \
++        int i;                                \
++        for (i = 0; i < size; ++i) {          \
++            uint8_t r = rnd();                \
++            name##0[i] = r;                   \
++            name##1[i] = r;                   \
++        }                                     \
++    } while (0)
++
++#define RANDOMIZE_BUFFER8_MID_WEIGHTED(name, size)  \
++    do {                                            \
++        uint8_t *p##0 = name##0, *p##1 = name##1;   \
++        int i = (size);                             \
++        while (i-- > 0) {                           \
++            int x = 0x80 | (rnd() & 0x7F);          \
++            x >>= rnd() % 9;                        \
++            if (rnd() & 1)                          \
++                x = -x;                             \
++            *p##1++ = *p##0++ = 0x80 + x;           \
++        }                                           \
++    } while (0)
++
++static void check_inv_trans_inplace(void)
++{
++    /* Inverse transform input coefficients are stored in a 16-bit buffer
++     * with row stride of 8 coefficients irrespective of transform size.
++     * vc1_inv_trans_8x8 differs from the others in two ways: coefficients
++     * are stored in column-major order, and the outputs are written back
++     * to the input buffer, so we oversize it slightly to catch overruns. */
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [10 * 8]);
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [10 * 8]);
++
++    VC1DSPContext h;
++
++    ff_vc1dsp_init(&h);
++
++    if (check_func(h.vc1_inv_trans_8x8, "vc1dsp.vc1_inv_trans_8x8")) {
++        matrix *coeffs;
++        declare_func_emms(AV_CPU_FLAG_MMX, void, int16_t *);
++        RANDOMIZE_BUFFER16(inv_trans_in, 10 * 8);
++        coeffs = generate_inverse_quantized_transform_coefficients(8, 8);
++        for (int j = 0; j < 8; ++j)
++            for (int i = 0; i < 8; ++i) {
++                int idx = 8 + i * 8 + j;
++                inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * 8 + i];
++            }
++        call_ref(inv_trans_in0 + 8);
++        call_new(inv_trans_in1 + 8);
++        if (memcmp(inv_trans_in0,  inv_trans_in1,  10 * 8 * sizeof (int16_t)))
++            fail();
++        bench_new(inv_trans_in1 + 8);
++        av_free(coeffs);
++    }
++}
++
++static void check_inv_trans_adding(void)
++{
++    /* Tests the inverse transforms that add their result to a pixel block.
++     * Input: 16-bit coefficients, row stride 8 irrespective of transform size. */
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in0, [8 * 8]);
++    LOCAL_ALIGNED_16(int16_t, inv_trans_in1, [8 * 8]);
++
++    /* For all but vc1_inv_trans_8x8, the inverse transform is narrowed and
++     * added with saturation to an array of unsigned 8-bit values. Oversize
++     * this by 8 samples left and right and one row above and below. */
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out0, [10 * 24]);
++    LOCAL_ALIGNED_8(uint8_t, inv_trans_out1, [10 * 24]);
++
++    VC1DSPContext h;
++
++    const test tests[] = {
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4, 4, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x8_dc, 8, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_8x4_dc, 8, 4)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x8_dc, 4, 8)
++        VC1DSP_SIZED_TEST(vc1_inv_trans_4x4_dc, 4, 4)
++    };
++
++    ff_vc1dsp_init(&h);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(uint8_t *, ptrdiff_t, int16_t *) = *(void **)((intptr_t) &h + tests[t].offset);
++        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++            matrix *coeffs;
++            declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int16_t *);
++            RANDOMIZE_BUFFER16(inv_trans_in, 8 * 8);
++            RANDOMIZE_BUFFER8(inv_trans_out, 10 * 24);
++            coeffs = generate_inverse_quantized_transform_coefficients(tests[t].width, tests[t].height);
++            for (int j = 0; j < tests[t].height; ++j)
++                for (int i = 0; i < tests[t].width; ++i) {
++                    int idx = j * 8 + i; /* destination row stride is always 8 */
++                    inv_trans_in1[idx] = inv_trans_in0[idx] = coeffs->d[j * tests[t].width + i];
++                }
++            call_ref(inv_trans_out0 + 24 + 8, 24, inv_trans_in0); /* skip guard row + 8 guard samples */
++            call_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1);
++            if (memcmp(inv_trans_out0, inv_trans_out1, 10 * 24)) /* guard area is compared too */
++                fail();
++            bench_new(inv_trans_out1 + 24 + 8, 24, inv_trans_in1); /* buffer base: in1 + 8 overruns the 8x8 array */
++            av_free(coeffs);
++        }
++    }
++}
++
++static void check_loop_filter(void)
++{
++    /* Deblocking filter buffers are big enough to hold a 16x16 block,
++     * plus 16 columns left and 4 rows above to hold filter inputs
++     * (depending on whether v or h neighbouring block edge, oversized
++     * horizontally to maintain 16-byte alignment) plus 16 columns and
++     * 4 rows below to catch write overflows */
++    LOCAL_ALIGNED_16(uint8_t, filter_buf0, [24 * 48]);
++    LOCAL_ALIGNED_16(uint8_t, filter_buf1, [24 * 48]);
++
++    VC1DSPContext h;
++
++    const test tests[] = {
++        VC1DSP_TEST(vc1_v_loop_filter4)
++        VC1DSP_TEST(vc1_h_loop_filter4)
++        VC1DSP_TEST(vc1_v_loop_filter8)
++        VC1DSP_TEST(vc1_h_loop_filter8)
++        VC1DSP_TEST(vc1_v_loop_filter16)
++        VC1DSP_TEST(vc1_h_loop_filter16)
++    };
++
++    ff_vc1dsp_init(&h);
++
++    for (size_t t = 0; t < FF_ARRAY_ELEMS(tests); ++t) {
++        void (*func)(uint8_t *, ptrdiff_t, int) = *(void **)((intptr_t) &h + tests[t].offset);
++        declare_func_emms(AV_CPU_FLAG_MMX, void, uint8_t *, ptrdiff_t, int);
++        if (check_func(func, "vc1dsp.%s", tests[t].name)) {
++            for (int count = 1000; count > 0; --count) {
++                int pq = rnd() % 31 + 1;
++                RANDOMIZE_BUFFER8_MID_WEIGHTED(filter_buf, 24 * 48);
++                call_ref(filter_buf0 + 4 * 48 + 16, 48, pq);
++                call_new(filter_buf1 + 4 * 48 + 16, 48, pq);
++                if (memcmp(filter_buf0, filter_buf1, 24 * 48))
++                    fail();
++            }
++        }
++        for (int j = 0; j < 24; ++j)
++            for (int i = 0; i < 48; ++i)
++                filter_buf1[j * 48 + i] = 0x60 + 0x40 * (i >= 16 && j >= 4);
++        if (check_func(func, "vc1dsp.%s_bestcase", tests[t].name))
++            bench_new(filter_buf1 + 4 * 48 + 16, 48, 1);
++        if (check_func(func, "vc1dsp.%s_worstcase", tests[t].name))
++            bench_new(filter_buf1 + 4 * 48 + 16, 48, 31);
++    }
++}
++
++#define TEST_UNESCAPE                                                                               \
++    do {                                                                                            \
++        for (int count = 100; count > 0; --count) {                                                 \
++            escaped_offset = rnd() & 7;                                                             \
++            unescaped_offset = rnd() & 7;                                                           \
++            escaped_len = (1u << (rnd() % 8) + 3) - (rnd() & 7);                                    \
++            RANDOMIZE_BUFFER8(unescaped, UNESCAPE_BUF_SIZE);                                        \
++            len0 = call_ref(escaped0 + escaped_offset, escaped_len, unescaped0 + unescaped_offset); \
++            len1 = call_new(escaped1 + escaped_offset, escaped_len, unescaped1 + unescaped_offset); \
++            if (len0 != len1 || memcmp(unescaped0, unescaped1, UNESCAPE_BUF_SIZE))                  \
++                fail();                                                                             \
++        }                                                                                           \
++    } while (0)
++
++static void check_unescape(void)
++{
++    /* 128 KiB (1 << 17 bytes); this appears to be a typical length of buffer in use */
++#define LOG2_UNESCAPE_BUF_SIZE 17
++#define UNESCAPE_BUF_SIZE (1u<<LOG2_UNESCAPE_BUF_SIZE)
++    LOCAL_ALIGNED_8(uint8_t, escaped0, [UNESCAPE_BUF_SIZE]);
++    LOCAL_ALIGNED_8(uint8_t, escaped1, [UNESCAPE_BUF_SIZE]);
++    LOCAL_ALIGNED_8(uint8_t, unescaped0, [UNESCAPE_BUF_SIZE]);
++    LOCAL_ALIGNED_8(uint8_t, unescaped1, [UNESCAPE_BUF_SIZE]);
++
++    VC1DSPContext h;
++
++    ff_vc1dsp_init(&h);
++
++    if (check_func(h.vc1_unescape_buffer, "vc1dsp.vc1_unescape_buffer")) {
++        int len0, len1, escaped_offset, unescaped_offset, escaped_len;
++        declare_func_emms(AV_CPU_FLAG_MMX, int, const uint8_t *, int, uint8_t *);
++
++        /* Test data which consists of escape sequences packed as tightly as possible (03 00 00 03 00 00 ...) */
++        for (int x = 0; x < UNESCAPE_BUF_SIZE; ++x)
++            escaped1[x] = escaped0[x] = 3 * (x % 3 == 0);
++        TEST_UNESCAPE;
++
++        /* Test random data */
++        RANDOMIZE_BUFFER8(escaped, UNESCAPE_BUF_SIZE);
++        TEST_UNESCAPE;
++
++        /* Test data with escape sequences (00 00 03 followed by a byte in 0..3) at random intervals */
++        for (int x = 0; x <= UNESCAPE_BUF_SIZE - 4;) {
++            int gap, gap_msb;
++            escaped1[x+0] = escaped0[x+0] = 0;
++            escaped1[x+1] = escaped0[x+1] = 0;
++            escaped1[x+2] = escaped0[x+2] = 3;
++            escaped1[x+3] = escaped0[x+3] = rnd() & 3;
++            gap_msb = 2u << (rnd() % 8); /* power of two in the range 2..256 */
++            gap = (rnd() &~ -gap_msb) | gap_msb; /* random value with gap_msb <= gap < 2 * gap_msb */
++            x += gap;
++        }
++        TEST_UNESCAPE;
++
++        /* Test data which is known to contain no escape sequences */
++        memset(escaped0, 0xFF, UNESCAPE_BUF_SIZE);
++        memset(escaped1, 0xFF, UNESCAPE_BUF_SIZE);
++        TEST_UNESCAPE;
++
++        /* Benchmark the no-escape-sequences case */
++        bench_new(escaped1, UNESCAPE_BUF_SIZE, unescaped1);
++    }
++}
++
++void checkasm_check_vc1dsp(void)
++{
++    check_inv_trans_inplace();
++    check_inv_trans_adding();
++    report("inv_trans");
++
++    check_loop_filter();
++    report("loop_filter");
++
++    check_unescape();
++    report("unescape_buffer");
++}
+diff --git a/tests/checkasm/vf_bwdif.c b/tests/checkasm/vf_bwdif.c
+new file mode 100644
+index 0000000000..3399cacdf7
+--- /dev/null
++++ b/tests/checkasm/vf_bwdif.c
+@@ -0,0 +1,256 @@
++/*
++ * This file is part of FFmpeg.
++ *
++ * FFmpeg is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * FFmpeg is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include <string.h>
++#include "checkasm.h"
++#include "libavcodec/internal.h"
++#include "libavfilter/bwdif.h"
++#include "libavutil/mem_internal.h"
++
++#define WIDTH 256
++
++#define randomize_buffers(buf0, buf1, mask, count) \
++    for (size_t i = 0; i < count; i++) \
++        buf0[i] = buf1[i] = rnd() & mask
++
++#define randomize_overflow_check(buf0, buf1, mask, count) \
++    for (size_t i = 0; i < count; i++) \
++        buf0[i] = buf1[i] = (rnd() & 1) != 0 ? mask : 0;
++
++#define BODY(type, depth)                                                      \
++    do {                                                                       \
++        type prev0[9*WIDTH], prev1[9*WIDTH];                                   \
++        type next0[9*WIDTH], next1[9*WIDTH];                                   \
++        type cur0[9*WIDTH], cur1[9*WIDTH];                                     \
++        type dst0[WIDTH], dst1[WIDTH];                                         \
++        const int stride = WIDTH;                                              \
++        const int mask = (1<<depth)-1;                                         \
++                                                                               \
++        declare_func(void, void *dst, void *prev, void *cur, void *next,       \
++                        int w, int prefs, int mrefs, int prefs2, int mrefs2,   \
++                        int prefs3, int mrefs3, int prefs4, int mrefs4,        \
++                        int parity, int clip_max);                             \
++                                                                               \
++        randomize_buffers(prev0, prev1, mask, 9*WIDTH);                        \
++        randomize_buffers(next0, next1, mask, 9*WIDTH);                        \
++        randomize_buffers( cur0,  cur1, mask, 9*WIDTH);                        \
++                                                                               \
++        call_ref(dst0, prev0 + 4*WIDTH, cur0 + 4*WIDTH, next0 + 4*WIDTH,       \
++                WIDTH, stride, -stride, 2*stride, -2*stride,                   \
++                3*stride, -3*stride, 4*stride, -4*stride,                      \
++                0, mask);                                                      \
++        call_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH,       \
++                WIDTH, stride, -stride, 2*stride, -2*stride,                   \
++                3*stride, -3*stride, 4*stride, -4*stride,                      \
++                0, mask);                                                      \
++                                                                               \
++        if (memcmp(dst0, dst1, sizeof dst0)                                    \
++                || memcmp(prev0, prev1, sizeof prev0)                          \
++                || memcmp(next0, next1, sizeof next0)                          \
++                || memcmp( cur0,  cur1, sizeof cur0))                          \
++            fail();                                                            \
++        bench_new(dst1, prev1 + 4*WIDTH, cur1 + 4*WIDTH, next1 + 4*WIDTH,      \
++                WIDTH, stride, -stride, 2*stride, -2*stride,                   \
++                3*stride, -3*stride, 4*stride, -4*stride,                      \
++                0, mask);                                                      \
++    } while (0)
++
++void checkasm_check_vf_bwdif(void)
++{
++    BWDIFContext ctx_8, ctx_10;
++
++    ff_bwdif_init_filter_line(&ctx_8, 8);
++    ff_bwdif_init_filter_line(&ctx_10, 10);
++
++    if (check_func(ctx_8.filter_line, "bwdif8")) {
++        BODY(uint8_t, 8);
++        report("bwdif8");
++    }
++
++    if (check_func(ctx_10.filter_line, "bwdif10")) {
++        BODY(uint16_t, 10);
++        report("bwdif10");
++    }
++
++    if (!ctx_8.filter_line3)
++        ctx_8.filter_line3 = ff_bwdif_filter_line3_c;
++
++    {
++        LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur0,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur1,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, dst0,  [WIDTH*3]);
++        LOCAL_ALIGNED_16(uint8_t, dst1,  [WIDTH*3]);
++        const int stride = WIDTH;
++        const int mask = (1<<8)-1;
++        int parity;
++
++        for (parity = 0; parity != 2; ++parity) {
++            if (check_func(ctx_8.filter_line3, "bwdif8.line3.rnd.p%d", parity)) {
++
++                declare_func(void, void * dst1, int d_stride,
++                                          const void * prev1, const void * cur1, const void * next1, int prefs,
++                                          int w, int parity, int clip_max);
++
++                randomize_buffers(prev0, prev1, mask, 11*WIDTH);
++                randomize_buffers(next0, next1, mask, 11*WIDTH);
++                randomize_buffers( cur0,  cur1, mask, 11*WIDTH);
++
++                call_ref(dst0, stride,
++                         prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
++                         WIDTH, parity, mask);
++                call_new(dst1, stride,
++                         prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
++                         WIDTH, parity, mask);
++
++                if (memcmp(dst0, dst1, WIDTH*3)
++                        || memcmp(prev0, prev1, WIDTH*11)
++                        || memcmp(next0, next1, WIDTH*11)
++                        || memcmp( cur0,  cur1, WIDTH*11))
++                    fail();
++
++                bench_new(dst1, stride,
++                         prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
++                         WIDTH, parity, mask);
++            }
++        }
++
++        // Use just 0s and ~0s to try to provoke bad cropping or overflow
++        // Parity makes no difference to this test so just test 0
++        if (check_func(ctx_8.filter_line3, "bwdif8.line3.overflow")) {
++
++            declare_func(void, void * dst1, int d_stride,
++                                      const void * prev1, const void * cur1, const void * next1, int prefs,
++                                      int w, int parity, int clip_max);
++
++            randomize_overflow_check(prev0, prev1, mask, 11*WIDTH);
++            randomize_overflow_check(next0, next1, mask, 11*WIDTH);
++            randomize_overflow_check( cur0,  cur1, mask, 11*WIDTH);
++
++            call_ref(dst0, stride,
++                     prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, stride,
++                     WIDTH, 0, mask);
++            call_new(dst1, stride,
++                     prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, stride,
++                     WIDTH, 0, mask);
++
++            if (memcmp(dst0, dst1, WIDTH*3)
++                    || memcmp(prev0, prev1, WIDTH*11)
++                    || memcmp(next0, next1, WIDTH*11)
++                    || memcmp( cur0,  cur1, WIDTH*11))
++                fail();
++
++            // No point to benching
++        }
++
++        report("bwdif8.line3");
++    }
++
++    {
++        LOCAL_ALIGNED_16(uint8_t, prev0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, prev1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next0, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, next1, [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur0,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur1,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, dst0,  [WIDTH*3]);
++        LOCAL_ALIGNED_16(uint8_t, dst1,  [WIDTH*3]);
++        const int stride = WIDTH;
++        const int mask = (1<<8)-1;
++        int spat;
++        int parity;
++
++        for (spat = 0; spat != 2; ++spat) {
++            for (parity = 0; parity != 2; ++parity) {
++                if (check_func(ctx_8.filter_edge, "bwdif8.edge.s%d.p%d", spat, parity)) {
++
++                    declare_func(void, void *dst1, void *prev1, void *cur1, void *next1,
++                                            int w, int prefs, int mrefs, int prefs2, int mrefs2,
++                                            int parity, int clip_max, int spat);
++
++                    randomize_buffers(prev0, prev1, mask, 11*WIDTH);
++                    randomize_buffers(next0, next1, mask, 11*WIDTH);
++                    randomize_buffers( cur0,  cur1, mask, 11*WIDTH);
++                    memset(dst0, 0xba, WIDTH * 3);
++                    memset(dst1, 0xba, WIDTH * 3);
++
++                    call_ref(dst0 + stride,
++                             prev0 + stride * 4, cur0 + stride * 4, next0 + stride * 4, WIDTH,
++                             stride, -stride, stride * 2, -stride * 2,
++                             parity, mask, spat);
++                    call_new(dst1 + stride,
++                             prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
++                             stride, -stride, stride * 2, -stride * 2,
++                             parity, mask, spat);
++
++                    if (memcmp(dst0, dst1, WIDTH*3)
++                            || memcmp(prev0, prev1, WIDTH*11)
++                            || memcmp(next0, next1, WIDTH*11)
++                            || memcmp( cur0,  cur1, WIDTH*11))
++                        fail();
++
++                    bench_new(dst1 + stride,
++                             prev1 + stride * 4, cur1 + stride * 4, next1 + stride * 4, WIDTH,
++                             stride, -stride, stride * 2, -stride * 2,
++                             parity, mask, spat);
++                }
++            }
++        }
++
++        report("bwdif8.edge");
++    }
++
++    if (check_func(ctx_8.filter_intra, "bwdif8.intra")) { /* 8-bit intra filter vs. reference */
++        LOCAL_ALIGNED_16(uint8_t, cur0,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, cur1,  [11*WIDTH]);
++        LOCAL_ALIGNED_16(uint8_t, dst0,  [WIDTH*3]);
++        LOCAL_ALIGNED_16(uint8_t, dst1,  [WIDTH*3]);
++        const int stride = WIDTH;
++        const int mask = (1<<8)-1; /* 8-bit sample range, also passed as clip_max */
++
++        declare_func(void, void *dst1, void *cur1, int w, int prefs, int mrefs,
++                     int prefs3, int mrefs3, int parity, int clip_max);
++
++        randomize_buffers( cur0,  cur1, mask, 11*WIDTH);
++        memset(dst0, 0xba, WIDTH * 3); /* sentinel: only the middle row is handed to the filter */
++        memset(dst1, 0xba, WIDTH * 3);
++
++        call_ref(dst0 + stride,
++                 cur0 + stride * 4, WIDTH,
++                 stride, -stride, stride * 3, -stride * 3,
++                 0, mask);
++        call_new(dst1 + stride,
++                 cur1 + stride * 4, WIDTH, /* own input copy, like the other tests in this file */
++                 stride, -stride, stride * 3, -stride * 3,
++                 0, mask);
++
++        if (memcmp(dst0, dst1, WIDTH*3) /* all three rows, so stray writes are caught */
++                || memcmp( cur0,  cur1, WIDTH*11))
++            fail();
++
++        bench_new(dst1 + stride,
++                  cur1 + stride * 4, WIDTH,
++                  stride, -stride, stride * 3, -stride * 3,
++                  0, mask);
++
++        report("bwdif8.intra");
++    }
++}
+diff --git a/tests/fate/checkasm.mak b/tests/fate/checkasm.mak
+index 07f1d8238e..723c2b26ef 100644
+--- a/tests/fate/checkasm.mak
++++ b/tests/fate/checkasm.mak
+@@ -16,18 +16,22 @@ FATE_CHECKASM = fate-checkasm-aacpsdsp                                  \
+                 fate-checkasm-hevc_add_res                              \
+                 fate-checkasm-hevc_idct                                 \
+                 fate-checkasm-hevc_sao                                  \
++                fate-checkasm-idctdsp                                   \
+                 fate-checkasm-jpeg2000dsp                               \
+                 fate-checkasm-llviddsp                                  \
+                 fate-checkasm-llviddspenc                               \
+                 fate-checkasm-opusdsp                                   \
+                 fate-checkasm-pixblockdsp                               \
++                fate-checkasm-rpi_sand                                  \
+                 fate-checkasm-sbrdsp                                    \
+                 fate-checkasm-synth_filter                              \
+                 fate-checkasm-sw_rgb                                    \
+                 fate-checkasm-sw_scale                                  \
+                 fate-checkasm-v210dec                                   \
+                 fate-checkasm-v210enc                                   \
++                fate-checkasm-vc1dsp                                    \
+                 fate-checkasm-vf_blend                                  \
++                fate-checkasm-vf_bwdif                                  \
+                 fate-checkasm-vf_colorspace                             \
+                 fate-checkasm-vf_eq                                     \
+                 fate-checkasm-vf_gblur                                  \
diff --git a/packages/network/samba/package.mk b/packages/network/samba/package.mk
index fe617a9a6a..1b4c5881c6 100644
--- a/packages/network/samba/package.mk
+++ b/packages/network/samba/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="samba"
-PKG_VERSION="4.17.10"
-PKG_SHA256="00dbb0aac9f4cfee800f708f4e9098964a43a7646e618230706d532c9bb6c350"
+PKG_VERSION="4.17.12"
+PKG_SHA256="6129a7f967b822b308c57152326e7302711c2d0b9830a82d0c21832b648741f4"
 PKG_LICENSE="GPLv3+"
 PKG_SITE="https://www.samba.org"
 PKG_URL="https://download.samba.org/pub/samba/stable/${PKG_NAME}-${PKG_VERSION}.tar.gz"
@@ -173,7 +173,6 @@ post_makeinstall_target() {
 
   mkdir -p ${INSTALL}/usr/lib/samba
     cp ${PKG_DIR}/scripts/samba-config ${INSTALL}/usr/lib/samba
-    cp ${PKG_DIR}/scripts/smbd-config ${INSTALL}/usr/lib/samba
     cp ${PKG_DIR}/scripts/samba-autoshare ${INSTALL}/usr/lib/samba
 
   if find_file_path config/smb.conf; then
diff --git a/packages/network/samba/scripts/samba-config b/packages/network/samba/scripts/samba-config
index 927010695b..e3fd682823 100755
--- a/packages/network/samba/scripts/samba-config
+++ b/packages/network/samba/scripts/samba-config
@@ -2,40 +2,120 @@
 
 # SPDX-License-Identifier: GPL-2.0-or-later
 # Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
+# Copyright (C) 2020-present Team LibreELEC (https://libreelec.tv)
 
 SMB_USERCONF="/storage/.config/samba.conf"
 SMB_DEFCONF="/etc/samba/smb.conf"
 SMB_CONF="/run/samba/smb.conf"
 
+SMB_DIR=$(dirname ${SMB_CONF})
+mkdir -p ${SMB_DIR}
+
+# exclusive access
+SMB_LOCK="${SMB_DIR}/samba-config.lock"
+exec 200>"${SMB_LOCK}"
+flock 200
+
+SMB_TMP=$(mktemp -p ${SMB_DIR})
+
 SMB_USERCONF_IS_VALID=no
 SMB_CONFIG_VERSION=4
 
 # If user config is based on legacy OpenELEC, or old version (or no version)
 # then don't use it, and log a warning.
-if [ -f $SMB_USERCONF ]; then
-  SMB_IS_LEGACY="$(awk 'NR <= 2 && /This file is part of OpenELEC/{ print }' $SMB_USERCONF)"
-  SMB_THIS_VER="$(awk '/^# samba.conf v[0-9\.]*/{ print substr($3,2); exit }' $SMB_USERCONF)"
+if [ -f ${SMB_USERCONF} ]; then
+  SMB_IS_LEGACY="$(awk 'NR <= 2 && /This file is part of OpenELEC/{ print }' ${SMB_USERCONF})"
+  SMB_THIS_VER="$(awk '/^# samba.conf v[0-9\.]*/{ print substr($3,2); exit }' ${SMB_USERCONF})"
   if [ -n "${SMB_IS_LEGACY}" ]; then
-    echo "WARNING: Ignoring user config $SMB_USERCONF due to incompatibility [Old style OpenELEC]"
+    echo "WARNING: Ignoring user config ${SMB_USERCONF} due to incompatibility [Old style OpenELEC]"
   elif [ -z "${SMB_THIS_VER}" ]; then
-    echo "WARNING: Ignoring user config $SMB_USERCONF due to incompatibility [version is unknown or invalid]"
+    echo "WARNING: Ignoring user config ${SMB_USERCONF} due to incompatibility [version is unknown or invalid]"
   elif [ ${SMB_THIS_VER} !=  ${SMB_CONFIG_VERSION} ]; then
-    echo "WARNING: Ignoring user config $SMB_USERCONF due to incompatibility [version ${SMB_THIS_VER} is not the required version $SMB_CONFIG_VERSION]"
+    echo "WARNING: Ignoring user config ${SMB_USERCONF} due to incompatibility [version ${SMB_THIS_VER} is not the required version ${SMB_CONFIG_VERSION}]"
   else
     SMB_USERCONF_IS_VALID=yes
   fi
 fi
 
-mkdir -p $(dirname $SMB_CONF)
-  if [ $SMB_USERCONF_IS_VALID = yes ]; then
-    cp $SMB_USERCONF $SMB_CONF
-  else
-    cp $SMB_DEFCONF $SMB_CONF
-  fi
-
-# Generate smb.conf, unless disabled
-if [ ! -f /storage/.cache/services/samba.disabled ]; then
-  /usr/lib/samba/smbd-config
+if [ ${SMB_USERCONF_IS_VALID} = yes ]; then
+  cp ${SMB_USERCONF} ${SMB_TMP}
+else
+  cp ${SMB_DEFCONF} ${SMB_TMP}
 fi
 
+echo >>${SMB_TMP}
+
+if [ ! -f /storage/.cache/services/samba.disabled ]; then
+
+  ### Generate smb.conf
+
+  if [ ! -f /storage/.cache/services/samba.conf ]; then
+    cp /usr/share/services/samba.conf /storage/.cache/services
+  fi
+
+  # Specify defaults here, in case these new properties not yet added in .cache
+  SAMBA_WORKGROUP=WORKGROUP
+  SAMBA_MINPROTOCOL=SMB2
+  SAMBA_MAXPROTOCOL=SMB3
+
+  . /storage/.cache/services/samba.conf
+
+  # fixup synonyms
+  sed -i 's/browsable/browseable/g; s/writable/writeable/g' ${SMB_TMP}
+
+  # handle external drives
+  if [ "${SAMBA_AUTOSHARE}" = "true" ] ; then
+    for dir in /media/* ; do
+      if [ -d "$dir" ] ; then
+        name=$(basename "$dir")
+        echo -e "[$name]\n  path = $dir\n  available = yes\n  browseable = yes\n  public = yes\n  writeable = yes\n" >> ${SMB_TMP}
+      fi
+    done
+  fi
+
+  # Allow access to a "failed" (safe mode) Kodi installation
+  if [ -d /storage/.kodi.FAILED ]; then
+    echo -e "[Kodi-Failed]\n  path = /storage/.kodi.FAILED\n  available = yes\n  browseable = yes\n  public = yes\n  writeable = yes\n" >> ${SMB_TMP}
+  fi
+
+  ADD_CONFIG=
+
+  # If workgroup is not set, don't set it - who knows, user may know better.
+  if [ -n "$SAMBA_WORKGROUP" ]; then
+    # Remove any existing workgroup setting
+    sed -E '/^[[:space:]]*workgroup[[:space:]]*=/d' -i ${SMB_TMP}
+    ADD_CONFIG="${ADD_CONFIG}  workgroup = ${SAMBA_WORKGROUP:-WORKGROUP}\n"
+  fi
+
+  ADD_CONFIG="${ADD_CONFIG}  server min protocol = ${SAMBA_MINPROTOCOL/SMB1/NT1}\n"
+  ADD_CONFIG="${ADD_CONFIG}  server max protocol = ${SAMBA_MAXPROTOCOL/SMB1/NT1}\n"
+
+  # Add extra config after [global], escaping spaces so that all are retained by sed
+  sed -e "/\[global\]/ a ${ADD_CONFIG// /\\ }" -i ${SMB_TMP}
+
+  if [ "${SAMBA_SECURE}" = "true" ] && [ -n "${SAMBA_USERNAME}" ] && [ -n "${SAMBA_PASSWORD}" ]; then
+    # Secure mode (separate tests: "-a" with user-supplied operands is unspecified in POSIX test).
+    # username map: first line makes sure plain root does not work all the time;
+    # processing continues, so if user chooses root as username, second line overrides the first.
+    # This is done always in case user uses passwords in userconf (many thanks to viljoviitanen).
+    sed -e 's|^.[ \t]*.public.=.*|  public = no |' \
+        -e 's|^.[ \t]*.username map.=.*||' \
+        -e 's|^.[ \t]*.security.=.*|  security = user\n  username map = /run/samba/samba.map|' \
+        -e 's|^.[ \t]*.map.to.guest.=.*|  map to guest = Never|' \
+        -i ${SMB_TMP}
+
+    printf "%s\n%s" "${SAMBA_PASSWORD}" "${SAMBA_PASSWORD}" | smbpasswd -c ${SMB_TMP} -s -a root
+    printf 'nobody = root\nroot = "%s"\n' "${SAMBA_USERNAME}" > /run/samba/samba.map
+
+  else # guest mode: shares are forced public and unknown users are mapped to guest
+    sed -e 's|^.[ \t]*.public.=.*|  public = yes |' \
+        -e 's|^.[ \t]*.username map.=.*||' \
+        -e 's|^.[ \t]*.security.=.*|  security = user|' \
+        -e 's|^.[ \t]*.map.to.guest.=.*|  map to guest = Bad User|' \
+        -i ${SMB_TMP}
+  fi
+fi
+
+mv -f ${SMB_TMP} ${SMB_CONF}
+
 exit 0
diff --git a/packages/network/samba/scripts/smbd-config b/packages/network/samba/scripts/smbd-config
deleted file mode 100755
index aed4730a3a..0000000000
--- a/packages/network/samba/scripts/smbd-config
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/bin/sh
-
-# SPDX-License-Identifier: GPL-2.0-or-later
-# Copyright (C) 2009-2017 Stephan Raue (stephan@openelec.tv)
-# Copyright (C) 2020-present Team LibreELEC (https://libreelec.tv)
-
-SMB_CONF="/run/samba/smb.conf"
-SMB_TMP="$(mktemp -p /run/samba)"
-
-cp -f $SMB_CONF $SMB_TMP
-
-if [ ! -f /storage/.cache/services/samba.conf ]; then
-  cp /usr/share/services/samba.conf /storage/.cache/services
-fi
-
-# Specify defaults here, in case these new properties not yet added in .cache
-SAMBA_WORKGROUP=WORKGROUP
-SAMBA_MINPROTOCOL=SMB2
-SAMBA_MAXPROTOCOL=SMB3
-
-. /storage/.cache/services/samba.conf
-
-# fixup synonyms
-sed -i 's/browsable/browseable/g; s/writable/writeable/g' $SMB_TMP
-
-# handle external drives
-if [ "$SAMBA_AUTOSHARE" == "true" ] ; then
-  for dir in /media/* ; do
-    if [ -d "$dir" ] ; then
-      name=$(basename "$dir")
-      echo -e "[$name]\n  path = $dir\n  available = yes\n  browseable = yes\n  public = yes\n  writeable = yes\n" >> $SMB_TMP
-    fi
-  done
-fi
-
-# Allow access to a "failed" (safe mode) Kodi installation
-if [ -d /storage/.kodi.FAILED ]; then
-  echo -e "[Kodi-Failed]\n  path = /storage/.kodi.FAILED\n  available = yes\n  browseable = yes\n  public = yes\n  writeable = yes\n" >> $SMB_TMP
-fi
-
-ADD_CONFIG=
-
-# If workgroup is not set, don't set it - who knows, user may know better.
-if [ -n "$SAMBA_WORKGROUP" ]; then
-  # Remove any existing workgroup setting
-  sed -E '/^[[:space:]]*workgroup[[:space:]]*=/d' -i $SMB_TMP
-  ADD_CONFIG="${ADD_CONFIG}  workgroup = ${SAMBA_WORKGROUP:-WORKGROUP}\n"
-fi
-
-ADD_CONFIG="${ADD_CONFIG}  server min protocol = ${SAMBA_MINPROTOCOL/SMB1/NT1}\n"
-ADD_CONFIG="${ADD_CONFIG}  server max protocol = ${SAMBA_MAXPROTOCOL/SMB1/NT1}\n"
-
-# Add extra config after [global], escaping spaces so that all are retained by sed
-sed -e "/\[global\]/ a ${ADD_CONFIG// /\\ }" -i $SMB_TMP
-
-if [ "$SAMBA_SECURE" == "true" -a ! "$SAMBA_USERNAME" == "" -a ! "$SAMBA_PASSWORD" == "" ] ; then
-  # username map: first line makes sure plain root does not work all the time
-  # processing continues, so if user chooses root as username, second line overrides the first
-  # this is done always in case user uses passwords in userconf.
-  # many thanks to viljoviitanen for this
-  printf "%s\n%s" "$SAMBA_PASSWORD" "$SAMBA_PASSWORD" | smbpasswd -s -a root >/dev/null 2>&1
-  printf "nobody = root\nroot = %s" "$SAMBA_USERNAME" > /run/samba/samba.map
-
-  sed -e 's|^.[ \t]*.public.=.*|  public = no |' \
-      -e 's|^.[ \t]*.username map.=.*||' \
-      -e 's|^.[ \t]*.security.=.*|  security = user\n  username map = /run/samba/samba.map|' \
-      -e 's|^.[ \t]*.map.to.guest.=.*|  map to guest = Never|' \
-      -i $SMB_TMP
-else
-  sed -e 's|^.[ \t]*.public.=.*|  public = yes |' \
-      -e 's|^.[ \t]*.username map.=.*||' \
-      -e 's|^.[ \t]*.security.=.*|  security = user|' \
-      -e 's|^.[ \t]*.map.to.guest.=.*|  map to guest = Bad User|' \
-      -i $SMB_TMP
-fi
-
-mv -f $SMB_TMP $SMB_CONF
diff --git a/packages/network/wireless-regdb/package.mk b/packages/network/wireless-regdb/package.mk
index fa3a7a9d62..27860afd03 100644
--- a/packages/network/wireless-regdb/package.mk
+++ b/packages/network/wireless-regdb/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2018-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="wireless-regdb"
-PKG_VERSION="2023.05.03"
-PKG_SHA256="f254d08ab3765aeae2b856222e11a95d44aef519a6663877c71ef68fae4c8c12"
+PKG_VERSION="2023.09.01"
+PKG_SHA256="26d4c2a727cc59239b84735aad856b7c7d0b04e30aa5c235c4f7f47f5f053491"
 PKG_LICENSE="GPL"
 PKG_SITE="https://wireless.wiki.kernel.org/en/developers/regulatory/wireless-regdb"
 PKG_URL="https://www.kernel.org/pub/software/network/${PKG_NAME}/${PKG_NAME}-${PKG_VERSION}.tar.xz"
diff --git a/packages/security/openssl/cert/cacert.pem b/packages/security/openssl/cert/cacert.pem
index ab8c20fa53..a1617cb833 100644
--- a/packages/security/openssl/cert/cacert.pem
+++ b/packages/security/openssl/cert/cacert.pem
@@ -1,7 +1,7 @@
 ##
 ## Bundle of CA Root Certificates
 ##
-## Certificate data from Mozilla as of: Fri Jul 21 14:36:19 2023 GMT
+## Certificate data from Mozilla as of: Sat Nov 18 22:59:13 2023 GMT
 ##
 ## This is a bundle of X.509 certificates of public Certificate Authorities
 ## (CA). These were automatically extracted from Mozilla's root certificates
@@ -14,7 +14,7 @@
 ## Just configure this file as the SSLCACertificateFile.
 ##
 ## Conversion done with mk-ca-bundle.pl version 1.29.
-## SHA256: 0ff137babc6a5561a9cfbe9f29558972e5b528202681b7d3803d03a3e82922bd
+## SHA256: 1970dd65858925d68498d2356aea6d03f764422523c5887deca8ce3ba9e1f845
 ##
 
 
@@ -200,27 +200,6 @@ vGJHvOB0K7Lrfb5BG7XARsWhIstfTsEokt4YutUqKLsRixeTmJlglFwjz1onl14LBQaTNx47aTbr
 qZ5hHY8y2o4M1nQ+ewkk2gF3R8Q7zTSMmfXK4SVhM7JZG+Ju1zdXtg2pEto=
 -----END CERTIFICATE-----
 
-Security Communication Root CA
-==============================
------BEGIN CERTIFICATE-----
-MIIDWjCCAkKgAwIBAgIBADANBgkqhkiG9w0BAQUFADBQMQswCQYDVQQGEwJKUDEYMBYGA1UEChMP
-U0VDT00gVHJ1c3QubmV0MScwJQYDVQQLEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTEw
-HhcNMDMwOTMwMDQyMDQ5WhcNMjMwOTMwMDQyMDQ5WjBQMQswCQYDVQQGEwJKUDEYMBYGA1UEChMP
-U0VDT00gVHJ1c3QubmV0MScwJQYDVQQLEx5TZWN1cml0eSBDb21tdW5pY2F0aW9uIFJvb3RDQTEw
-ggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQCzs/5/022x7xZ8V6UMbXaKL0u/ZPtM7orw
-8yl89f/uKuDp6bpbZCKamm8sOiZpUQWZJtzVHGpxxpp9Hp3dfGzGjGdnSj74cbAZJ6kJDKaVv0uM
-DPpVmDvY6CKhS3E4eayXkmmziX7qIWgGmBSWh9JhNrxtJ1aeV+7AwFb9Ms+k2Y7CI9eNqPPYJayX
-5HA49LY6tJ07lyZDo6G8SVlyTCMwhwFY9k6+HGhWZq/NQV3Is00qVUarH9oe4kA92819uZKAnDfd
-DJZkndwi92SL32HeFZRSFaB9UslLqCHJxrHty8OVYNEP8Ktw+N/LTX7s1vqr2b1/VPKl6Xn62dZ2
-JChzAgMBAAGjPzA9MB0GA1UdDgQWBBSgc0mZaNyFW2XjmygvV5+9M7wHSDALBgNVHQ8EBAMCAQYw
-DwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQUFAAOCAQEAaECpqLvkT115swW1F7NgE+vGkl3g
-0dNq/vu+m22/xwVtWSDEHPC32oRYAmP6SBbvT6UL90qY8j+eG61Ha2POCEfrUj94nK9NrvjVT8+a
-mCoQQTlSxN3Zmw7vkwGusi7KaEIkQmywszo+zenaSMQVy+n5Bw+SUEmK3TGXX8npN6o7WWWXlDLJ
-s58+OmJYxUmtYg5xpTKqL8aJdkNAExNnPaJUJRDL8Try2frbSVa7pv6nQTXD4IhhyYjH3zYQIphZ
-6rBK+1YWc26sTfcioU+tHXotRSflMMFe8toTyyVCUZVHA4xsIcx0Qu1T/zOLjw9XARYvz6buyXAi
-FL39vmwLAw==
------END CERTIFICATE-----
-
 XRamp Global CA Root
 ====================
 -----BEGIN CERTIFICATE-----
@@ -669,39 +648,6 @@ YIvDQVETI53O9zJrlAGomecsMx86OyXShkDOOyyGeMlhLxS67ttVb9+E7gUJTb0o2HLO02JQZR7r
 kpeDMdmztcpHWD9f
 -----END CERTIFICATE-----
 
-Autoridad de Certificacion Firmaprofesional CIF A62634068
-=========================================================
------BEGIN CERTIFICATE-----
-MIIGFDCCA/ygAwIBAgIIU+w77vuySF8wDQYJKoZIhvcNAQEFBQAwUTELMAkGA1UEBhMCRVMxQjBA
-BgNVBAMMOUF1dG9yaWRhZCBkZSBDZXJ0aWZpY2FjaW9uIEZpcm1hcHJvZmVzaW9uYWwgQ0lGIEE2
-MjYzNDA2ODAeFw0wOTA1MjAwODM4MTVaFw0zMDEyMzEwODM4MTVaMFExCzAJBgNVBAYTAkVTMUIw
-QAYDVQQDDDlBdXRvcmlkYWQgZGUgQ2VydGlmaWNhY2lvbiBGaXJtYXByb2Zlc2lvbmFsIENJRiBB
-NjI2MzQwNjgwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDKlmuO6vj78aI14H9M2uDD
-Utd9thDIAl6zQyrET2qyyhxdKJp4ERppWVevtSBC5IsP5t9bpgOSL/UR5GLXMnE42QQMcas9UX4P
-B99jBVzpv5RvwSmCwLTaUbDBPLutN0pcyvFLNg4kq7/DhHf9qFD0sefGL9ItWY16Ck6WaVICqjaY
-7Pz6FIMMNx/Jkjd/14Et5cS54D40/mf0PmbR0/RAz15iNA9wBj4gGFrO93IbJWyTdBSTo3OxDqqH
-ECNZXyAFGUftaI6SEspd/NYrspI8IM/hX68gvqB2f3bl7BqGYTM+53u0P6APjqK5am+5hyZvQWyI
-plD9amML9ZMWGxmPsu2bm8mQ9QEM3xk9Dz44I8kvjwzRAv4bVdZO0I08r0+k8/6vKtMFnXkIoctX
-MbScyJCyZ/QYFpM6/EfY0XiWMR+6KwxfXZmtY4laJCB22N/9q06mIqqdXuYnin1oKaPnirjaEbsX
-LZmdEyRG98Xi2J+Of8ePdG1asuhy9azuJBCtLxTa/y2aRnFHvkLfuwHb9H/TKI8xWVvTyQKmtFLK
-bpf7Q8UIJm+K9Lv9nyiqDdVF8xM6HdjAeI9BZzwelGSuewvF6NkBiDkal4ZkQdU7hwxu+g/GvUgU
-vzlN1J5Bto+WHWOWk9mVBngxaJ43BjuAiUVhOSPHG0SjFeUc+JIwuwIDAQABo4HvMIHsMBIGA1Ud
-EwEB/wQIMAYBAf8CAQEwDgYDVR0PAQH/BAQDAgEGMB0GA1UdDgQWBBRlzeurNR4APn7VdMActHNH
-DhpkLzCBpgYDVR0gBIGeMIGbMIGYBgRVHSAAMIGPMC8GCCsGAQUFBwIBFiNodHRwOi8vd3d3LmZp
-cm1hcHJvZmVzaW9uYWwuY29tL2NwczBcBggrBgEFBQcCAjBQHk4AUABhAHMAZQBvACAAZABlACAA
-bABhACAAQgBvAG4AYQBuAG8AdgBhACAANAA3ACAAQgBhAHIAYwBlAGwAbwBuAGEAIAAwADgAMAAx
-ADcwDQYJKoZIhvcNAQEFBQADggIBABd9oPm03cXF661LJLWhAqvdpYhKsg9VSytXjDvlMd3+xDLx
-51tkljYyGOylMnfX40S2wBEqgLk9am58m9Ot/MPWo+ZkKXzR4Tgegiv/J2Wv+xYVxC5xhOW1//qk
-R71kMrv2JYSiJ0L1ILDCExARzRAVukKQKtJE4ZYm6zFIEv0q2skGz3QeqUvVhyj5eTSSPi5E6PaP
-T481PyWzOdxjKpBrIF/EUhJOlywqrJ2X3kjyo2bbwtKDlaZmp54lD+kLM5FlClrD2VQS3a/DTg4f
-Jl4N3LON7NWBcN7STyQF82xO9UxJZo3R/9ILJUFI/lGExkKvgATP0H5kSeTy36LssUzAKh3ntLFl
-osS88Zj0qnAHY7S42jtM+kAiMFsRpvAFDsYCA0irhpuF3dvd6qJ2gHN99ZwExEWN57kci57q13XR
-crHedUTnQn3iV2t93Jm8PYMo6oCTjcVMZcFwgbg4/EMxsvYDNEeyrPsiBsse3RdHHF9mudMaotoR
-saS8I8nkvof/uZS2+F0gStRf571oe2XyFR7SOqkt6dhrJKyXWERHrVkY8SFlcN7ONGCoQPHzPKTD
-KCOM/iczQ0CgFzzr6juwcqajuUpLXhZI9LK8yIySxZ2frHI2vDSANGupi5LAuBft7HZT9SQBjLMi
-6Et8Vcad+qMUu2WFbm5PEn4KPJ2V
------END CERTIFICATE-----
-
 Izenpe.com
 ==========
 -----BEGIN CERTIFICATE-----
@@ -3449,3 +3395,140 @@ TFsR0PXNor6uzFFcw9VUewyu1rkGd4Di7wcaaMxZUa1+XGdrudviB0JbuAEFWDlN5LuYo7Ey7Nmj
 PqYO5Wue/9vsL3SD3460s6neFE3/MaNFcyT6lSnMEpcEoji2jbDwN/zIIX8/syQbPYtuzE2wFg2W
 HYMfRsCbvUOZ58SWLs5fyQ==
 -----END CERTIFICATE-----
+
+TrustAsia Global Root CA G3
+===========================
+-----BEGIN CERTIFICATE-----
+MIIFpTCCA42gAwIBAgIUZPYOZXdhaqs7tOqFhLuxibhxkw8wDQYJKoZIhvcNAQEMBQAwWjELMAkG
+A1UEBhMCQ04xJTAjBgNVBAoMHFRydXN0QXNpYSBUZWNobm9sb2dpZXMsIEluYy4xJDAiBgNVBAMM
+G1RydXN0QXNpYSBHbG9iYWwgUm9vdCBDQSBHMzAeFw0yMTA1MjAwMjEwMTlaFw00NjA1MTkwMjEw
+MTlaMFoxCzAJBgNVBAYTAkNOMSUwIwYDVQQKDBxUcnVzdEFzaWEgVGVjaG5vbG9naWVzLCBJbmMu
+MSQwIgYDVQQDDBtUcnVzdEFzaWEgR2xvYmFsIFJvb3QgQ0EgRzMwggIiMA0GCSqGSIb3DQEBAQUA
+A4ICDwAwggIKAoICAQDAMYJhkuSUGwoqZdC+BqmHO1ES6nBBruL7dOoKjbmzTNyPtxNST1QY4Sxz
+lZHFZjtqz6xjbYdT8PfxObegQ2OwxANdV6nnRM7EoYNl9lA+sX4WuDqKAtCWHwDNBSHvBm3dIZwZ
+Q0WhxeiAysKtQGIXBsaqvPPW5vxQfmZCHzyLpnl5hkA1nyDvP+uLRx+PjsXUjrYsyUQE49RDdT/V
+P68czH5GX6zfZBCK70bwkPAPLfSIC7Epqq+FqklYqL9joDiR5rPmd2jE+SoZhLsO4fWvieylL1Ag
+dB4SQXMeJNnKziyhWTXAyB1GJ2Faj/lN03J5Zh6fFZAhLf3ti1ZwA0pJPn9pMRJpxx5cynoTi+jm
+9WAPzJMshH/x/Gr8m0ed262IPfN2dTPXS6TIi/n1Q1hPy8gDVI+lhXgEGvNz8teHHUGf59gXzhqc
+D0r83ERoVGjiQTz+LISGNzzNPy+i2+f3VANfWdP3kXjHi3dqFuVJhZBFcnAvkV34PmVACxmZySYg
+WmjBNb9Pp1Hx2BErW+Canig7CjoKH8GB5S7wprlppYiU5msTf9FkPz2ccEblooV7WIQn3MSAPmea
+mseaMQ4w7OYXQJXZRe0Blqq/DPNL0WP3E1jAuPP6Z92bfW1K/zJMtSU7/xxnD4UiWQWRkUF3gdCF
+TIcQcf+eQxuulXUtgQIDAQABo2MwYTAPBgNVHRMBAf8EBTADAQH/MB8GA1UdIwQYMBaAFEDk5PIj
+7zjKsK5Xf/IhMBY027ySMB0GA1UdDgQWBBRA5OTyI+84yrCuV3/yITAWNNu8kjAOBgNVHQ8BAf8E
+BAMCAQYwDQYJKoZIhvcNAQEMBQADggIBACY7UeFNOPMyGLS0XuFlXsSUT9SnYaP4wM8zAQLpw6o1
+D/GUE3d3NZ4tVlFEbuHGLige/9rsR82XRBf34EzC4Xx8MnpmyFq2XFNFV1pF1AWZLy4jVe5jaN/T
+G3inEpQGAHUNcoTpLrxaatXeL1nHo+zSh2bbt1S1JKv0Q3jbSwTEb93mPmY+KfJLaHEih6D4sTNj
+duMNhXJEIlU/HHzp/LgV6FL6qj6jITk1dImmasI5+njPtqzn59ZW/yOSLlALqbUHM/Q4X6RJpstl
+cHboCoWASzY9M/eVVHUl2qzEc4Jl6VL1XP04lQJqaTDFHApXB64ipCz5xUG3uOyfT0gA+QEEVcys
++TIxxHWVBqB/0Y0n3bOppHKH/lmLmnp0Ft0WpWIp6zqW3IunaFnT63eROfjXy9mPX1onAX1daBli
+2MjN9LdyR75bl87yraKZk62Uy5P2EgmVtqvXO9A/EcswFi55gORngS1d7XB4tmBZrOFdRWOPyN9y
+aFvqHbgB8X7754qz41SgOAngPN5C8sLtLpvzHzW2NtjjgKGLzZlkD8Kqq7HK9W+eQ42EVJmzbsAS
+ZthwEPEGNTNDqJwuuhQxzhB/HIbjj9LV+Hfsm6vxL2PZQl/gZ4FkkfGXL/xuJvYz+NO1+MRiqzFR
+JQJ6+N1rZdVtTTDIZbpoFGWsJwt0ivKH
+-----END CERTIFICATE-----
+
+TrustAsia Global Root CA G4
+===========================
+-----BEGIN CERTIFICATE-----
+MIICVTCCAdygAwIBAgIUTyNkuI6XY57GU4HBdk7LKnQV1tcwCgYIKoZIzj0EAwMwWjELMAkGA1UE
+BhMCQ04xJTAjBgNVBAoMHFRydXN0QXNpYSBUZWNobm9sb2dpZXMsIEluYy4xJDAiBgNVBAMMG1Ry
+dXN0QXNpYSBHbG9iYWwgUm9vdCBDQSBHNDAeFw0yMTA1MjAwMjEwMjJaFw00NjA1MTkwMjEwMjJa
+MFoxCzAJBgNVBAYTAkNOMSUwIwYDVQQKDBxUcnVzdEFzaWEgVGVjaG5vbG9naWVzLCBJbmMuMSQw
+IgYDVQQDDBtUcnVzdEFzaWEgR2xvYmFsIFJvb3QgQ0EgRzQwdjAQBgcqhkjOPQIBBgUrgQQAIgNi
+AATxs8045CVD5d4ZCbuBeaIVXxVjAd7Cq92zphtnS4CDr5nLrBfbK5bKfFJV4hrhPVbwLxYI+hW8
+m7tH5j/uqOFMjPXTNvk4XatwmkcN4oFBButJ+bAp3TPsUKV/eSm4IJijYzBhMA8GA1UdEwEB/wQF
+MAMBAf8wHwYDVR0jBBgwFoAUpbtKl86zK3+kMd6Xg1mDpm9xy94wHQYDVR0OBBYEFKW7SpfOsyt/
+pDHel4NZg6ZvccveMA4GA1UdDwEB/wQEAwIBBjAKBggqhkjOPQQDAwNnADBkAjBe8usGzEkxn0AA
+bbd+NvBNEU/zy4k6LHiRUKNbwMp1JvK/kF0LgoxgKJ/GcJpo5PECMFxYDlZ2z1jD1xCMuo6u47xk
+dUfFVZDj/bpV6wfEU6s3qe4hsiFbYI89MvHVI5TWWA==
+-----END CERTIFICATE-----
+
+CommScope Public Trust ECC Root-01
+==================================
+-----BEGIN CERTIFICATE-----
+MIICHTCCAaOgAwIBAgIUQ3CCd89NXTTxyq4yLzf39H91oJ4wCgYIKoZIzj0EAwMwTjELMAkGA1UE
+BhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBUcnVz
+dCBFQ0MgUm9vdC0wMTAeFw0yMTA0MjgxNzM1NDNaFw00NjA0MjgxNzM1NDJaME4xCzAJBgNVBAYT
+AlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3Qg
+RUNDIFJvb3QtMDEwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAARLNumuV16ocNfQj3Rid8NeeqrltqLx
+eP0CflfdkXmcbLlSiFS8LwS+uM32ENEp7LXQoMPwiXAZu1FlxUOcw5tjnSCDPgYLpkJEhRGnSjot
+6dZoL0hOUysHP029uax3OVejQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMB0G
+A1UdDgQWBBSOB2LAUN3GGQYARnQE9/OufXVNMDAKBggqhkjOPQQDAwNoADBlAjEAnDPfQeMjqEI2
+Jpc1XHvr20v4qotzVRVcrHgpD7oh2MSg2NED3W3ROT3Ek2DS43KyAjB8xX6I01D1HiXo+k515liW
+pDVfG2XqYZpwI7UNo5uSUm9poIyNStDuiw7LR47QjRE=
+-----END CERTIFICATE-----
+
+CommScope Public Trust ECC Root-02
+==================================
+-----BEGIN CERTIFICATE-----
+MIICHDCCAaOgAwIBAgIUKP2ZYEFHpgE6yhR7H+/5aAiDXX0wCgYIKoZIzj0EAwMwTjELMAkGA1UE
+BhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBUcnVz
+dCBFQ0MgUm9vdC0wMjAeFw0yMTA0MjgxNzQ0NTRaFw00NjA0MjgxNzQ0NTNaME4xCzAJBgNVBAYT
+AlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1c3Qg
+RUNDIFJvb3QtMDIwdjAQBgcqhkjOPQIBBgUrgQQAIgNiAAR4MIHoYx7l63FRD/cHB8o5mXxO1Q/M
+MDALj2aTPs+9xYa9+bG3tD60B8jzljHz7aRP+KNOjSkVWLjVb3/ubCK1sK9IRQq9qEmUv4RDsNuE
+SgMjGWdqb8FuvAY5N9GIIvejQjBAMA8GA1UdEwEB/wQFMAMBAf8wDgYDVR0PAQH/BAQDAgEGMB0G
+A1UdDgQWBBTmGHX/72DehKT1RsfeSlXjMjZ59TAKBggqhkjOPQQDAwNnADBkAjAmc0l6tqvmSfR9
+Uj/UQQSugEODZXW5hYA4O9Zv5JOGq4/nich/m35rChJVYaoR4HkCMHfoMXGsPHED1oQmHhS48zs7
+3u1Z/GtMMH9ZzkXpc2AVmkzw5l4lIhVtwodZ0LKOag==
+-----END CERTIFICATE-----
+
+CommScope Public Trust RSA Root-01
+==================================
+-----BEGIN CERTIFICATE-----
+MIIFbDCCA1SgAwIBAgIUPgNJgXUWdDGOTKvVxZAplsU5EN0wDQYJKoZIhvcNAQELBQAwTjELMAkG
+A1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBU
+cnVzdCBSU0EgUm9vdC0wMTAeFw0yMTA0MjgxNjQ1NTRaFw00NjA0MjgxNjQ1NTNaME4xCzAJBgNV
+BAYTAlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1
+c3QgUlNBIFJvb3QtMDEwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQCwSGWjDR1C45Ft
+nYSkYZYSwu3D2iM0GXb26v1VWvZVAVMP8syMl0+5UMuzAURWlv2bKOx7dAvnQmtVzslhsuitQDy6
+uUEKBU8bJoWPQ7VAtYXR1HHcg0Hz9kXHgKKEUJdGzqAMxGBWBB0HW0alDrJLpA6lfO741GIDuZNq
+ihS4cPgugkY4Iw50x2tBt9Apo52AsH53k2NC+zSDO3OjWiE260f6GBfZumbCk6SP/F2krfxQapWs
+vCQz0b2If4b19bJzKo98rwjyGpg/qYFlP8GMicWWMJoKz/TUyDTtnS+8jTiGU+6Xn6myY5QXjQ/c
+Zip8UlF1y5mO6D1cv547KI2DAg+pn3LiLCuz3GaXAEDQpFSOm117RTYm1nJD68/A6g3czhLmfTif
+BSeolz7pUcZsBSjBAg/pGG3svZwG1KdJ9FQFa2ww8esD1eo9anbCyxooSU1/ZOD6K9pzg4H/kQO9
+lLvkuI6cMmPNn7togbGEW682v3fuHX/3SZtS7NJ3Wn2RnU3COS3kuoL4b/JOHg9O5j9ZpSPcPYeo
+KFgo0fEbNttPxP/hjFtyjMcmAyejOQoBqsCyMWCDIqFPEgkBEa801M/XrmLTBQe0MXXgDW1XT2mH
++VepuhX2yFJtocucH+X8eKg1mp9BFM6ltM6UCBwJrVbl2rZJmkrqYxhTnCwuwwIDAQABo0IwQDAP
+BgNVHRMBAf8EBTADAQH/MA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUN12mmnQywsL5x6YVEFm4
+5P3luG0wDQYJKoZIhvcNAQELBQADggIBAK+nz97/4L1CjU3lIpbfaOp9TSp90K09FlxD533Ahuh6
+NWPxzIHIxgvoLlI1pKZJkGNRrDSsBTtXAOnTYtPZKdVUvhwQkZyybf5Z/Xn36lbQnmhUQo8mUuJM
+3y+Xpi/SB5io82BdS5pYV4jvguX6r2yBS5KPQJqTRlnLX3gWsWc+QgvfKNmwrZggvkN80V4aCRck
+jXtdlemrwWCrWxhkgPut4AZ9HcpZuPN4KWfGVh2vtrV0KnahP/t1MJ+UXjulYPPLXAziDslg+Mkf
+Foom3ecnf+slpoq9uC02EJqxWE2aaE9gVOX2RhOOiKy8IUISrcZKiX2bwdgt6ZYD9KJ0DLwAHb/W
+NyVntHKLr4W96ioDj8z7PEQkguIBpQtZtjSNMgsSDesnwv1B10A8ckYpwIzqug/xBpMu95yo9GA+
+o/E4Xo4TwbM6l4c/ksp4qRyv0LAbJh6+cOx69TOY6lz/KwsETkPdY34Op054A5U+1C0wlREQKC6/
+oAI+/15Z0wUOlV9TRe9rh9VIzRamloPh37MG88EU26fsHItdkJANclHnYfkUyq+Dj7+vsQpZXdxc
+1+SWrVtgHdqul7I52Qb1dgAT+GhMIbA1xNxVssnBQVocicCMb3SgazNNtQEo/a2tiRc7ppqEvOuM
+6sRxJKi6KfkIsidWNTJf6jn7MZrVGczw
+-----END CERTIFICATE-----
+
+CommScope Public Trust RSA Root-02
+==================================
+-----BEGIN CERTIFICATE-----
+MIIFbDCCA1SgAwIBAgIUVBa/O345lXGN0aoApYYNK496BU4wDQYJKoZIhvcNAQELBQAwTjELMAkG
+A1UEBhMCVVMxEjAQBgNVBAoMCUNvbW1TY29wZTErMCkGA1UEAwwiQ29tbVNjb3BlIFB1YmxpYyBU
+cnVzdCBSU0EgUm9vdC0wMjAeFw0yMTA0MjgxNzE2NDNaFw00NjA0MjgxNzE2NDJaME4xCzAJBgNV
+BAYTAlVTMRIwEAYDVQQKDAlDb21tU2NvcGUxKzApBgNVBAMMIkNvbW1TY29wZSBQdWJsaWMgVHJ1
+c3QgUlNBIFJvb3QtMDIwggIiMA0GCSqGSIb3DQEBAQUAA4ICDwAwggIKAoICAQDh+g77aAASyE3V
+rCLENQE7xVTlWXZjpX/rwcRqmL0yjReA61260WI9JSMZNRTpf4mnG2I81lDnNJUDMrG0kyI9p+Kx
+7eZ7Ti6Hmw0zdQreqjXnfuU2mKKuJZ6VszKWpCtYHu8//mI0SFHRtI1CrWDaSWqVcN3SAOLMV2MC
+e5bdSZdbkk6V0/nLKR8YSvgBKtJjCW4k6YnS5cciTNxzhkcAqg2Ijq6FfUrpuzNPDlJwnZXjfG2W
+Wy09X6GDRl224yW4fKcZgBzqZUPckXk2LHR88mcGyYnJ27/aaL8j7dxrrSiDeS/sOKUNNwFnJ5rp
+M9kzXzehxfCrPfp4sOcsn/Y+n2Dg70jpkEUeBVF4GiwSLFworA2iI540jwXmojPOEXcT1A6kHkIf
+hs1w/tkuFT0du7jyU1fbzMZ0KZwYszZ1OC4PVKH4kh+Jlk+71O6d6Ts2QrUKOyrUZHk2EOH5kQMr
+eyBUzQ0ZGshBMjTRsJnhkB4BQDa1t/qp5Xd1pCKBXbCL5CcSD1SIxtuFdOa3wNemKfrb3vOTlycE
+VS8KbzfFPROvCgCpLIscgSjX74Yxqa7ybrjKaixUR9gqiC6vwQcQeKwRoi9C8DfF8rhW3Q5iLc4t
+Vn5V8qdE9isy9COoR+jUKgF4z2rDN6ieZdIs5fq6M8EGRPbmz6UNp2YINIos8wIDAQABo0IwQDAP
+BgNVHRMBAf8EBTADAQH/MA4GA1UdDwEB/wQEAwIBBjAdBgNVHQ4EFgQUR9DnsSL/nSz12Vdgs7Gx
+cJXvYXowDQYJKoZIhvcNAQELBQADggIBAIZpsU0v6Z9PIpNojuQhmaPORVMbc0RTAIFhzTHjCLqB
+KCh6krm2qMhDnscTJk3C2OVVnJJdUNjCK9v+5qiXz1I6JMNlZFxHMaNlNRPDk7n3+VGXu6TwYofF
+1gbTl4MgqX67tiHCpQ2EAOHyJxCDut0DgdXdaMNmEMjRdrSzbymeAPnCKfWxkxlSaRosTKCL4BWa
+MS/TiJVZbuXEs1DIFAhKm4sTg7GkcrI7djNB3NyqpgdvHSQSn8h2vS/ZjvQs7rfSOBAkNlEv41xd
+gSGn2rtO/+YHqP65DSdsu3BaVXoT6fEqSWnHX4dXTEN5bTpl6TBcQe7rd6VzEojov32u5cSoHw2O
+HG1QAk8mGEPej1WFsQs3BWDJVTkSBKEqz3EWnzZRSb9wO55nnPt7eck5HHisd5FUmrh1CoFSl+Nm
+YWvtPjgelmFV4ZFUjO2MJB+ByRCac5krFk5yAD9UG/iNuovnFNa2RU9g7Jauwy8CTl2dlklyALKr
+dVwPaFsdZcJfMw8eD/A7hvWwTruc9+olBdytoptLFwG+Qt81IR2tq670v64fG9PiO/yzcnMcmyiQ
+iRM9HcEARwmWmjgb3bHPDcK0RPOWlc4yOo80nOAXx17Org3bhzjlP1v9mxnhMUF6cKojawHhRUzN
+lM47ni3niAIi9G7oyOzWPPO5std3eqx7
+-----END CERTIFICATE-----
diff --git a/packages/sysutils/systemd/package.mk b/packages/sysutils/systemd/package.mk
index 9af3ea927a..8edc857b0c 100644
--- a/packages/sysutils/systemd/package.mk
+++ b/packages/sysutils/systemd/package.mk
@@ -202,7 +202,11 @@ post_makeinstall_target() {
 
   # tune logind.conf
   sed -e "s,^.*HandleLidSwitch=.*$,HandleLidSwitch=ignore,g" -i ${INSTALL}/etc/systemd/logind.conf
-  sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=ignore,g" -i ${INSTALL}/etc/systemd/logind.conf
+  if [ "${DISPLAYSERVER}" = "no" ]; then
+    sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=poweroff,g" -i ${INSTALL}/etc/systemd/logind.conf
+  else
+    sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=ignore,g" -i ${INSTALL}/etc/systemd/logind.conf
+  fi
 
   if [ "${DISTRO}" = "Lakka" -a "${PROJECT}" = "RPi" ]; then
     sed -e "s,^.*HandlePowerKey=.*$,HandlePowerKey=poweroff,g" -i $INSTALL/etc/systemd/logind.conf
diff --git a/packages/tools/bcm2835-bootloader/files/update.sh b/packages/tools/bcm2835-bootloader/files/update.sh
index 1d07b7fe93..500d2079aa 100755
--- a/packages/tools/bcm2835-bootloader/files/update.sh
+++ b/packages/tools/bcm2835-bootloader/files/update.sh
@@ -17,9 +17,11 @@ mount -o remount,rw $BOOT_ROOT
 
 # update bootloader files
 cp -p $SYSTEM_ROOT/usr/share/bootloader/LICENCE* $BOOT_ROOT
-cp -p $SYSTEM_ROOT/usr/share/bootloader/bootcode.bin $BOOT_ROOT
-cp -p $SYSTEM_ROOT/usr/share/bootloader/fixup.dat $BOOT_ROOT
-cp -p $SYSTEM_ROOT/usr/share/bootloader/start.elf $BOOT_ROOT
+for f in bootcode.bin fixup.dat start.elf ; do
+  if [ -f "${SYSTEM_ROOT}/usr/share/bootloader/$f" ]; then
+    cp -p "${SYSTEM_ROOT}/usr/share/bootloader/$f" "${BOOT_ROOT}"
+  fi
+done
 
 rm -f $BOOT_ROOT/bcm283*.dtb # cleanup excess dtb's used by upstream kernels (ie. not LE)
 cp -p $SYSTEM_ROOT/usr/share/bootloader/*.dtb $BOOT_ROOT
diff --git a/packages/tools/bcm2835-bootloader/package.mk b/packages/tools/bcm2835-bootloader/package.mk
index b4d99f118b..b8dcb35443 100644
--- a/packages/tools/bcm2835-bootloader/package.mk
+++ b/packages/tools/bcm2835-bootloader/package.mk
@@ -3,8 +3,8 @@
 # Copyright (C) 2017-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="bcm2835-bootloader"
-PKG_VERSION="543692d23dff7075915bc9c7e34abb3fe28e1c46"
-PKG_SHA256="718389513a36ce7073ee26666dfbdfeb259a7e88beaac603c3b7d1f5bde067a2"
+PKG_VERSION="fdb9eafae4b83e553593937eae8e77b0193903c3"
+PKG_SHA256="ce45b07afce3279f9d31fe12008c5250de4da5491bd9ced2de2f2ebb563aea80"
 PKG_ARCH="arm aarch64"
 PKG_LICENSE="nonfree"
 PKG_SITE="http://www.broadcom.com"
@@ -16,14 +16,19 @@ PKG_TOOLCHAIN="manual"
 makeinstall_target() {
   mkdir -p ${INSTALL}/usr/share/bootloader
     cp -PRv LICENCE* ${INSTALL}/usr/share/bootloader
-    cp -PRv bootcode.bin ${INSTALL}/usr/share/bootloader
-    if [ "${DEVICE:0:4}" = "RPi4" ]; then
-      cp -PRv fixup4x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
-      cp -PRv start4x.elf ${INSTALL}/usr/share/bootloader/start.elf
-    else
-      cp -PRv fixup_x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
-      cp -PRv start_x.elf ${INSTALL}/usr/share/bootloader/start.elf
-    fi
+    case "${DEVICE}" in
+      RPi4)
+        cp -PRv fixup4x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
+        cp -PRv start4x.elf ${INSTALL}/usr/share/bootloader/start.elf
+        ;;
+      RPi5)
+        ;;
+      *)
+        cp -PRv bootcode.bin ${INSTALL}/usr/share/bootloader
+        cp -PRv fixup_x.dat ${INSTALL}/usr/share/bootloader/fixup.dat
+        cp -PRv start_x.elf ${INSTALL}/usr/share/bootloader/start.elf
+        ;;
+    esac
 
     find_file_path bootloader/update.sh ${PKG_DIR}/files/update.sh && cp -PRv ${FOUND_PATH} ${INSTALL}/usr/share/bootloader
     find_file_path bootloader/canupdate.sh && cp -PRv ${FOUND_PATH} ${INSTALL}/usr/share/bootloader
diff --git a/packages/tools/bcm2835-bootloader/release b/packages/tools/bcm2835-bootloader/release
index 7ec99fd3a2..efdb7322e7 100755
--- a/packages/tools/bcm2835-bootloader/release
+++ b/packages/tools/bcm2835-bootloader/release
@@ -6,9 +6,11 @@
 
 mkdir -p $RELEASE_DIR/3rdparty/bootloader
   cp -PR $INSTALL/usr/share/bootloader/LICENCE* $RELEASE_DIR/3rdparty/bootloader/
-  cp -PR $INSTALL/usr/share/bootloader/bootcode.bin $RELEASE_DIR/3rdparty/bootloader/
-  cp -PR $INSTALL/usr/share/bootloader/fixup.dat $RELEASE_DIR/3rdparty/bootloader/
-  cp -PR $INSTALL/usr/share/bootloader/start.elf $RELEASE_DIR/3rdparty/bootloader/
+  for f in bootcode.bin fixup.dat start.elf ; do
+    if [ -f "${INSTALL}/usr/share/bootloader/$f" ]; then
+      cp -PR "${INSTALL}/usr/share/bootloader/$f" "${RELEASE_DIR}/3rdparty/bootloader/"
+    fi
+  done
   cp -PR $INSTALL/usr/share/bootloader/*.dtb $RELEASE_DIR/3rdparty/bootloader/
   cp -PR $INSTALL/usr/share/bootloader/overlays $RELEASE_DIR/3rdparty/bootloader/
 
diff --git a/packages/tools/rpi-eeprom/package.mk b/packages/tools/rpi-eeprom/package.mk
index 3be9bc3156..42f320cec5 100644
--- a/packages/tools/rpi-eeprom/package.mk
+++ b/packages/tools/rpi-eeprom/package.mk
@@ -2,8 +2,8 @@
 # Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv)
 
 PKG_NAME="rpi-eeprom"
-PKG_VERSION="75d3a760469130cb537e5d8d504f892336abd62b"
-PKG_SHA256="a71573a80149b1c2c4b6d5ec1527ef011611c6883a0cf06c02961fe518384307"
+PKG_VERSION="6b14e84a2fb2e1f7220a404f65e7e0985f07c9e5"
+PKG_SHA256="3907711bb2ff78a0e9120709b72b04d6d010f93f79d525af0454d3d27a772aca"
 PKG_LICENSE="BSD-3/custom"
 PKG_SITE="https://github.com/raspberrypi/rpi-eeprom"
 PKG_URL="https://github.com/raspberrypi/rpi-eeprom/archive/${PKG_VERSION}.tar.gz"
@@ -12,14 +12,20 @@ PKG_LONGDESC="rpi-eeprom: firmware, config and scripts to update RPi4 SPI bootlo
 PKG_TOOLCHAIN="manual"
 
 makeinstall_target() {
-  DESTDIR=${INSTALL}/$(get_kernel_overlay_dir)/lib/firmware/raspberrypi/bootloader
+  
+  if [ "${DEVICE}" = "RPi4" ]; then
+    _variant="2711"
+  else
+    _variant="2712"
+  fi
+
+  DESTDIR=${INSTALL}/$(get_kernel_overlay_dir)/lib/firmware/raspberrypi/bootloader-${_variant}
 
   mkdir -p ${DESTDIR}
-    _dirs="critical stable"
-    [ "${LIBREELEC_VERSION}" = "devel" ] && _dirs+=" beta"
+    _dirs="default latest"
 
     for _maindir in ${_dirs}; do
-      for _dir in ${PKG_BUILD}/firmware/${_maindir} ${PKG_BUILD}/firmware/{_maindir}-*; do
+      for _dir in ${PKG_BUILD}/firmware-${_variant}/${_maindir} ${PKG_BUILD}/firmware-${_variant}/${_maindir}-*; do
         [ -d "${_dir}" ] || continue
 
         _basedir="$(basename "${_dir}")"
@@ -31,14 +37,17 @@ makeinstall_target() {
           PKG_FW_FILE="$(ls -1 /${_dir}/pieeprom-* 2>/dev/null | tail -1)"
           [ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir}
 
-          # VIA USB3
-          PKG_FW_FILE="$(ls -1 ${_dir}/vl805-*.bin 2>/dev/null | tail -1)"
-          [ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir}
+          if [ "${DEVICE}" = "RPi4" ]; then
+            # VIA USB3
+            PKG_FW_FILE="$(ls -1 ${_dir}/vl805-*.bin 2>/dev/null | tail -1)"
+            [ -n "${PKG_FW_FILE}" ] && cp -PRv "${PKG_FW_FILE}" ${DESTDIR}/${_basedir}
+          fi
       done
     done
 
-    # also copy default and latest symlinks
-    cp -Prv ${PKG_BUILD}/firmware/{default,latest} ${DESTDIR}
+    # also create symlinks for the legacy directory names (critical -> default, stable -> latest)
+    ln -s default ${DESTDIR}/critical
+    ln -s latest ${DESTDIR}/stable
 
   mkdir -p ${INSTALL}/usr/bin
     cp -PRv ${PKG_DIR}/source/rpi-eeprom-update ${INSTALL}/usr/bin
diff --git a/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat b/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat
index 4349b287e4..2fbd2899ab 100644
--- a/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat
+++ b/projects/Allwinner/firmwares/brcmfmac_sdio-firmware.dat
@@ -2,3 +2,4 @@
 BCM43430A1.def
 BCM43430A1.vim
 *.txt
+brcmfmac43456-sdio.bin
diff --git a/projects/Allwinner/linux/linux.aarch64.conf b/projects/Allwinner/linux/linux.aarch64.conf
index 1323a014f5..5994eedeb3 100644
--- a/projects/Allwinner/linux/linux.aarch64.conf
+++ b/projects/Allwinner/linux/linux.aarch64.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm64 6.1.0-rc6 Kernel Configuration
+# Linux/arm64 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="aarch64-none-elf-gcc-12.2.0 (GCC) 12.2.0"
 CONFIG_CC_IS_GCC=y
@@ -338,6 +338,7 @@ CONFIG_ARCH_SUNXI=y
 #
 # ARM errata workarounds via the alternatives framework
 #
+# CONFIG_AMPERE_ERRATUM_AC03_CPU_38 is not set
 CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y
 CONFIG_ARM64_ERRATUM_826319=y
 CONFIG_ARM64_ERRATUM_827319=y
@@ -364,6 +365,7 @@ CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419=y
 # CONFIG_ARM64_ERRATUM_2054223 is not set
 # CONFIG_ARM64_ERRATUM_2067961 is not set
 # CONFIG_ARM64_ERRATUM_2441009 is not set
+# CONFIG_ARM64_ERRATUM_2966298 is not set
 # CONFIG_CAVIUM_ERRATUM_22375 is not set
 # CONFIG_CAVIUM_ERRATUM_23154 is not set
 # CONFIG_CAVIUM_ERRATUM_27456 is not set
@@ -853,6 +855,7 @@ CONFIG_SECRETMEM=y
 # CONFIG_ANON_VMA_NAME is not set
 # CONFIG_USERFAULTFD is not set
 # CONFIG_LRU_GEN is not set
+CONFIG_LOCK_MM_AND_FIND_VMA=y
 
 #
 # Data Access Monitoring
@@ -1709,7 +1712,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 CONFIG_SCSI_CONSTANTS=y
@@ -3660,10 +3663,7 @@ CONFIG_MEDIA_ATTACH=y
 # IR I2C driver auto-selected by 'Autoselect ancillary drivers'
 #
 CONFIG_VIDEO_IR_I2C=y
-
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_HI556 is not set
 # CONFIG_VIDEO_HI846 is not set
@@ -3730,7 +3730,6 @@ CONFIG_VIDEO_OV7640=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -6460,7 +6459,7 @@ CONFIG_CIFS_DEBUG=y
 # CONFIG_CIFS_SWN_UPCALL is not set
 # CONFIG_CIFS_ROOT is not set
 # CONFIG_SMB_SERVER is not set
-CONFIG_SMBFS_COMMON=y
+CONFIG_SMBFS=y
 # CONFIG_CODA_FS is not set
 # CONFIG_AFS_FS is not set
 CONFIG_NLS=y
diff --git a/projects/Allwinner/linux/linux.arm.conf b/projects/Allwinner/linux/linux.arm.conf
index 76fec8c764..863d613a86 100644
--- a/projects/Allwinner/linux/linux.arm.conf
+++ b/projects/Allwinner/linux/linux.arm.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm 6.1.0-rc6 Kernel Configuration
+# Linux/arm 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="armv7ve-libreelec-linux-gnueabihf-gcc-12.2.0 (GCC) 12.2.0"
 CONFIG_CC_IS_GCC=y
@@ -611,6 +611,7 @@ CONFIG_GENERIC_IDLE_POLL_SETUP=y
 CONFIG_ARCH_HAS_FORTIFY_SOURCE=y
 CONFIG_ARCH_HAS_KEEPINITRD=y
 CONFIG_ARCH_HAS_SET_MEMORY=y
+CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y
 CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y
 CONFIG_ARCH_32BIT_OFF_T=y
 CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y
@@ -821,6 +822,7 @@ CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY=y
 # CONFIG_ANON_VMA_NAME is not set
 # CONFIG_USERFAULTFD is not set
 # CONFIG_LRU_GEN is not set
+CONFIG_LOCK_MM_AND_FIND_VMA=y
 
 #
 # Data Access Monitoring
@@ -1574,7 +1576,7 @@ CONFIG_SCSI_PROC_FS=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 CONFIG_SCSI_CONSTANTS=y
@@ -3398,9 +3400,7 @@ CONFIG_MEDIA_ATTACH=y
 #
 CONFIG_VIDEO_IR_I2C=y
 
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_HI556 is not set
 # CONFIG_VIDEO_HI846 is not set
@@ -3467,7 +3467,6 @@ CONFIG_VIDEO_OV7640=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -6060,7 +6059,7 @@ CONFIG_CIFS_DEBUG=y
 # CONFIG_CIFS_SWN_UPCALL is not set
 # CONFIG_CIFS_ROOT is not set
 # CONFIG_SMB_SERVER is not set
-CONFIG_SMBFS_COMMON=y
+CONFIG_SMBFS=y
 # CONFIG_CODA_FS is not set
 # CONFIG_AFS_FS is not set
 CONFIG_NLS=y
diff --git a/projects/Allwinner/patches/linux/0014-drm_call_drm_atomic_helper_shutdown_at_shutdown.patch b/projects/Allwinner/patches/linux/0014-drm_call_drm_atomic_helper_shutdown_at_shutdown.patch
new file mode 100644
index 0000000000..218fdae116
--- /dev/null
+++ b/projects/Allwinner/patches/linux/0014-drm_call_drm_atomic_helper_shutdown_at_shutdown.patch
@@ -0,0 +1,61 @@
+Subject: [PATCH] drm: Call drm_atomic_helper_shutdown() at shutdown time for misc drivers
+From: Douglas Anderson <dianders@chromium.org>
+Date: Fri, 01 Sep 2023 16:39:53 -0700
+MIME-Version: 1.0
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 7bit
+
+Based on grepping through the source code these drivers appear to be
+missing a call to drm_atomic_helper_shutdown() at system shutdown
+time. Among other things, this means that if a panel is in use that it
+won't be cleanly powered off at system shutdown time.
+
+The fact that we should call drm_atomic_helper_shutdown() in the case
+of OS shutdown/restart comes straight out of the kernel doc "driver
+instance overview" in drm_drv.c.
+
+All of the drivers in this patch were fairly straightforward to fix
+since they already had a call to drm_atomic_helper_shutdown() at
+remove/unbind time but were just lacking one at system shutdown. The
+only hitch is that some of these drivers use the component model to
+register/unregister their DRM devices. The shutdown callback is part
+of the original device. The typical solution here, based on how other
+DRM drivers do this, is to keep track of whether the device is bound
+based on drvdata. In most cases the drvdata is the drm_device, so we
+can just make sure it is NULL when the device is not bound. In some
+drivers, this required minor code changes. To make things simpler,
+drm_atomic_helper_shutdown() has been modified to consider a NULL
+drm_device as a noop in the patch ("drm/atomic-helper:
+drm_atomic_helper_shutdown(NULL) should be a noop").
+
+Suggested-by: Maxime Ripard <mripard@kernel.org>
+Signed-off-by: Douglas Anderson <dianders@chromium.org>
+Acked-by: Maxime Ripard <mripard@kernel.org>
+Link: https://lore.kernel.org/r/20230901163944.RFT.2.I9115e5d094a43e687978b0699cc1fe9f2a3452ea@changeid
+---
+<snip>
+diff --git a/drivers/gpu/drm/sun4i/sun4i_drv.c b/drivers/gpu/drm/sun4i/sun4i_drv.c
+index 6a8dfc022d3c..35d7a7ffd208 100644
+--- a/drivers/gpu/drm/sun4i/sun4i_drv.c
++++ b/drivers/gpu/drm/sun4i/sun4i_drv.c
+@@ -413,6 +413,11 @@ static void sun4i_drv_remove(struct platform_device *pdev)
+ 	component_master_del(&pdev->dev, &sun4i_drv_master_ops);
+ }
+ 
++static void sun4i_drv_shutdown(struct platform_device *pdev)
++{
++	drm_atomic_helper_shutdown(platform_get_drvdata(pdev));
++}
++
+ static const struct of_device_id sun4i_drv_of_table[] = {
+ 	{ .compatible = "allwinner,sun4i-a10-display-engine" },
+ 	{ .compatible = "allwinner,sun5i-a10s-display-engine" },
+@@ -437,6 +442,7 @@ MODULE_DEVICE_TABLE(of, sun4i_drv_of_table);
+ static struct platform_driver sun4i_drv_platform_driver = {
+ 	.probe		= sun4i_drv_probe,
+ 	.remove		= sun4i_drv_remove,
++	.shutdown	= sun4i_drv_shutdown,
+ 	.driver		= {
+ 		.name		= "sun4i-drm",
+ 		.of_match_table	= sun4i_drv_of_table,
+<snip>
diff --git a/projects/Amlogic/linux/linux.aarch64.conf b/projects/Amlogic/linux/linux.aarch64.conf
index 3122a3c071..75c4284a5d 100644
--- a/projects/Amlogic/linux/linux.aarch64.conf
+++ b/projects/Amlogic/linux/linux.aarch64.conf
@@ -1770,7 +1770,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
diff --git a/projects/Generic/linux/linux.x86_64.conf b/projects/Generic/linux/linux.x86_64.conf
index 73a6b53fa5..3e55a1e821 100644
--- a/projects/Generic/linux/linux.x86_64.conf
+++ b/projects/Generic/linux/linux.x86_64.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/x86 6.1.30 Kernel Configuration
+# Linux/x86 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="x86_64-libreelec-linux-gnu-gcc-12.2.0 (GCC) 12.2.0"
 CONFIG_CC_IS_GCC=y
@@ -472,7 +472,9 @@ CONFIG_RETHUNK=y
 CONFIG_CPU_UNRET_ENTRY=y
 CONFIG_CPU_IBPB_ENTRY=y
 CONFIG_CPU_IBRS_ENTRY=y
+CONFIG_CPU_SRSO=y
 # CONFIG_SLS is not set
+# CONFIG_GDS_FORCE_MITIGATION is not set
 CONFIG_ARCH_HAS_ADD_PAGES=y
 CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y
 
@@ -666,6 +668,7 @@ CONFIG_GENERIC_SMP_IDLE_THREAD=y
 CONFIG_ARCH_HAS_FORTIFY_SOURCE=y
 CONFIG_ARCH_HAS_SET_MEMORY=y
 CONFIG_ARCH_HAS_SET_DIRECT_MAP=y
+CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y
 CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y
 CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y
 CONFIG_ARCH_WANTS_NO_INSTR=y
@@ -969,6 +972,7 @@ CONFIG_SECRETMEM=y
 # CONFIG_ANON_VMA_NAME is not set
 # CONFIG_USERFAULTFD is not set
 # CONFIG_LRU_GEN is not set
+CONFIG_LOCK_MM_AND_FIND_VMA=y
 
 #
 # Data Access Monitoring
@@ -1853,7 +1857,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-CONFIG_CHR_DEV_SG=y
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
@@ -4066,9 +4070,7 @@ CONFIG_MEDIA_ATTACH=y
 #
 CONFIG_VIDEO_IR_I2C=m
 
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_HI556 is not set
 # CONFIG_VIDEO_HI846 is not set
@@ -4130,7 +4132,6 @@ CONFIG_VIDEO_IR_I2C=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -6402,7 +6403,7 @@ CONFIG_CIFS_DEBUG=y
 CONFIG_CIFS_FSCACHE=y
 # CONFIG_CIFS_ROOT is not set
 # CONFIG_SMB_SERVER is not set
-CONFIG_SMBFS_COMMON=y
+CONFIG_SMBFS=y
 # CONFIG_CODA_FS is not set
 # CONFIG_AFS_FS is not set
 CONFIG_NLS=y
diff --git a/projects/NXP/devices/iMX6/linux/linux.arm.conf b/projects/NXP/devices/iMX6/linux/linux.arm.conf
index 48ba320fd5..29c5df0dde 100644
--- a/projects/NXP/devices/iMX6/linux/linux.arm.conf
+++ b/projects/NXP/devices/iMX6/linux/linux.arm.conf
@@ -1818,7 +1818,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 # CONFIG_BLK_DEV_SR is not set
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_CHR_DEV_SCH is not set
 CONFIG_SCSI_CONSTANTS=y
diff --git a/projects/NXP/devices/iMX8/linux/linux.aarch64.conf b/projects/NXP/devices/iMX8/linux/linux.aarch64.conf
index 49ff57d6da..c0ee96e16a 100644
--- a/projects/NXP/devices/iMX8/linux/linux.aarch64.conf
+++ b/projects/NXP/devices/iMX8/linux/linux.aarch64.conf
@@ -1775,7 +1775,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
diff --git a/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf b/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf
index 3df420b438..4e3f207144 100644
--- a/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf
+++ b/projects/Qualcomm/devices/Dragonboard/linux/linux.aarch64.conf
@@ -1933,7 +1933,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 # CONFIG_BLK_DEV_SR is not set
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
diff --git a/projects/RPi/devices/RPi/linux/linux.arm.conf b/projects/RPi/devices/RPi/linux/linux.arm.conf
index 76ab802670..8f7f0f32e6 100644
--- a/projects/RPi/devices/RPi/linux/linux.arm.conf
+++ b/projects/RPi/devices/RPi/linux/linux.arm.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm 6.1.38 Kernel Configuration
+# Linux/arm 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="arm-linux-gnueabihf-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]"
 CONFIG_CC_IS_GCC=y
@@ -496,6 +496,7 @@ CONFIG_GENERIC_IDLE_POLL_SETUP=y
 CONFIG_ARCH_HAS_FORTIFY_SOURCE=y
 CONFIG_ARCH_HAS_KEEPINITRD=y
 CONFIG_ARCH_HAS_SET_MEMORY=y
+CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y
 CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y
 CONFIG_ARCH_32BIT_OFF_T=y
 CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y
@@ -1065,8 +1066,6 @@ CONFIG_NET_SCH_FQ_CODEL=y
 # CONFIG_NET_CLS_ROUTE4 is not set
 # CONFIG_NET_CLS_FW is not set
 # CONFIG_NET_CLS_U32 is not set
-# CONFIG_NET_CLS_RSVP is not set
-# CONFIG_NET_CLS_RSVP6 is not set
 # CONFIG_NET_CLS_FLOW is not set
 # CONFIG_NET_CLS_CGROUP is not set
 # CONFIG_NET_CLS_BPF is not set
@@ -1396,7 +1395,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
@@ -1943,6 +1942,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
 # CONFIG_INPUT_DRV260X_HAPTICS is not set
 # CONFIG_INPUT_DRV2665_HAPTICS is not set
 # CONFIG_INPUT_DRV2667_HAPTICS is not set
+# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set
 CONFIG_RMI4_CORE=y
 # CONFIG_RMI4_I2C is not set
 # CONFIG_RMI4_SPI is not set
@@ -1981,7 +1981,6 @@ CONFIG_SERIO_SERPORT=y
 CONFIG_BRCM_CHAR_DRIVERS=y
 CONFIG_BCM2708_VCMEM=y
 CONFIG_BCM_VCIO=y
-CONFIG_BCM2835_DEVGPIOMEM=m
 CONFIG_BCM2835_SMI_DEV=m
 # CONFIG_RPIVID_MEM is not set
 CONFIG_TTY=y
@@ -2066,6 +2065,7 @@ CONFIG_DEVMEM=y
 # CONFIG_XILLYUSB is not set
 CONFIG_RANDOM_TRUST_CPU=y
 CONFIG_RANDOM_TRUST_BOOTLOADER=y
+CONFIG_RASPBERRYPI_GPIOMEM=y
 # end of Character devices
 
 #
@@ -2214,6 +2214,8 @@ CONFIG_GENERIC_PINCONF=y
 # CONFIG_PINCTRL_SINGLE is not set
 # CONFIG_PINCTRL_STMFX is not set
 # CONFIG_PINCTRL_SX150X is not set
+# CONFIG_PINCTRL_RP1 is not set
+# CONFIG_PINCTRL_BCM2712 is not set
 CONFIG_PINCTRL_BCM2835=y
 
 #
@@ -2238,6 +2240,7 @@ CONFIG_GPIO_CDEV_V1=y
 # CONFIG_GPIO_ALTERA is not set
 CONFIG_GPIO_RASPBERRYPI_EXP=y
 CONFIG_GPIO_BCM_VIRT=y
+# CONFIG_GPIO_BRCMSTB is not set
 # CONFIG_GPIO_CADENCE is not set
 # CONFIG_GPIO_DWAPB is not set
 # CONFIG_GPIO_FTGPIO010 is not set
@@ -2881,11 +2884,14 @@ CONFIG_DVB_CORE=m
 # Video4Linux options
 #
 CONFIG_VIDEO_V4L2_I2C=y
+CONFIG_VIDEO_V4L2_SUBDEV_API=y
 # CONFIG_VIDEO_ADV_DEBUG is not set
 # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set
 CONFIG_VIDEO_TUNER=m
 CONFIG_V4L2_MEM2MEM_DEV=m
 # CONFIG_V4L2_FLASH_LED_CLASS is not set
+CONFIG_V4L2_FWNODE=m
+CONFIG_V4L2_ASYNC=m
 # end of Video4Linux options
 
 #
@@ -3074,6 +3080,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y
 # Qualcomm media platform drivers
 #
 
+#
+# Raspberry Pi media platform drivers
+#
+# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set
+# CONFIG_VIDEO_RP1_CFE is not set
+
 #
 # Renesas media platform drivers
 #
@@ -3145,10 +3157,7 @@ CONFIG_MEDIA_ATTACH=y
 # IR I2C driver auto-selected by 'Autoselect ancillary drivers'
 #
 CONFIG_VIDEO_IR_I2C=m
-
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_ARDUCAM_64MP is not set
 # CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set
@@ -3167,6 +3176,7 @@ CONFIG_VIDEO_IR_I2C=m
 # CONFIG_VIDEO_IMX335 is not set
 # CONFIG_VIDEO_IMX355 is not set
 # CONFIG_VIDEO_IMX412 is not set
+# CONFIG_VIDEO_IMX477 is not set
 # CONFIG_VIDEO_IMX519 is not set
 # CONFIG_VIDEO_IMX708 is not set
 # CONFIG_VIDEO_MT9M001 is not set
@@ -3221,7 +3231,6 @@ CONFIG_VIDEO_IR_I2C=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -3258,7 +3267,6 @@ CONFIG_VIDEO_MSP3400=m
 # CONFIG_VIDEO_TLV320AIC23B is not set
 # CONFIG_VIDEO_TVAUDIO is not set
 # CONFIG_VIDEO_UDA1342 is not set
-# CONFIG_VIDEO_IMX477 is not set
 # CONFIG_VIDEO_VP27SMPX is not set
 # CONFIG_VIDEO_WM8739 is not set
 CONFIG_VIDEO_WM8775=m
@@ -3749,6 +3757,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y
 # CONFIG_DRM_V3D is not set
 CONFIG_DRM_VC4=y
 CONFIG_DRM_VC4_HDMI_CEC=y
+# CONFIG_DRM_RP1_DSI is not set
+# CONFIG_DRM_RP1_DPI is not set
+# CONFIG_DRM_RP1_VEC is not set
 # CONFIG_DRM_ETNAVIV is not set
 # CONFIG_DRM_LOGICVC is not set
 # CONFIG_DRM_ARCPGU is not set
@@ -5024,6 +5035,7 @@ CONFIG_PWM_BCM2835=m
 # CONFIG_PWM_FSL_FTM is not set
 # CONFIG_PWM_PCA9685 is not set
 CONFIG_PWM_RASPBERRYPI_POE=m
+# CONFIG_PWM_RP1 is not set
 # CONFIG_PWM_XILINX is not set
 
 #
@@ -5037,6 +5049,7 @@ CONFIG_BRCMSTB_L2_IRQ=y
 
 # CONFIG_IPACK_BUS is not set
 CONFIG_RESET_CONTROLLER=y
+# CONFIG_RESET_BRCMSTB is not set
 # CONFIG_RESET_RASPBERRYPI is not set
 # CONFIG_RESET_SIMPLE is not set
 # CONFIG_RESET_TI_SYSCON is not set
@@ -5052,6 +5065,7 @@ CONFIG_RESET_CONTROLLER=y
 # PHY drivers for Broadcom platforms
 #
 # CONFIG_BCM_KONA_USB2_PHY is not set
+# CONFIG_PHY_BRCM_USB is not set
 # end of PHY drivers for Broadcom platforms
 
 # CONFIG_PHY_CADENCE_TORRENT is not set
@@ -5169,7 +5183,7 @@ CONFIG_DNOTIFY=y
 CONFIG_INOTIFY_USER=y
 CONFIG_FANOTIFY=y
 # CONFIG_QUOTA is not set
-CONFIG_AUTOFS4_FS=y
+# CONFIG_AUTOFS4_FS is not set
 CONFIG_AUTOFS_FS=y
 CONFIG_FUSE_FS=m
 # CONFIG_CUSE is not set
diff --git a/projects/RPi/devices/RPi2/linux/linux.arm.conf b/projects/RPi/devices/RPi2/linux/linux.arm.conf
index 0d85078021..9ac3225cfa 100644
--- a/projects/RPi/devices/RPi2/linux/linux.arm.conf
+++ b/projects/RPi/devices/RPi2/linux/linux.arm.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm 6.1.38 Kernel Configuration
+# Linux/arm 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="arm-linux-gnueabihf-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]"
 CONFIG_CC_IS_GCC=y
@@ -608,6 +608,7 @@ CONFIG_GENERIC_IDLE_POLL_SETUP=y
 CONFIG_ARCH_HAS_FORTIFY_SOURCE=y
 CONFIG_ARCH_HAS_KEEPINITRD=y
 CONFIG_ARCH_HAS_SET_MEMORY=y
+CONFIG_ARCH_HAS_CPU_FINALIZE_INIT=y
 CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y
 CONFIG_ARCH_32BIT_OFF_T=y
 CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y
@@ -1247,8 +1248,6 @@ CONFIG_NET_CLS=y
 # CONFIG_NET_CLS_ROUTE4 is not set
 # CONFIG_NET_CLS_FW is not set
 # CONFIG_NET_CLS_U32 is not set
-# CONFIG_NET_CLS_RSVP is not set
-# CONFIG_NET_CLS_RSVP6 is not set
 # CONFIG_NET_CLS_FLOW is not set
 CONFIG_NET_CLS_CGROUP=m
 # CONFIG_NET_CLS_BPF is not set
@@ -1588,7 +1587,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
@@ -2168,6 +2167,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
 # CONFIG_INPUT_DRV260X_HAPTICS is not set
 # CONFIG_INPUT_DRV2665_HAPTICS is not set
 # CONFIG_INPUT_DRV2667_HAPTICS is not set
+# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set
 CONFIG_RMI4_CORE=y
 # CONFIG_RMI4_I2C is not set
 # CONFIG_RMI4_SPI is not set
@@ -2206,7 +2206,6 @@ CONFIG_SERIO_SERPORT=y
 CONFIG_BRCM_CHAR_DRIVERS=y
 CONFIG_BCM2708_VCMEM=y
 CONFIG_BCM_VCIO=y
-CONFIG_BCM2835_DEVGPIOMEM=m
 CONFIG_BCM2835_SMI_DEV=m
 # CONFIG_RPIVID_MEM is not set
 CONFIG_TTY=y
@@ -2291,6 +2290,7 @@ CONFIG_DEVMEM=y
 # CONFIG_XILLYUSB is not set
 CONFIG_RANDOM_TRUST_CPU=y
 CONFIG_RANDOM_TRUST_BOOTLOADER=y
+CONFIG_RASPBERRYPI_GPIOMEM=y
 # end of Character devices
 
 #
@@ -2439,6 +2439,8 @@ CONFIG_GENERIC_PINCONF=y
 # CONFIG_PINCTRL_SINGLE is not set
 # CONFIG_PINCTRL_STMFX is not set
 # CONFIG_PINCTRL_SX150X is not set
+# CONFIG_PINCTRL_RP1 is not set
+# CONFIG_PINCTRL_BCM2712 is not set
 CONFIG_PINCTRL_BCM2835=y
 
 #
@@ -2463,6 +2465,7 @@ CONFIG_GPIO_CDEV_V1=y
 # CONFIG_GPIO_ALTERA is not set
 CONFIG_GPIO_RASPBERRYPI_EXP=y
 CONFIG_GPIO_BCM_VIRT=y
+# CONFIG_GPIO_BRCMSTB is not set
 # CONFIG_GPIO_CADENCE is not set
 # CONFIG_GPIO_DWAPB is not set
 # CONFIG_GPIO_FTGPIO010 is not set
@@ -3113,11 +3116,14 @@ CONFIG_DVB_CORE=m
 # Video4Linux options
 #
 CONFIG_VIDEO_V4L2_I2C=y
+CONFIG_VIDEO_V4L2_SUBDEV_API=y
 # CONFIG_VIDEO_ADV_DEBUG is not set
 # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set
 CONFIG_VIDEO_TUNER=m
 CONFIG_V4L2_MEM2MEM_DEV=m
 # CONFIG_V4L2_FLASH_LED_CLASS is not set
+CONFIG_V4L2_FWNODE=m
+CONFIG_V4L2_ASYNC=m
 # end of Video4Linux options
 
 #
@@ -3306,6 +3312,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y
 # Qualcomm media platform drivers
 #
 
+#
+# Raspberry Pi media platform drivers
+#
+# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set
+# CONFIG_VIDEO_RP1_CFE is not set
+
 #
 # Renesas media platform drivers
 #
@@ -3377,10 +3389,7 @@ CONFIG_MEDIA_ATTACH=y
 # IR I2C driver auto-selected by 'Autoselect ancillary drivers'
 #
 CONFIG_VIDEO_IR_I2C=m
-
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_ARDUCAM_64MP is not set
 # CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set
@@ -3399,6 +3408,7 @@ CONFIG_VIDEO_IR_I2C=m
 # CONFIG_VIDEO_IMX335 is not set
 # CONFIG_VIDEO_IMX355 is not set
 # CONFIG_VIDEO_IMX412 is not set
+# CONFIG_VIDEO_IMX477 is not set
 # CONFIG_VIDEO_IMX519 is not set
 # CONFIG_VIDEO_IMX708 is not set
 # CONFIG_VIDEO_MT9M001 is not set
@@ -3453,7 +3463,6 @@ CONFIG_VIDEO_IR_I2C=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -3490,7 +3499,6 @@ CONFIG_VIDEO_MSP3400=m
 # CONFIG_VIDEO_TLV320AIC23B is not set
 # CONFIG_VIDEO_TVAUDIO is not set
 # CONFIG_VIDEO_UDA1342 is not set
-# CONFIG_VIDEO_IMX477 is not set
 # CONFIG_VIDEO_VP27SMPX is not set
 # CONFIG_VIDEO_WM8739 is not set
 CONFIG_VIDEO_WM8775=m
@@ -3982,6 +3990,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y
 # CONFIG_DRM_V3D is not set
 CONFIG_DRM_VC4=y
 CONFIG_DRM_VC4_HDMI_CEC=y
+# CONFIG_DRM_RP1_DSI is not set
+# CONFIG_DRM_RP1_DPI is not set
+# CONFIG_DRM_RP1_VEC is not set
 # CONFIG_DRM_ETNAVIV is not set
 # CONFIG_DRM_LOGICVC is not set
 # CONFIG_DRM_ARCPGU is not set
@@ -5260,6 +5271,7 @@ CONFIG_PWM_BCM2835=m
 # CONFIG_PWM_FSL_FTM is not set
 # CONFIG_PWM_PCA9685 is not set
 CONFIG_PWM_RASPBERRYPI_POE=m
+# CONFIG_PWM_RP1 is not set
 # CONFIG_PWM_XILINX is not set
 
 #
@@ -5269,12 +5281,14 @@ CONFIG_IRQCHIP=y
 CONFIG_ARM_GIC=y
 CONFIG_ARM_GIC_MAX_NR=1
 # CONFIG_AL_FIC is not set
+# CONFIG_BCM2712_MIP is not set
 CONFIG_BRCMSTB_L2_IRQ=y
 # CONFIG_XILINX_INTC is not set
 # end of IRQ chip support
 
 # CONFIG_IPACK_BUS is not set
 CONFIG_RESET_CONTROLLER=y
+# CONFIG_RESET_BRCMSTB is not set
 # CONFIG_RESET_RASPBERRYPI is not set
 # CONFIG_RESET_SIMPLE is not set
 # CONFIG_RESET_TI_SYSCON is not set
@@ -5290,6 +5304,7 @@ CONFIG_RESET_CONTROLLER=y
 # PHY drivers for Broadcom platforms
 #
 # CONFIG_BCM_KONA_USB2_PHY is not set
+# CONFIG_PHY_BRCM_USB is not set
 # end of PHY drivers for Broadcom platforms
 
 # CONFIG_PHY_CADENCE_TORRENT is not set
@@ -5411,7 +5426,7 @@ CONFIG_DNOTIFY=y
 CONFIG_INOTIFY_USER=y
 CONFIG_FANOTIFY=y
 # CONFIG_QUOTA is not set
-CONFIG_AUTOFS4_FS=y
+# CONFIG_AUTOFS4_FS is not set
 CONFIG_AUTOFS_FS=y
 CONFIG_FUSE_FS=m
 # CONFIG_CUSE is not set
diff --git a/projects/RPi/devices/RPi4/linux/linux.aarch64.conf b/projects/RPi/devices/RPi4/linux/linux.aarch64.conf
index 17bba563ef..6419a4e477 100644
--- a/projects/RPi/devices/RPi4/linux/linux.aarch64.conf
+++ b/projects/RPi/devices/RPi4/linux/linux.aarch64.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm64 6.1.38 Kernel Configuration
+# Linux/arm64 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="aarch64-linux-gnu-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]"
 CONFIG_CC_IS_GCC=y
@@ -327,6 +327,7 @@ CONFIG_ARCH_BCM2835=y
 #
 # ARM errata workarounds via the alternatives framework
 #
+CONFIG_AMPERE_ERRATUM_AC03_CPU_38=y
 CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y
 CONFIG_ARM64_ERRATUM_826319=y
 CONFIG_ARM64_ERRATUM_827319=y
@@ -357,6 +358,7 @@ CONFIG_ARM64_ERRATUM_2054223=y
 CONFIG_ARM64_ERRATUM_2067961=y
 CONFIG_ARM64_ERRATUM_2441009=y
 CONFIG_ARM64_ERRATUM_2457168=y
+CONFIG_ARM64_ERRATUM_2966298=y
 CONFIG_CAVIUM_ERRATUM_22375=y
 CONFIG_CAVIUM_ERRATUM_23154=y
 CONFIG_CAVIUM_ERRATUM_27456=y
@@ -1329,8 +1331,6 @@ CONFIG_NET_CLS=y
 # CONFIG_NET_CLS_ROUTE4 is not set
 # CONFIG_NET_CLS_FW is not set
 # CONFIG_NET_CLS_U32 is not set
-# CONFIG_NET_CLS_RSVP is not set
-# CONFIG_NET_CLS_RSVP6 is not set
 # CONFIG_NET_CLS_FLOW is not set
 CONFIG_NET_CLS_CGROUP=m
 # CONFIG_NET_CLS_BPF is not set
@@ -1798,7 +1798,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
@@ -2693,6 +2693,7 @@ CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
 # CONFIG_INPUT_DRV260X_HAPTICS is not set
 # CONFIG_INPUT_DRV2665_HAPTICS is not set
 # CONFIG_INPUT_DRV2667_HAPTICS is not set
+# CONFIG_INPUT_RASPBERRYPI_BUTTON is not set
 CONFIG_RMI4_CORE=y
 # CONFIG_RMI4_I2C is not set
 # CONFIG_RMI4_SPI is not set
@@ -2732,7 +2733,6 @@ CONFIG_SERIO_SERPORT=y
 CONFIG_BRCM_CHAR_DRIVERS=y
 CONFIG_BCM2708_VCMEM=y
 CONFIG_BCM_VCIO=y
-CONFIG_BCM2835_DEVGPIOMEM=m
 CONFIG_BCM2835_SMI_DEV=m
 # CONFIG_RPIVID_MEM is not set
 CONFIG_TTY=y
@@ -2826,6 +2826,7 @@ CONFIG_DEVPORT=y
 # CONFIG_XILLYUSB is not set
 CONFIG_RANDOM_TRUST_CPU=y
 CONFIG_RANDOM_TRUST_BOOTLOADER=y
+CONFIG_RASPBERRYPI_GPIOMEM=y
 # end of Character devices
 
 #
@@ -2999,6 +3000,8 @@ CONFIG_GENERIC_PINCONF=y
 # CONFIG_PINCTRL_SINGLE is not set
 # CONFIG_PINCTRL_STMFX is not set
 # CONFIG_PINCTRL_SX150X is not set
+# CONFIG_PINCTRL_RP1 is not set
+# CONFIG_PINCTRL_BCM2712 is not set
 CONFIG_PINCTRL_BCM2835=y
 
 #
@@ -3022,6 +3025,7 @@ CONFIG_GPIO_CDEV_V1=y
 # CONFIG_GPIO_ALTERA is not set
 CONFIG_GPIO_RASPBERRYPI_EXP=y
 CONFIG_GPIO_BCM_VIRT=y
+# CONFIG_GPIO_BRCMSTB is not set
 # CONFIG_GPIO_CADENCE is not set
 # CONFIG_GPIO_DWAPB is not set
 # CONFIG_GPIO_EXAR is not set
@@ -3561,6 +3565,7 @@ CONFIG_MFD_WM5102=y
 # CONFIG_MFD_QCOM_PM8008 is not set
 # CONFIG_RAVE_SP_CORE is not set
 # CONFIG_MFD_INTEL_M10_BMC is not set
+# CONFIG_MFD_RP1 is not set
 # CONFIG_MFD_RSMU_I2C is not set
 # CONFIG_MFD_RSMU_SPI is not set
 # end of Multifunction device drivers
@@ -3703,11 +3708,14 @@ CONFIG_DVB_CORE=m
 # Video4Linux options
 #
 CONFIG_VIDEO_V4L2_I2C=y
+CONFIG_VIDEO_V4L2_SUBDEV_API=y
 # CONFIG_VIDEO_ADV_DEBUG is not set
 # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set
 CONFIG_VIDEO_TUNER=m
 CONFIG_V4L2_MEM2MEM_DEV=m
 # CONFIG_V4L2_FLASH_LED_CLASS is not set
+CONFIG_V4L2_FWNODE=m
+CONFIG_V4L2_ASYNC=m
 # end of Video4Linux options
 
 #
@@ -3900,6 +3908,12 @@ CONFIG_MEDIA_PLATFORM_DRIVERS=y
 # Qualcomm media platform drivers
 #
 
+#
+# Raspberry Pi media platform drivers
+#
+# CONFIG_VIDEO_RASPBERRYPI_PISP_BE is not set
+# CONFIG_VIDEO_RP1_CFE is not set
+
 #
 # Renesas media platform drivers
 #
@@ -3971,10 +3985,7 @@ CONFIG_MEDIA_ATTACH=y
 # IR I2C driver auto-selected by 'Autoselect ancillary drivers'
 #
 CONFIG_VIDEO_IR_I2C=m
-
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_ARDUCAM_64MP is not set
 # CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set
@@ -3993,6 +4004,7 @@ CONFIG_VIDEO_IR_I2C=m
 # CONFIG_VIDEO_IMX335 is not set
 # CONFIG_VIDEO_IMX355 is not set
 # CONFIG_VIDEO_IMX412 is not set
+# CONFIG_VIDEO_IMX477 is not set
 # CONFIG_VIDEO_IMX519 is not set
 # CONFIG_VIDEO_IMX708 is not set
 # CONFIG_VIDEO_MT9M001 is not set
@@ -4047,7 +4059,6 @@ CONFIG_VIDEO_IR_I2C=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -4084,7 +4095,6 @@ CONFIG_VIDEO_MSP3400=m
 # CONFIG_VIDEO_TLV320AIC23B is not set
 # CONFIG_VIDEO_TVAUDIO is not set
 # CONFIG_VIDEO_UDA1342 is not set
-# CONFIG_VIDEO_IMX477 is not set
 # CONFIG_VIDEO_VP27SMPX is not set
 # CONFIG_VIDEO_WM8739 is not set
 CONFIG_VIDEO_WM8775=m
@@ -4577,6 +4587,9 @@ CONFIG_DRM_TOSHIBA_TC358762=y
 CONFIG_DRM_V3D=y
 CONFIG_DRM_VC4=y
 CONFIG_DRM_VC4_HDMI_CEC=y
+# CONFIG_DRM_RP1_DSI is not set
+# CONFIG_DRM_RP1_DPI is not set
+# CONFIG_DRM_RP1_VEC is not set
 # CONFIG_DRM_ETNAVIV is not set
 # CONFIG_DRM_HISI_HIBMC is not set
 # CONFIG_DRM_HISI_KIRIN is not set
@@ -5837,6 +5850,7 @@ CONFIG_COMMON_CLK=y
 
 # CONFIG_LMK04832 is not set
 # CONFIG_COMMON_CLK_MAX9485 is not set
+# CONFIG_COMMON_CLK_RP1 is not set
 CONFIG_COMMON_CLK_HIFIBERRY_DACPLUSHD=m
 CONFIG_COMMON_CLK_HIFIBERRY_DACPRO=m
 # CONFIG_COMMON_CLK_SI5341 is not set
@@ -5980,6 +5994,7 @@ CONFIG_PWM_BCM2835=m
 # CONFIG_PWM_FSL_FTM is not set
 # CONFIG_PWM_PCA9685 is not set
 CONFIG_PWM_RASPBERRYPI_POE=m
+# CONFIG_PWM_RP1 is not set
 # CONFIG_PWM_XILINX is not set
 
 #
@@ -5993,6 +6008,7 @@ CONFIG_ARM_GIC_V3=y
 CONFIG_ARM_GIC_V3_ITS=y
 CONFIG_ARM_GIC_V3_ITS_PCI=y
 # CONFIG_AL_FIC is not set
+# CONFIG_BCM2712_MIP is not set
 CONFIG_BRCMSTB_L2_IRQ=y
 # CONFIG_XILINX_INTC is not set
 CONFIG_PARTITION_PERCPU=y
@@ -6000,6 +6016,7 @@ CONFIG_PARTITION_PERCPU=y
 
 # CONFIG_IPACK_BUS is not set
 CONFIG_RESET_CONTROLLER=y
+# CONFIG_RESET_BRCMSTB is not set
 CONFIG_RESET_RASPBERRYPI=y
 CONFIG_RESET_SIMPLE=y
 # CONFIG_RESET_TI_SYSCON is not set
@@ -6016,6 +6033,7 @@ CONFIG_RESET_SIMPLE=y
 # PHY drivers for Broadcom platforms
 #
 # CONFIG_BCM_KONA_USB2_PHY is not set
+# CONFIG_PHY_BRCM_USB is not set
 # end of PHY drivers for Broadcom platforms
 
 # CONFIG_PHY_CADENCE_TORRENT is not set
@@ -6145,7 +6163,7 @@ CONFIG_DNOTIFY=y
 CONFIG_INOTIFY_USER=y
 CONFIG_FANOTIFY=y
 # CONFIG_QUOTA is not set
-CONFIG_AUTOFS4_FS=y
+# CONFIG_AUTOFS4_FS is not set
 CONFIG_AUTOFS_FS=y
 CONFIG_FUSE_FS=m
 # CONFIG_CUSE is not set
diff --git a/projects/RPi/devices/RPi5/config/config.txt b/projects/RPi/devices/RPi5/config/config.txt
new file mode 100644
index 0000000000..e678c92b83
--- /dev/null
+++ b/projects/RPi/devices/RPi5/config/config.txt
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (C) 2009-2014 Stephan Raue (stephan@openelec.tv)
+# Copyright (C) 2016-present Team LibreELEC (https://libreelec.tv)
+################################################################################
+# Bootloader configuration
+# config.txt version v1 (do not remove or change this line!)
+################################################################################
+# For more options and information see
+# http://rpf.io/configtxt
+################################################################################
+
+# Don't send initial active source message.
+# Avoids bringing a CEC-enabled TV out of standby and switching channels
+# when rebooting.
+hdmi_ignore_cec_init=1
+
+[all]
+################################################################################
+# Use distroconfig-composite.txt instead of distroconfig.txt to enable
+# composite video output.
+# The composite video mode needs to be configured in cmdline.txt:
+# For PAL add: video=Composite-1:720x576@50ie
+# For NTSC add: video=Composite-1:720x480@60ie
+################################################################################
+include distroconfig.txt
+#include distroconfig-composite.txt
+
+# uncomment to enable infrared remote receiver connected to GPIO 18
+#dtoverlay=gpio-ir,gpio_pin=18
+
diff --git a/projects/RPi/devices/RPi5/config/distroconfig-composite.txt b/projects/RPi/devices/RPi5/config/distroconfig-composite.txt
new file mode 100644
index 0000000000..26625be562
--- /dev/null
+++ b/projects/RPi/devices/RPi5/config/distroconfig-composite.txt
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (C) 2023-present Team LibreELEC (https://libreelec.tv)
+
+# WARNING: DO NOT EDIT THIS FILE - IT WILL BE OVERWRITTEN WHEN UPGRADING!
+arm_boost=1
+arm_64bit=1
+kernel=kernel.img
+display_auto_detect=1
+enable_tvout=1
+dtoverlay=vc4-kms-v3d,cma-512,composite=1
+dtoverlay=
+disable_overscan=1
+disable_fw_kms_setup=1
+max_framebuffers=0
diff --git a/projects/RPi/devices/RPi5/config/distroconfig.txt b/projects/RPi/devices/RPi5/config/distroconfig.txt
new file mode 100644
index 0000000000..10df6ca5e3
--- /dev/null
+++ b/projects/RPi/devices/RPi5/config/distroconfig.txt
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+# Copyright (C) 2019-present Team LibreELEC (https://libreelec.tv)
+
+# WARNING: DO NOT EDIT THIS FILE - IT WILL BE OVERWRITTEN WHEN UPGRADING!
+arm_boost=1
+arm_64bit=1
+kernel=kernel.img
+display_auto_detect=1
+dtoverlay=vc4-kms-v3d,cma-512
+dtoverlay=
+disable_overscan=1
+disable_fw_kms_setup=1
+max_framebuffers=0
diff --git a/projects/RPi/devices/RPi5/kodi/appliance.xml b/projects/RPi/devices/RPi5/kodi/appliance.xml
new file mode 100644
index 0000000000..60b5e99502
--- /dev/null
+++ b/projects/RPi/devices/RPi5/kodi/appliance.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="utf-8" ?>
+<settings version="1">
+
+  <section id="system">
+    <category id="audio">
+      <group id="1">
+        <setting id="audiooutput.audiodevice">
+          <default>ALSA:hdmi:CARD=vc4hdmi0,DEV=0</default>
+        </setting>
+      </group>
+    </category>
+  </section>
+
+</settings>
diff --git a/projects/RPi/devices/RPi5/linux/linux.aarch64.conf b/projects/RPi/devices/RPi5/linux/linux.aarch64.conf
new file mode 100644
index 0000000000..74562c1641
--- /dev/null
+++ b/projects/RPi/devices/RPi5/linux/linux.aarch64.conf
@@ -0,0 +1,7144 @@
+#
+# Automatically generated file; DO NOT EDIT.
+# Linux/arm64 6.1.57 Kernel Configuration
+#
+CONFIG_CC_VERSION_TEXT="aarch64-linux-gnu-gcc (GCC) 13.0.0 20220604 (experimental) [master revision aec868578d8515763d75693c1fdfbc30ff0a1e68]"
+CONFIG_CC_IS_GCC=y
+CONFIG_GCC_VERSION=130000
+CONFIG_CLANG_VERSION=0
+CONFIG_AS_IS_GNU=y
+CONFIG_AS_VERSION=23850
+CONFIG_LD_IS_BFD=y
+CONFIG_LD_VERSION=23850
+CONFIG_LLD_VERSION=0
+CONFIG_CC_CAN_LINK=y
+CONFIG_CC_CAN_LINK_STATIC=y
+CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y
+CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT=y
+CONFIG_CC_HAS_ASM_INLINE=y
+CONFIG_CC_HAS_NO_PROFILE_FN_ATTR=y
+CONFIG_PAHOLE_VERSION=0
+CONFIG_IRQ_WORK=y
+CONFIG_BUILDTIME_TABLE_SORT=y
+CONFIG_THREAD_INFO_IN_TASK=y
+
+#
+# General setup
+#
+CONFIG_INIT_ENV_ARG_LIMIT=32
+# CONFIG_COMPILE_TEST is not set
+# CONFIG_WERROR is not set
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_BUILD_SALT=""
+CONFIG_DEFAULT_INIT=""
+CONFIG_DEFAULT_HOSTNAME="@DISTRONAME@"
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+CONFIG_SYSVIPC_COMPAT=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
+# CONFIG_WATCH_QUEUE is not set
+CONFIG_CROSS_MEMORY_ATTACH=y
+# CONFIG_USELIB is not set
+# CONFIG_AUDIT is not set
+CONFIG_HAVE_ARCH_AUDITSYSCALL=y
+
+#
+# IRQ subsystem
+#
+CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_GENERIC_IRQ_SHOW=y
+CONFIG_GENERIC_IRQ_SHOW_LEVEL=y
+CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y
+CONFIG_HARDIRQS_SW_RESEND=y
+CONFIG_GENERIC_IRQ_CHIP=y
+CONFIG_IRQ_DOMAIN=y
+CONFIG_IRQ_DOMAIN_HIERARCHY=y
+CONFIG_GENERIC_IRQ_IPI=y
+CONFIG_GENERIC_MSI_IRQ=y
+CONFIG_GENERIC_MSI_IRQ_DOMAIN=y
+CONFIG_IRQ_MSI_IOMMU=y
+CONFIG_IRQ_FORCED_THREADING=y
+CONFIG_SPARSE_IRQ=y
+# CONFIG_GENERIC_IRQ_DEBUGFS is not set
+# end of IRQ subsystem
+
+CONFIG_GENERIC_TIME_VSYSCALL=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+CONFIG_ARCH_HAS_TICK_BROADCAST=y
+CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
+CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y
+CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y
+CONFIG_CONTEXT_TRACKING=y
+CONFIG_CONTEXT_TRACKING_IDLE=y
+
+#
+# Timers subsystem
+#
+CONFIG_TICK_ONESHOT=y
+CONFIG_NO_HZ_COMMON=y
+# CONFIG_HZ_PERIODIC is not set
+CONFIG_NO_HZ_IDLE=y
+# CONFIG_NO_HZ_FULL is not set
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+# end of Timers subsystem
+
+CONFIG_BPF=y
+CONFIG_HAVE_EBPF_JIT=y
+CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y
+
+#
+# BPF subsystem
+#
+CONFIG_BPF_SYSCALL=y
+# CONFIG_BPF_JIT is not set
+# CONFIG_BPF_UNPRIV_DEFAULT_OFF is not set
+# CONFIG_BPF_PRELOAD is not set
+# end of BPF subsystem
+
+CONFIG_PREEMPT_VOLUNTARY_BUILD=y
+# CONFIG_PREEMPT_NONE is not set
+CONFIG_PREEMPT_VOLUNTARY=y
+# CONFIG_PREEMPT is not set
+# CONFIG_PREEMPT_DYNAMIC is not set
+
+#
+# CPU/Task time and stats accounting
+#
+CONFIG_TICK_CPU_ACCOUNTING=y
+# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set
+# CONFIG_IRQ_TIME_ACCOUNTING is not set
+# CONFIG_BSD_PROCESS_ACCT is not set
+# CONFIG_TASKSTATS is not set
+# CONFIG_PSI is not set
+# end of CPU/Task time and stats accounting
+
+CONFIG_CPU_ISOLATION=y
+
+#
+# RCU Subsystem
+#
+CONFIG_TREE_RCU=y
+# CONFIG_RCU_EXPERT is not set
+CONFIG_SRCU=y
+CONFIG_TREE_SRCU=y
+CONFIG_TASKS_RCU_GENERIC=y
+CONFIG_TASKS_RUDE_RCU=y
+CONFIG_TASKS_TRACE_RCU=y
+CONFIG_RCU_STALL_COMMON=y
+CONFIG_RCU_NEED_SEGCBLIST=y
+# end of RCU Subsystem
+
+CONFIG_IKCONFIG=m
+CONFIG_IKCONFIG_PROC=y
+# CONFIG_IKHEADERS is not set
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
+CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13
+# CONFIG_PRINTK_INDEX is not set
+CONFIG_GENERIC_SCHED_CLOCK=y
+
+#
+# Scheduler features
+#
+# end of Scheduler features
+
+CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y
+CONFIG_CC_HAS_INT128=y
+CONFIG_CC_IMPLICIT_FALLTHROUGH="-Wimplicit-fallthrough=5"
+CONFIG_GCC11_NO_ARRAY_BOUNDS=y
+CONFIG_CC_NO_ARRAY_BOUNDS=y
+CONFIG_ARCH_SUPPORTS_INT128=y
+CONFIG_CGROUPS=y
+CONFIG_PAGE_COUNTER=y
+# CONFIG_CGROUP_FAVOR_DYNMODS is not set
+CONFIG_MEMCG=y
+CONFIG_MEMCG_KMEM=y
+CONFIG_BLK_CGROUP=y
+CONFIG_CGROUP_WRITEBACK=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_CFS_BANDWIDTH=y
+# CONFIG_RT_GROUP_SCHED is not set
+CONFIG_CGROUP_PIDS=y
+# CONFIG_CGROUP_RDMA is not set
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_PROC_PID_CPUSET=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_CGROUP_BPF=y
+# CONFIG_CGROUP_MISC is not set
+# CONFIG_CGROUP_DEBUG is not set
+CONFIG_SOCK_CGROUP_DATA=y
+CONFIG_NAMESPACES=y
+CONFIG_UTS_NS=y
+CONFIG_TIME_NS=y
+CONFIG_IPC_NS=y
+CONFIG_USER_NS=y
+CONFIG_PID_NS=y
+CONFIG_NET_NS=y
+# CONFIG_CHECKPOINT_RESTORE is not set
+# CONFIG_SCHED_AUTOGROUP is not set
+# CONFIG_SYSFS_DEPRECATED is not set
+CONFIG_RELAY=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE="@INITRAMFS_SOURCE@"
+CONFIG_INITRAMFS_ROOT_UID=0
+CONFIG_INITRAMFS_ROOT_GID=0
+# CONFIG_RD_GZIP is not set
+# CONFIG_RD_BZIP2 is not set
+# CONFIG_RD_LZMA is not set
+# CONFIG_RD_XZ is not set
+# CONFIG_RD_LZO is not set
+# CONFIG_RD_LZ4 is not set
+# CONFIG_RD_ZSTD is not set
+CONFIG_INITRAMFS_COMPRESSION_NONE=y
+CONFIG_BOOT_CONFIG=y
+# CONFIG_BOOT_CONFIG_EMBED is not set
+CONFIG_INITRAMFS_PRESERVE_MTIME=y
+CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y
+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
+CONFIG_LD_ORPHAN_WARN=y
+CONFIG_SYSCTL=y
+CONFIG_HAVE_UID16=y
+CONFIG_SYSCTL_EXCEPTION_TRACE=y
+CONFIG_EXPERT=y
+CONFIG_UID16=y
+CONFIG_MULTIUSER=y
+# CONFIG_SGETMASK_SYSCALL is not set
+# CONFIG_SYSFS_SYSCALL is not set
+CONFIG_FHANDLE=y
+CONFIG_POSIX_TIMERS=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_FUTEX_PI=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_IO_URING=y
+CONFIG_ADVISE_SYSCALLS=y
+CONFIG_MEMBARRIER=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_ALL is not set
+CONFIG_KALLSYMS_BASE_RELATIVE=y
+CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y
+CONFIG_KCMP=y
+CONFIG_RSEQ=y
+# CONFIG_DEBUG_RSEQ is not set
+CONFIG_EMBEDDED=y
+CONFIG_HAVE_PERF_EVENTS=y
+# CONFIG_PC104 is not set
+
+#
+# Kernel Performance Events And Counters
+#
+CONFIG_PERF_EVENTS=y
+# CONFIG_DEBUG_PERF_USE_VMALLOC is not set
+# end of Kernel Performance Events And Counters
+
+CONFIG_SYSTEM_DATA_VERIFICATION=y
+CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
+# end of General setup
+
+CONFIG_ARM64=y
+CONFIG_GCC_SUPPORTS_DYNAMIC_FTRACE_WITH_REGS=y
+CONFIG_64BIT=y
+CONFIG_MMU=y
+CONFIG_ARM64_PAGE_SHIFT=12
+CONFIG_ARM64_CONT_PTE_SHIFT=4
+CONFIG_ARM64_CONT_PMD_SHIFT=4
+CONFIG_ARCH_MMAP_RND_BITS_MIN=18
+CONFIG_ARCH_MMAP_RND_BITS_MAX=24
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=11
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_CSUM=y
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE=y
+CONFIG_SMP=y
+CONFIG_KERNEL_MODE_NEON=y
+CONFIG_FIX_EARLYCON_MEM=y
+CONFIG_PGTABLE_LEVELS=3
+CONFIG_ARCH_SUPPORTS_UPROBES=y
+CONFIG_ARCH_PROC_KCORE_TEXT=y
+
+#
+# Platform selection
+#
+# CONFIG_ARCH_ACTIONS is not set
+# CONFIG_ARCH_SUNXI is not set
+# CONFIG_ARCH_ALPINE is not set
+# CONFIG_ARCH_APPLE is not set
+CONFIG_ARCH_BCM=y
+CONFIG_ARCH_BCM2835=y
+# CONFIG_ARCH_BCM_IPROC is not set
+# CONFIG_ARCH_BCMBCA is not set
+CONFIG_ARCH_BRCMSTB=y
+# CONFIG_ARCH_BERLIN is not set
+# CONFIG_ARCH_BITMAIN is not set
+# CONFIG_ARCH_EXYNOS is not set
+# CONFIG_ARCH_SPARX5 is not set
+# CONFIG_ARCH_K3 is not set
+# CONFIG_ARCH_LG1K is not set
+# CONFIG_ARCH_HISI is not set
+# CONFIG_ARCH_KEEMBAY is not set
+# CONFIG_ARCH_MEDIATEK is not set
+# CONFIG_ARCH_MESON is not set
+# CONFIG_ARCH_MVEBU is not set
+# CONFIG_ARCH_NXP is not set
+# CONFIG_ARCH_NPCM is not set
+# CONFIG_ARCH_QCOM is not set
+# CONFIG_ARCH_REALTEK is not set
+# CONFIG_ARCH_RENESAS is not set
+# CONFIG_ARCH_ROCKCHIP is not set
+# CONFIG_ARCH_SEATTLE is not set
+# CONFIG_ARCH_INTEL_SOCFPGA is not set
+# CONFIG_ARCH_SYNQUACER is not set
+# CONFIG_ARCH_TEGRA is not set
+# CONFIG_ARCH_SPRD is not set
+# CONFIG_ARCH_THUNDER is not set
+# CONFIG_ARCH_THUNDER2 is not set
+# CONFIG_ARCH_UNIPHIER is not set
+# CONFIG_ARCH_VEXPRESS is not set
+# CONFIG_ARCH_VISCONTI is not set
+# CONFIG_ARCH_XGENE is not set
+# CONFIG_ARCH_ZYNQMP is not set
+# end of Platform selection
+
+#
+# Kernel Features
+#
+
+#
+# ARM errata workarounds via the alternatives framework
+#
+CONFIG_AMPERE_ERRATUM_AC03_CPU_38=y
+CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y
+CONFIG_ARM64_ERRATUM_826319=y
+CONFIG_ARM64_ERRATUM_827319=y
+CONFIG_ARM64_ERRATUM_824069=y
+CONFIG_ARM64_ERRATUM_819472=y
+CONFIG_ARM64_ERRATUM_832075=y
+CONFIG_ARM64_ERRATUM_1742098=y
+CONFIG_ARM64_ERRATUM_845719=y
+CONFIG_ARM64_ERRATUM_843419=y
+CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419=y
+CONFIG_ARM64_ERRATUM_1024718=y
+CONFIG_ARM64_ERRATUM_1418040=y
+CONFIG_ARM64_WORKAROUND_SPECULATIVE_AT=y
+CONFIG_ARM64_ERRATUM_1165522=y
+CONFIG_ARM64_ERRATUM_1319367=y
+CONFIG_ARM64_ERRATUM_1530923=y
+CONFIG_ARM64_WORKAROUND_REPEAT_TLBI=y
+CONFIG_ARM64_ERRATUM_2441007=y
+CONFIG_ARM64_ERRATUM_1286807=y
+CONFIG_ARM64_ERRATUM_1463225=y
+CONFIG_ARM64_ERRATUM_1542419=y
+CONFIG_ARM64_ERRATUM_1508412=y
+CONFIG_ARM64_ERRATUM_2051678=y
+CONFIG_ARM64_ERRATUM_2077057=y
+CONFIG_ARM64_ERRATUM_2658417=y
+CONFIG_ARM64_WORKAROUND_TSB_FLUSH_FAILURE=y
+CONFIG_ARM64_ERRATUM_2054223=y
+CONFIG_ARM64_ERRATUM_2067961=y
+CONFIG_ARM64_ERRATUM_2441009=y
+CONFIG_ARM64_ERRATUM_2457168=y
+CONFIG_ARM64_ERRATUM_2966298=y
+CONFIG_CAVIUM_ERRATUM_22375=y
+CONFIG_CAVIUM_ERRATUM_23154=y
+CONFIG_CAVIUM_ERRATUM_27456=y
+CONFIG_CAVIUM_ERRATUM_30115=y
+CONFIG_CAVIUM_TX2_ERRATUM_219=y
+CONFIG_FUJITSU_ERRATUM_010001=y
+CONFIG_HISILICON_ERRATUM_161600802=y
+CONFIG_QCOM_FALKOR_ERRATUM_1003=y
+CONFIG_QCOM_FALKOR_ERRATUM_1009=y
+CONFIG_QCOM_QDF2400_ERRATUM_0065=y
+CONFIG_QCOM_FALKOR_ERRATUM_E1041=y
+CONFIG_NVIDIA_CARMEL_CNP_ERRATUM=y
+CONFIG_SOCIONEXT_SYNQUACER_PREITS=y
+# end of ARM errata workarounds via the alternatives framework
+
+CONFIG_ARM64_4K_PAGES=y
+# CONFIG_ARM64_16K_PAGES is not set
+# CONFIG_ARM64_64K_PAGES is not set
+CONFIG_ARM64_VA_BITS_39=y
+# CONFIG_ARM64_VA_BITS_48 is not set
+CONFIG_ARM64_VA_BITS=39
+CONFIG_ARM64_PA_BITS_48=y
+CONFIG_ARM64_PA_BITS=48
+# CONFIG_CPU_BIG_ENDIAN is not set
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_SCHED_MC is not set
+# CONFIG_SCHED_CLUSTER is not set
+# CONFIG_SCHED_SMT is not set
+CONFIG_NR_CPUS=256
+# CONFIG_HOTPLUG_CPU is not set
+# CONFIG_NUMA is not set
+# CONFIG_HZ_100 is not set
+# CONFIG_HZ_250 is not set
+CONFIG_HZ_300=y
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=300
+CONFIG_SCHED_HRTICK=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_HW_PERF_EVENTS=y
+CONFIG_CC_HAVE_SHADOW_CALL_STACK=y
+# CONFIG_PARAVIRT is not set
+# CONFIG_PARAVIRT_TIME_ACCOUNTING is not set
+# CONFIG_KEXEC_FILE is not set
+# CONFIG_CRASH_DUMP is not set
+# CONFIG_XEN is not set
+CONFIG_ARCH_FORCE_MAX_ORDER=11
+CONFIG_UNMAP_KERNEL_AT_EL0=y
+CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY=y
+CONFIG_RODATA_FULL_DEFAULT_ENABLED=y
+# CONFIG_ARM64_SW_TTBR0_PAN is not set
+CONFIG_ARM64_TAGGED_ADDR_ABI=y
+CONFIG_COMPAT=y
+CONFIG_KUSER_HELPERS=y
+# CONFIG_COMPAT_ALIGNMENT_FIXUPS is not set
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_SWP_EMULATION=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_SETEND_EMULATION=y
+
+#
+# ARMv8.1 architectural features
+#
+CONFIG_ARM64_HW_AFDBM=y
+CONFIG_ARM64_PAN=y
+CONFIG_AS_HAS_LDAPR=y
+CONFIG_AS_HAS_LSE_ATOMICS=y
+CONFIG_ARM64_LSE_ATOMICS=y
+CONFIG_ARM64_USE_LSE_ATOMICS=y
+# end of ARMv8.1 architectural features
+
+#
+# ARMv8.2 architectural features
+#
+CONFIG_AS_HAS_ARMV8_2=y
+CONFIG_AS_HAS_SHA3=y
+# CONFIG_ARM64_PMEM is not set
+CONFIG_ARM64_RAS_EXTN=y
+CONFIG_ARM64_CNP=y
+# end of ARMv8.2 architectural features
+
+#
+# ARMv8.3 architectural features
+#
+CONFIG_ARM64_PTR_AUTH=y
+CONFIG_ARM64_PTR_AUTH_KERNEL=y
+CONFIG_CC_HAS_BRANCH_PROT_PAC_RET=y
+CONFIG_CC_HAS_SIGN_RETURN_ADDRESS=y
+CONFIG_AS_HAS_PAC=y
+CONFIG_AS_HAS_CFI_NEGATE_RA_STATE=y
+# end of ARMv8.3 architectural features
+
+#
+# ARMv8.4 architectural features
+#
+CONFIG_ARM64_AMU_EXTN=y
+CONFIG_AS_HAS_ARMV8_4=y
+CONFIG_ARM64_TLB_RANGE=y
+# end of ARMv8.4 architectural features
+
+#
+# ARMv8.5 architectural features
+#
+CONFIG_AS_HAS_ARMV8_5=y
+CONFIG_ARM64_BTI=y
+CONFIG_CC_HAS_BRANCH_PROT_PAC_RET_BTI=y
+CONFIG_ARM64_E0PD=y
+CONFIG_ARM64_AS_HAS_MTE=y
+CONFIG_ARM64_MTE=y
+# end of ARMv8.5 architectural features
+
+#
+# ARMv8.7 architectural features
+#
+CONFIG_ARM64_EPAN=y
+# end of ARMv8.7 architectural features
+
+CONFIG_ARM64_SVE=y
+CONFIG_ARM64_SME=y
+CONFIG_ARM64_MODULE_PLTS=y
+# CONFIG_ARM64_PSEUDO_NMI is not set
+CONFIG_RELOCATABLE=y
+# CONFIG_RANDOMIZE_BASE is not set
+CONFIG_CC_HAVE_STACKPROTECTOR_SYSREG=y
+CONFIG_STACKPROTECTOR_PER_TASK=y
+CONFIG_ARCH_NR_GPIO=0
+# end of Kernel Features
+
+#
+# Boot options
+#
+CONFIG_CMDLINE=""
+CONFIG_EFI_STUB=y
+CONFIG_EFI=y
+CONFIG_DMI=y
+# end of Boot options
+
+#
+# Power management options
+#
+# CONFIG_SUSPEND is not set
+# CONFIG_HIBERNATION is not set
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+CONFIG_PM_CLK=y
+CONFIG_PM_GENERIC_DOMAINS=y
+# CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set
+CONFIG_PM_GENERIC_DOMAINS_OF=y
+CONFIG_CPU_PM=y
+# CONFIG_ENERGY_MODEL is not set
+CONFIG_ARCH_HIBERNATION_POSSIBLE=y
+CONFIG_ARCH_SUSPEND_POSSIBLE=y
+# end of Power management options
+
+#
+# CPU Power Management
+#
+
+#
+# CPU Idle
+#
+CONFIG_CPU_IDLE=y
+# CONFIG_CPU_IDLE_GOV_LADDER is not set
+CONFIG_CPU_IDLE_GOV_MENU=y
+# CONFIG_CPU_IDLE_GOV_TEO is not set
+
+#
+# ARM CPU Idle Drivers
+#
+# CONFIG_ARM_PSCI_CPUIDLE is not set
+# end of ARM CPU Idle Drivers
+# end of CPU Idle
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_GOV_ATTR_SET=y
+CONFIG_CPU_FREQ_GOV_COMMON=y
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+# CONFIG_CPU_FREQ_GOV_USERSPACE is not set
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+# CONFIG_CPU_FREQ_GOV_SCHEDUTIL is not set
+
+#
+# CPU frequency scaling drivers
+#
+CONFIG_CPUFREQ_DT=y
+CONFIG_CPUFREQ_DT_PLATDEV=y
+CONFIG_ARM_BRCMSTB_AVS_CPUFREQ=y
+CONFIG_ARM_RASPBERRYPI_CPUFREQ=y
+# end of CPU Frequency scaling
+# end of CPU Power Management
+
+CONFIG_ARCH_SUPPORTS_ACPI=y
+# CONFIG_ACPI is not set
+CONFIG_HAVE_KVM=y
+# CONFIG_VIRTUALIZATION is not set
+
+#
+# General architecture-dependent options
+#
+CONFIG_ARCH_HAS_SUBPAGE_FAULTS=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+# CONFIG_STATIC_KEYS_SELFTEST is not set
+CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
+CONFIG_KRETPROBES=y
+CONFIG_HAVE_IOREMAP_PROT=y
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
+CONFIG_ARCH_CORRECT_STACKTRACE_ON_KRETPROBE=y
+CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y
+CONFIG_HAVE_NMI=y
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_CONTIGUOUS=y
+CONFIG_GENERIC_SMP_IDLE_THREAD=y
+CONFIG_GENERIC_IDLE_POLL_SETUP=y
+CONFIG_ARCH_HAS_FORTIFY_SOURCE=y
+CONFIG_ARCH_HAS_KEEPINITRD=y
+CONFIG_ARCH_HAS_SET_MEMORY=y
+CONFIG_ARCH_HAS_SET_DIRECT_MAP=y
+CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y
+CONFIG_ARCH_WANTS_NO_INSTR=y
+CONFIG_HAVE_ASM_MODVERSIONS=y
+CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y
+CONFIG_HAVE_RSEQ=y
+CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y
+CONFIG_HAVE_HW_BREAKPOINT=y
+CONFIG_HAVE_PERF_REGS=y
+CONFIG_HAVE_PERF_USER_STACK_DUMP=y
+CONFIG_HAVE_ARCH_JUMP_LABEL=y
+CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y
+CONFIG_MMU_GATHER_TABLE_FREE=y
+CONFIG_MMU_GATHER_RCU_TABLE_FREE=y
+CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y
+CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y
+CONFIG_HAVE_CMPXCHG_LOCAL=y
+CONFIG_HAVE_CMPXCHG_DOUBLE=y
+CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y
+CONFIG_HAVE_ARCH_SECCOMP=y
+CONFIG_HAVE_ARCH_SECCOMP_FILTER=y
+CONFIG_SECCOMP=y
+CONFIG_SECCOMP_FILTER=y
+# CONFIG_SECCOMP_CACHE_DEBUG is not set
+CONFIG_HAVE_ARCH_STACKLEAK=y
+CONFIG_HAVE_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR=y
+CONFIG_STACKPROTECTOR_STRONG=y
+CONFIG_ARCH_SUPPORTS_SHADOW_CALL_STACK=y
+# CONFIG_SHADOW_CALL_STACK is not set
+CONFIG_ARCH_SUPPORTS_LTO_CLANG=y
+CONFIG_ARCH_SUPPORTS_LTO_CLANG_THIN=y
+CONFIG_LTO_NONE=y
+CONFIG_ARCH_SUPPORTS_CFI_CLANG=y
+CONFIG_HAVE_CONTEXT_TRACKING_USER=y
+CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y
+CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y
+CONFIG_HAVE_MOVE_PUD=y
+CONFIG_HAVE_MOVE_PMD=y
+CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y
+CONFIG_HAVE_ARCH_HUGE_VMAP=y
+CONFIG_HAVE_ARCH_HUGE_VMALLOC=y
+CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y
+CONFIG_HAVE_MOD_ARCH_SPECIFIC=y
+CONFIG_MODULES_USE_ELF_RELA=y
+CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK=y
+CONFIG_SOFTIRQ_ON_OWN_STACK=y
+CONFIG_ARCH_HAS_ELF_RANDOMIZE=y
+CONFIG_HAVE_ARCH_MMAP_RND_BITS=y
+CONFIG_ARCH_MMAP_RND_BITS=18
+CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=11
+CONFIG_PAGE_SIZE_LESS_THAN_64KB=y
+CONFIG_PAGE_SIZE_LESS_THAN_256KB=y
+CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT=y
+CONFIG_CLONE_BACKWARDS=y
+CONFIG_OLD_SIGSUSPEND3=y
+CONFIG_COMPAT_OLD_SIGACTION=y
+CONFIG_COMPAT_32BIT_TIME=y
+CONFIG_HAVE_ARCH_VMAP_STACK=y
+CONFIG_VMAP_STACK=y
+CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET=y
+CONFIG_RANDOMIZE_KSTACK_OFFSET=y
+# CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT is not set
+CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y
+CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y
+CONFIG_STRICT_MODULE_RWX=y
+CONFIG_HAVE_ARCH_COMPILER_H=y
+CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y
+CONFIG_ARCH_USE_MEMREMAP_PROT=y
+# CONFIG_LOCK_EVENT_COUNTS is not set
+CONFIG_ARCH_HAS_RELR=y
+CONFIG_HAVE_PREEMPT_DYNAMIC=y
+CONFIG_HAVE_PREEMPT_DYNAMIC_KEY=y
+CONFIG_ARCH_WANT_LD_ORPHAN_WARN=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
+CONFIG_ARCH_SUPPORTS_PAGE_TABLE_CHECK=y
+CONFIG_ARCH_HAVE_TRACE_MMIO_ACCESS=y
+
+#
+# GCOV-based kernel profiling
+#
+# CONFIG_GCOV_KERNEL is not set
+CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y
+# end of GCOV-based kernel profiling
+
+CONFIG_HAVE_GCC_PLUGINS=y
+CONFIG_GCC_PLUGINS=y
+# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set
+# end of General architecture-dependent options
+
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+CONFIG_MODULES=y
+# CONFIG_MODULE_FORCE_LOAD is not set
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+# CONFIG_MODULE_UNLOAD_TAINT_TRACKING is not set
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+# CONFIG_MODULE_SIG is not set
+CONFIG_MODULE_COMPRESS_NONE=y
+# CONFIG_MODULE_COMPRESS_GZIP is not set
+# CONFIG_MODULE_COMPRESS_XZ is not set
+# CONFIG_MODULE_COMPRESS_ZSTD is not set
+# CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set
+CONFIG_MODPROBE_PATH="/sbin/modprobe"
+# CONFIG_TRIM_UNUSED_KSYMS is not set
+CONFIG_MODULES_TREE_LOOKUP=y
+CONFIG_BLOCK=y
+CONFIG_BLOCK_LEGACY_AUTOLOAD=y
+CONFIG_BLK_CGROUP_RWSTAT=y
+CONFIG_BLK_DEV_BSG_COMMON=y
+CONFIG_BLK_ICQ=y
+CONFIG_BLK_DEV_BSGLIB=y
+# CONFIG_BLK_DEV_INTEGRITY is not set
+# CONFIG_BLK_DEV_ZONED is not set
+CONFIG_BLK_DEV_THROTTLING=y
+# CONFIG_BLK_DEV_THROTTLING_LOW is not set
+# CONFIG_BLK_WBT is not set
+# CONFIG_BLK_CGROUP_IOLATENCY is not set
+# CONFIG_BLK_CGROUP_IOCOST is not set
+# CONFIG_BLK_CGROUP_IOPRIO is not set
+CONFIG_BLK_DEBUG_FS=y
+# CONFIG_BLK_SED_OPAL is not set
+# CONFIG_BLK_INLINE_ENCRYPTION is not set
+
+#
+# Partition Types
+#
+CONFIG_PARTITION_ADVANCED=y
+# CONFIG_ACORN_PARTITION is not set
+# CONFIG_AIX_PARTITION is not set
+# CONFIG_OSF_PARTITION is not set
+# CONFIG_AMIGA_PARTITION is not set
+# CONFIG_ATARI_PARTITION is not set
+CONFIG_MAC_PARTITION=y
+CONFIG_MSDOS_PARTITION=y
+# CONFIG_BSD_DISKLABEL is not set
+# CONFIG_MINIX_SUBPARTITION is not set
+# CONFIG_SOLARIS_X86_PARTITION is not set
+# CONFIG_UNIXWARE_DISKLABEL is not set
+CONFIG_LDM_PARTITION=y
+# CONFIG_LDM_DEBUG is not set
+# CONFIG_SGI_PARTITION is not set
+# CONFIG_ULTRIX_PARTITION is not set
+# CONFIG_SUN_PARTITION is not set
+# CONFIG_KARMA_PARTITION is not set
+CONFIG_EFI_PARTITION=y
+# CONFIG_SYSV68_PARTITION is not set
+# CONFIG_CMDLINE_PARTITION is not set
+# end of Partition Types
+
+CONFIG_BLOCK_COMPAT=y
+CONFIG_BLK_MQ_PCI=y
+CONFIG_BLK_PM=y
+CONFIG_BLOCK_HOLDER_DEPRECATED=y
+CONFIG_BLK_MQ_STACKING=y
+
+#
+# IO Schedulers
+#
+CONFIG_MQ_IOSCHED_DEADLINE=y
+CONFIG_MQ_IOSCHED_KYBER=y
+CONFIG_IOSCHED_BFQ=y
+# CONFIG_BFQ_GROUP_IOSCHED is not set
+# end of IO Schedulers
+
+CONFIG_ASN1=y
+CONFIG_ARCH_INLINE_SPIN_TRYLOCK=y
+CONFIG_ARCH_INLINE_SPIN_TRYLOCK_BH=y
+CONFIG_ARCH_INLINE_SPIN_LOCK=y
+CONFIG_ARCH_INLINE_SPIN_LOCK_BH=y
+CONFIG_ARCH_INLINE_SPIN_LOCK_IRQ=y
+CONFIG_ARCH_INLINE_SPIN_LOCK_IRQSAVE=y
+CONFIG_ARCH_INLINE_SPIN_UNLOCK=y
+CONFIG_ARCH_INLINE_SPIN_UNLOCK_BH=y
+CONFIG_ARCH_INLINE_SPIN_UNLOCK_IRQ=y
+CONFIG_ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE=y
+CONFIG_ARCH_INLINE_READ_LOCK=y
+CONFIG_ARCH_INLINE_READ_LOCK_BH=y
+CONFIG_ARCH_INLINE_READ_LOCK_IRQ=y
+CONFIG_ARCH_INLINE_READ_LOCK_IRQSAVE=y
+CONFIG_ARCH_INLINE_READ_UNLOCK=y
+CONFIG_ARCH_INLINE_READ_UNLOCK_BH=y
+CONFIG_ARCH_INLINE_READ_UNLOCK_IRQ=y
+CONFIG_ARCH_INLINE_READ_UNLOCK_IRQRESTORE=y
+CONFIG_ARCH_INLINE_WRITE_LOCK=y
+CONFIG_ARCH_INLINE_WRITE_LOCK_BH=y
+CONFIG_ARCH_INLINE_WRITE_LOCK_IRQ=y
+CONFIG_ARCH_INLINE_WRITE_LOCK_IRQSAVE=y
+CONFIG_ARCH_INLINE_WRITE_UNLOCK=y
+CONFIG_ARCH_INLINE_WRITE_UNLOCK_BH=y
+CONFIG_ARCH_INLINE_WRITE_UNLOCK_IRQ=y
+CONFIG_ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE=y
+CONFIG_INLINE_SPIN_TRYLOCK=y
+CONFIG_INLINE_SPIN_TRYLOCK_BH=y
+CONFIG_INLINE_SPIN_LOCK=y
+CONFIG_INLINE_SPIN_LOCK_BH=y
+CONFIG_INLINE_SPIN_LOCK_IRQ=y
+CONFIG_INLINE_SPIN_LOCK_IRQSAVE=y
+CONFIG_INLINE_SPIN_UNLOCK_BH=y
+CONFIG_INLINE_SPIN_UNLOCK_IRQ=y
+CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE=y
+CONFIG_INLINE_READ_LOCK=y
+CONFIG_INLINE_READ_LOCK_BH=y
+CONFIG_INLINE_READ_LOCK_IRQ=y
+CONFIG_INLINE_READ_LOCK_IRQSAVE=y
+CONFIG_INLINE_READ_UNLOCK=y
+CONFIG_INLINE_READ_UNLOCK_BH=y
+CONFIG_INLINE_READ_UNLOCK_IRQ=y
+CONFIG_INLINE_READ_UNLOCK_IRQRESTORE=y
+CONFIG_INLINE_WRITE_LOCK=y
+CONFIG_INLINE_WRITE_LOCK_BH=y
+CONFIG_INLINE_WRITE_LOCK_IRQ=y
+CONFIG_INLINE_WRITE_LOCK_IRQSAVE=y
+CONFIG_INLINE_WRITE_UNLOCK=y
+CONFIG_INLINE_WRITE_UNLOCK_BH=y
+CONFIG_INLINE_WRITE_UNLOCK_IRQ=y
+CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE=y
+CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y
+CONFIG_MUTEX_SPIN_ON_OWNER=y
+CONFIG_RWSEM_SPIN_ON_OWNER=y
+CONFIG_LOCK_SPIN_ON_OWNER=y
+CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y
+CONFIG_QUEUED_SPINLOCKS=y
+CONFIG_ARCH_USE_QUEUED_RWLOCKS=y
+CONFIG_QUEUED_RWLOCKS=y
+CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y
+CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y
+CONFIG_FREEZER=y
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+CONFIG_COMPAT_BINFMT_ELF=y
+CONFIG_ARCH_BINFMT_ELF_STATE=y
+CONFIG_ARCH_BINFMT_ELF_EXTRA_PHDRS=y
+CONFIG_ARCH_HAVE_ELF_PROT=y
+CONFIG_ARCH_USE_GNU_PROPERTY=y
+CONFIG_ELFCORE=y
+CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y
+CONFIG_BINFMT_SCRIPT=y
+# CONFIG_BINFMT_MISC is not set
+CONFIG_COREDUMP=y
+# end of Executable file formats
+
+#
+# Memory Management options
+#
+CONFIG_SWAP=y
+# CONFIG_ZSWAP is not set
+
+#
+# SLAB allocator options
+#
+# CONFIG_SLAB is not set
+CONFIG_SLUB=y
+# CONFIG_SLOB is not set
+CONFIG_SLAB_MERGE_DEFAULT=y
+# CONFIG_SLAB_FREELIST_RANDOM is not set
+# CONFIG_SLAB_FREELIST_HARDENED is not set
+# CONFIG_SLUB_STATS is not set
+CONFIG_SLUB_CPU_PARTIAL=y
+# end of SLAB allocator options
+
+# CONFIG_SHUFFLE_PAGE_ALLOCATOR is not set
+# CONFIG_COMPAT_BRK is not set
+CONFIG_SPARSEMEM=y
+CONFIG_SPARSEMEM_EXTREME=y
+CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y
+CONFIG_SPARSEMEM_VMEMMAP=y
+CONFIG_HAVE_FAST_GUP=y
+CONFIG_ARCH_KEEP_MEMBLOCK=y
+CONFIG_MEMORY_ISOLATION=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
+# CONFIG_MEMORY_HOTPLUG is not set
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y
+CONFIG_COMPACTION=y
+CONFIG_COMPACT_UNEVICTABLE_DEFAULT=1
+# CONFIG_PAGE_REPORTING is not set
+CONFIG_MIGRATION=y
+CONFIG_CONTIG_ALLOC=y
+CONFIG_PHYS_ADDR_T_64BIT=y
+# CONFIG_KSM is not set
+CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
+CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y
+# CONFIG_MEMORY_FAILURE is not set
+CONFIG_ARCH_WANTS_THP_SWAP=y
+# CONFIG_TRANSPARENT_HUGEPAGE is not set
+CONFIG_CMA=y
+# CONFIG_CMA_DEBUG is not set
+CONFIG_CMA_DEBUGFS=y
+# CONFIG_CMA_SYSFS is not set
+CONFIG_CMA_AREAS=7
+CONFIG_GENERIC_EARLY_IOREMAP=y
+# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set
+# CONFIG_IDLE_PAGE_TRACKING is not set
+CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
+CONFIG_ARCH_HAS_CURRENT_STACK_POINTER=y
+CONFIG_ARCH_HAS_PTE_DEVMAP=y
+CONFIG_ARCH_HAS_ZONE_DMA_SET=y
+CONFIG_ZONE_DMA=y
+CONFIG_ZONE_DMA32=y
+CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y
+CONFIG_VM_EVENT_COUNTERS=y
+# CONFIG_PERCPU_STATS is not set
+# CONFIG_GUP_TEST is not set
+CONFIG_ARCH_HAS_PTE_SPECIAL=y
+# CONFIG_ANON_VMA_NAME is not set
+# CONFIG_USERFAULTFD is not set
+# CONFIG_LRU_GEN is not set
+CONFIG_LOCK_MM_AND_FIND_VMA=y
+
+#
+# Data Access Monitoring
+#
+# CONFIG_DAMON is not set
+# end of Data Access Monitoring
+# end of Memory Management options
+
+CONFIG_NET=y
+CONFIG_COMPAT_NETLINK_MESSAGES=y
+CONFIG_NET_EGRESS=y
+CONFIG_SKB_EXTENSIONS=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+# CONFIG_PACKET_DIAG is not set
+CONFIG_UNIX=y
+CONFIG_UNIX_SCM=y
+CONFIG_AF_UNIX_OOB=y
+# CONFIG_UNIX_DIAG is not set
+# CONFIG_TLS is not set
+CONFIG_XFRM=y
+CONFIG_XFRM_ALGO=y
+CONFIG_XFRM_USER=y
+# CONFIG_XFRM_INTERFACE is not set
+# CONFIG_XFRM_SUB_POLICY is not set
+# CONFIG_XFRM_MIGRATE is not set
+# CONFIG_XFRM_STATISTICS is not set
+CONFIG_XFRM_ESP=y
+# CONFIG_NET_KEY is not set
+# CONFIG_XDP_SOCKETS is not set
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+# CONFIG_IP_FIB_TRIE_STATS is not set
+CONFIG_IP_MULTIPLE_TABLES=y
+# CONFIG_IP_ROUTE_MULTIPATH is not set
+# CONFIG_IP_ROUTE_VERBOSE is not set
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+# CONFIG_IP_PNP_RARP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE_DEMUX is not set
+CONFIG_NET_IP_TUNNEL=m
+CONFIG_IP_MROUTE_COMMON=y
+CONFIG_IP_MROUTE=y
+# CONFIG_IP_MROUTE_MULTIPLE_TABLES is not set
+# CONFIG_IP_PIMSM_V1 is not set
+# CONFIG_IP_PIMSM_V2 is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_NET_IPVTI is not set
+CONFIG_NET_UDP_TUNNEL=m
+CONFIG_NET_FOU=m
+# CONFIG_NET_FOU_IP_TUNNELS is not set
+# CONFIG_INET_AH is not set
+CONFIG_INET_ESP=y
+# CONFIG_INET_ESP_OFFLOAD is not set
+# CONFIG_INET_ESPINTCP is not set
+# CONFIG_INET_IPCOMP is not set
+CONFIG_INET_TABLE_PERTURB_ORDER=16
+CONFIG_INET_TUNNEL=m
+# CONFIG_INET_DIAG is not set
+CONFIG_TCP_CONG_ADVANCED=y
+# CONFIG_TCP_CONG_BIC is not set
+CONFIG_TCP_CONG_CUBIC=y
+# CONFIG_TCP_CONG_WESTWOOD is not set
+CONFIG_TCP_CONG_HTCP=m
+CONFIG_TCP_CONG_HSTCP=m
+# CONFIG_TCP_CONG_HYBLA is not set
+CONFIG_TCP_CONG_VEGAS=m
+# CONFIG_TCP_CONG_NV is not set
+CONFIG_TCP_CONG_SCALABLE=m
+# CONFIG_TCP_CONG_LP is not set
+CONFIG_TCP_CONG_VENO=m
+CONFIG_TCP_CONG_YEAH=m
+CONFIG_TCP_CONG_ILLINOIS=m
+# CONFIG_TCP_CONG_DCTCP is not set
+CONFIG_TCP_CONG_CDG=m
+# CONFIG_TCP_CONG_BBR is not set
+CONFIG_DEFAULT_CUBIC=y
+# CONFIG_DEFAULT_RENO is not set
+CONFIG_DEFAULT_TCP_CONG="cubic"
+# CONFIG_TCP_MD5SIG is not set
+CONFIG_IPV6=y
+# CONFIG_IPV6_ROUTER_PREF is not set
+# CONFIG_IPV6_OPTIMISTIC_DAD is not set
+# CONFIG_INET6_AH is not set
+# CONFIG_INET6_ESP is not set
+# CONFIG_INET6_IPCOMP is not set
+# CONFIG_IPV6_MIP6 is not set
+# CONFIG_IPV6_ILA is not set
+# CONFIG_IPV6_VTI is not set
+CONFIG_IPV6_SIT=m
+# CONFIG_IPV6_SIT_6RD is not set
+CONFIG_IPV6_NDISC_NODETYPE=y
+# CONFIG_IPV6_TUNNEL is not set
+CONFIG_IPV6_FOU=m
+# CONFIG_IPV6_MULTIPLE_TABLES is not set
+# CONFIG_IPV6_MROUTE is not set
+# CONFIG_IPV6_SEG6_LWTUNNEL is not set
+# CONFIG_IPV6_SEG6_HMAC is not set
+# CONFIG_IPV6_RPL_LWTUNNEL is not set
+# CONFIG_IPV6_IOAM6_LWTUNNEL is not set
+# CONFIG_MPTCP is not set
+# CONFIG_NETWORK_SECMARK is not set
+# CONFIG_NETWORK_PHY_TIMESTAMPING is not set
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_ADVANCED=y
+CONFIG_BRIDGE_NETFILTER=m
+
+#
+# Core Netfilter Configuration
+#
+# CONFIG_NETFILTER_INGRESS is not set
+CONFIG_NETFILTER_EGRESS=y
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_FAMILY_BRIDGE=y
+# CONFIG_NETFILTER_NETLINK_ACCT is not set
+# CONFIG_NETFILTER_NETLINK_QUEUE is not set
+CONFIG_NETFILTER_NETLINK_LOG=m
+# CONFIG_NETFILTER_NETLINK_OSF is not set
+CONFIG_NF_CONNTRACK=m
+# CONFIG_NF_LOG_SYSLOG is not set
+# CONFIG_NF_CONNTRACK_MARK is not set
+# CONFIG_NF_CONNTRACK_ZONES is not set
+# CONFIG_NF_CONNTRACK_PROCFS is not set
+# CONFIG_NF_CONNTRACK_EVENTS is not set
+# CONFIG_NF_CONNTRACK_TIMEOUT is not set
+# CONFIG_NF_CONNTRACK_TIMESTAMP is not set
+# CONFIG_NF_CONNTRACK_LABELS is not set
+# CONFIG_NF_CT_PROTO_DCCP is not set
+# CONFIG_NF_CT_PROTO_SCTP is not set
+# CONFIG_NF_CT_PROTO_UDPLITE is not set
+# CONFIG_NF_CONNTRACK_AMANDA is not set
+CONFIG_NF_CONNTRACK_FTP=m
+# CONFIG_NF_CONNTRACK_H323 is not set
+CONFIG_NF_CONNTRACK_IRC=m
+CONFIG_NF_CONNTRACK_BROADCAST=m
+CONFIG_NF_CONNTRACK_NETBIOS_NS=m
+# CONFIG_NF_CONNTRACK_SNMP is not set
+# CONFIG_NF_CONNTRACK_PPTP is not set
+# CONFIG_NF_CONNTRACK_SANE is not set
+CONFIG_NF_CONNTRACK_SIP=m
+CONFIG_NF_CONNTRACK_TFTP=m
+CONFIG_NF_CT_NETLINK=m
+# CONFIG_NETFILTER_NETLINK_GLUE_CT is not set
+CONFIG_NF_NAT=m
+CONFIG_NF_NAT_FTP=m
+CONFIG_NF_NAT_IRC=m
+CONFIG_NF_NAT_SIP=m
+CONFIG_NF_NAT_TFTP=m
+CONFIG_NF_NAT_REDIRECT=y
+CONFIG_NF_NAT_MASQUERADE=y
+# CONFIG_NF_TABLES is not set
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XTABLES_COMPAT=y
+
+#
+# Xtables combined modules
+#
+CONFIG_NETFILTER_XT_MARK=m
+# CONFIG_NETFILTER_XT_CONNMARK is not set
+
+#
+# Xtables targets
+#
+# CONFIG_NETFILTER_XT_TARGET_CHECKSUM is not set
+# CONFIG_NETFILTER_XT_TARGET_CLASSIFY is not set
+# CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set
+# CONFIG_NETFILTER_XT_TARGET_DSCP is not set
+# CONFIG_NETFILTER_XT_TARGET_HL is not set
+# CONFIG_NETFILTER_XT_TARGET_HMARK is not set
+# CONFIG_NETFILTER_XT_TARGET_IDLETIMER is not set
+# CONFIG_NETFILTER_XT_TARGET_LED is not set
+# CONFIG_NETFILTER_XT_TARGET_LOG is not set
+# CONFIG_NETFILTER_XT_TARGET_MARK is not set
+CONFIG_NETFILTER_XT_NAT=m
+# CONFIG_NETFILTER_XT_TARGET_NETMAP is not set
+# CONFIG_NETFILTER_XT_TARGET_NFLOG is not set
+# CONFIG_NETFILTER_XT_TARGET_NFQUEUE is not set
+# CONFIG_NETFILTER_XT_TARGET_RATEEST is not set
+CONFIG_NETFILTER_XT_TARGET_REDIRECT=m
+CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m
+# CONFIG_NETFILTER_XT_TARGET_TEE is not set
+# CONFIG_NETFILTER_XT_TARGET_TPROXY is not set
+# CONFIG_NETFILTER_XT_TARGET_TCPMSS is not set
+# CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP is not set
+
+#
+# Xtables matches
+#
+CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m
+# CONFIG_NETFILTER_XT_MATCH_BPF is not set
+# CONFIG_NETFILTER_XT_MATCH_CGROUP is not set
+# CONFIG_NETFILTER_XT_MATCH_CLUSTER is not set
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+# CONFIG_NETFILTER_XT_MATCH_CONNBYTES is not set
+# CONFIG_NETFILTER_XT_MATCH_CONNLABEL is not set
+# CONFIG_NETFILTER_XT_MATCH_CONNLIMIT is not set
+# CONFIG_NETFILTER_XT_MATCH_CONNMARK is not set
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+# CONFIG_NETFILTER_XT_MATCH_CPU is not set
+# CONFIG_NETFILTER_XT_MATCH_DCCP is not set
+# CONFIG_NETFILTER_XT_MATCH_DEVGROUP is not set
+# CONFIG_NETFILTER_XT_MATCH_DSCP is not set
+# CONFIG_NETFILTER_XT_MATCH_ECN is not set
+# CONFIG_NETFILTER_XT_MATCH_ESP is not set
+# CONFIG_NETFILTER_XT_MATCH_HASHLIMIT is not set
+# CONFIG_NETFILTER_XT_MATCH_HELPER is not set
+# CONFIG_NETFILTER_XT_MATCH_HL is not set
+# CONFIG_NETFILTER_XT_MATCH_IPCOMP is not set
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=m
+CONFIG_NETFILTER_XT_MATCH_IPVS=m
+# CONFIG_NETFILTER_XT_MATCH_L2TP is not set
+# CONFIG_NETFILTER_XT_MATCH_LENGTH is not set
+# CONFIG_NETFILTER_XT_MATCH_LIMIT is not set
+# CONFIG_NETFILTER_XT_MATCH_MAC is not set
+# CONFIG_NETFILTER_XT_MATCH_MARK is not set
+# CONFIG_NETFILTER_XT_MATCH_MULTIPORT is not set
+# CONFIG_NETFILTER_XT_MATCH_NFACCT is not set
+# CONFIG_NETFILTER_XT_MATCH_OSF is not set
+CONFIG_NETFILTER_XT_MATCH_OWNER=m
+# CONFIG_NETFILTER_XT_MATCH_POLICY is not set
+# CONFIG_NETFILTER_XT_MATCH_PHYSDEV is not set
+# CONFIG_NETFILTER_XT_MATCH_PKTTYPE is not set
+# CONFIG_NETFILTER_XT_MATCH_QUOTA is not set
+# CONFIG_NETFILTER_XT_MATCH_RATEEST is not set
+# CONFIG_NETFILTER_XT_MATCH_REALM is not set
+# CONFIG_NETFILTER_XT_MATCH_RECENT is not set
+# CONFIG_NETFILTER_XT_MATCH_SCTP is not set
+# CONFIG_NETFILTER_XT_MATCH_SOCKET is not set
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+# CONFIG_NETFILTER_XT_MATCH_STATISTIC is not set
+# CONFIG_NETFILTER_XT_MATCH_STRING is not set
+# CONFIG_NETFILTER_XT_MATCH_TCPMSS is not set
+# CONFIG_NETFILTER_XT_MATCH_TIME is not set
+# CONFIG_NETFILTER_XT_MATCH_U32 is not set
+# end of Core Netfilter Configuration
+
+# CONFIG_IP_SET is not set
+CONFIG_IP_VS=m
+# CONFIG_IP_VS_IPV6 is not set
+# CONFIG_IP_VS_DEBUG is not set
+CONFIG_IP_VS_TAB_BITS=12
+
+#
+# IPVS transport protocol load balancing support
+#
+CONFIG_IP_VS_PROTO_TCP=y
+CONFIG_IP_VS_PROTO_UDP=y
+# CONFIG_IP_VS_PROTO_ESP is not set
+# CONFIG_IP_VS_PROTO_AH is not set
+# CONFIG_IP_VS_PROTO_SCTP is not set
+
+#
+# IPVS scheduler
+#
+CONFIG_IP_VS_RR=m
+# CONFIG_IP_VS_WRR is not set
+# CONFIG_IP_VS_LC is not set
+# CONFIG_IP_VS_WLC is not set
+# CONFIG_IP_VS_FO is not set
+# CONFIG_IP_VS_OVF is not set
+# CONFIG_IP_VS_LBLC is not set
+# CONFIG_IP_VS_LBLCR is not set
+# CONFIG_IP_VS_DH is not set
+# CONFIG_IP_VS_SH is not set
+# CONFIG_IP_VS_MH is not set
+# CONFIG_IP_VS_SED is not set
+# CONFIG_IP_VS_NQ is not set
+# CONFIG_IP_VS_TWOS is not set
+
+#
+# IPVS SH scheduler
+#
+CONFIG_IP_VS_SH_TAB_BITS=8
+
+#
+# IPVS MH scheduler
+#
+CONFIG_IP_VS_MH_TAB_INDEX=12
+
+#
+# IPVS application helper
+#
+# CONFIG_IP_VS_FTP is not set
+CONFIG_IP_VS_NFCT=y
+# CONFIG_IP_VS_PE_SIP is not set
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_NF_DEFRAG_IPV4=m
+# CONFIG_NF_SOCKET_IPV4 is not set
+# CONFIG_NF_TPROXY_IPV4 is not set
+# CONFIG_NF_DUP_IPV4 is not set
+# CONFIG_NF_LOG_ARP is not set
+# CONFIG_NF_LOG_IPV4 is not set
+CONFIG_NF_REJECT_IPV4=m
+CONFIG_IP_NF_IPTABLES=m
+# CONFIG_IP_NF_MATCH_AH is not set
+# CONFIG_IP_NF_MATCH_ECN is not set
+# CONFIG_IP_NF_MATCH_RPFILTER is not set
+# CONFIG_IP_NF_MATCH_TTL is not set
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+# CONFIG_IP_NF_TARGET_SYNPROXY is not set
+CONFIG_IP_NF_NAT=m
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+# CONFIG_IP_NF_TARGET_NETMAP is not set
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_MANGLE=m
+# CONFIG_IP_NF_TARGET_CLUSTERIP is not set
+# CONFIG_IP_NF_TARGET_ECN is not set
+# CONFIG_IP_NF_TARGET_TTL is not set
+# CONFIG_IP_NF_RAW is not set
+# CONFIG_IP_NF_ARPTABLES is not set
+# end of IP: Netfilter Configuration
+
+#
+# IPv6: Netfilter Configuration
+#
+# CONFIG_NF_SOCKET_IPV6 is not set
+# CONFIG_NF_TPROXY_IPV6 is not set
+# CONFIG_NF_DUP_IPV6 is not set
+CONFIG_NF_REJECT_IPV6=m
+# CONFIG_NF_LOG_IPV6 is not set
+CONFIG_IP6_NF_IPTABLES=m
+# CONFIG_IP6_NF_MATCH_AH is not set
+# CONFIG_IP6_NF_MATCH_EUI64 is not set
+# CONFIG_IP6_NF_MATCH_FRAG is not set
+# CONFIG_IP6_NF_MATCH_OPTS is not set
+# CONFIG_IP6_NF_MATCH_HL is not set
+# CONFIG_IP6_NF_MATCH_IPV6HEADER is not set
+# CONFIG_IP6_NF_MATCH_MH is not set
+# CONFIG_IP6_NF_MATCH_RPFILTER is not set
+# CONFIG_IP6_NF_MATCH_RT is not set
+# CONFIG_IP6_NF_MATCH_SRH is not set
+# CONFIG_IP6_NF_TARGET_HL is not set
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+# CONFIG_IP6_NF_TARGET_SYNPROXY is not set
+CONFIG_IP6_NF_MANGLE=m
+# CONFIG_IP6_NF_RAW is not set
+CONFIG_IP6_NF_NAT=m
+CONFIG_IP6_NF_TARGET_MASQUERADE=m
+# CONFIG_IP6_NF_TARGET_NPT is not set
+# end of IPv6: Netfilter Configuration
+
+CONFIG_NF_DEFRAG_IPV6=m
+# CONFIG_NF_CONNTRACK_BRIDGE is not set
+# CONFIG_BRIDGE_NF_EBTABLES is not set
+# CONFIG_BPFILTER is not set
+# CONFIG_IP_DCCP is not set
+# CONFIG_IP_SCTP is not set
+# CONFIG_RDS is not set
+# CONFIG_TIPC is not set
+# CONFIG_ATM is not set
+# CONFIG_L2TP is not set
+CONFIG_STP=m
+CONFIG_BRIDGE=m
+CONFIG_BRIDGE_IGMP_SNOOPING=y
+CONFIG_BRIDGE_VLAN_FILTERING=y
+# CONFIG_BRIDGE_MRP is not set
+# CONFIG_BRIDGE_CFM is not set
+# CONFIG_NET_DSA is not set
+CONFIG_VLAN_8021Q=m
+# CONFIG_VLAN_8021Q_GVRP is not set
+# CONFIG_VLAN_8021Q_MVRP is not set
+CONFIG_LLC=m
+# CONFIG_LLC2 is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_PHONET is not set
+# CONFIG_6LOWPAN is not set
+# CONFIG_IEEE802154 is not set
+CONFIG_NET_SCHED=y
+
+#
+# Queueing/Scheduling
+#
+# CONFIG_NET_SCH_CBQ is not set
+# CONFIG_NET_SCH_HTB is not set
+# CONFIG_NET_SCH_HFSC is not set
+# CONFIG_NET_SCH_PRIO is not set
+# CONFIG_NET_SCH_MULTIQ is not set
+# CONFIG_NET_SCH_RED is not set
+# CONFIG_NET_SCH_SFB is not set
+# CONFIG_NET_SCH_SFQ is not set
+# CONFIG_NET_SCH_TEQL is not set
+# CONFIG_NET_SCH_TBF is not set
+# CONFIG_NET_SCH_CBS is not set
+# CONFIG_NET_SCH_ETF is not set
+# CONFIG_NET_SCH_TAPRIO is not set
+# CONFIG_NET_SCH_GRED is not set
+# CONFIG_NET_SCH_DSMARK is not set
+# CONFIG_NET_SCH_NETEM is not set
+# CONFIG_NET_SCH_DRR is not set
+# CONFIG_NET_SCH_MQPRIO is not set
+# CONFIG_NET_SCH_SKBPRIO is not set
+# CONFIG_NET_SCH_CHOKE is not set
+# CONFIG_NET_SCH_QFQ is not set
+# CONFIG_NET_SCH_CODEL is not set
+CONFIG_NET_SCH_FQ_CODEL=y
+# CONFIG_NET_SCH_CAKE is not set
+# CONFIG_NET_SCH_FQ is not set
+# CONFIG_NET_SCH_HHF is not set
+# CONFIG_NET_SCH_PIE is not set
+# CONFIG_NET_SCH_PLUG is not set
+# CONFIG_NET_SCH_ETS is not set
+# CONFIG_NET_SCH_DEFAULT is not set
+
+#
+# Classification
+#
+CONFIG_NET_CLS=y
+# CONFIG_NET_CLS_BASIC is not set
+# CONFIG_NET_CLS_ROUTE4 is not set
+# CONFIG_NET_CLS_FW is not set
+# CONFIG_NET_CLS_U32 is not set
+# CONFIG_NET_CLS_FLOW is not set
+CONFIG_NET_CLS_CGROUP=m
+# CONFIG_NET_CLS_BPF is not set
+# CONFIG_NET_CLS_FLOWER is not set
+# CONFIG_NET_CLS_MATCHALL is not set
+# CONFIG_NET_EMATCH is not set
+# CONFIG_NET_CLS_ACT is not set
+CONFIG_NET_SCH_FIFO=y
+# CONFIG_DCB is not set
+CONFIG_DNS_RESOLVER=y
+# CONFIG_BATMAN_ADV is not set
+# CONFIG_OPENVSWITCH is not set
+# CONFIG_VSOCKETS is not set
+# CONFIG_NETLINK_DIAG is not set
+# CONFIG_MPLS is not set
+# CONFIG_NET_NSH is not set
+# CONFIG_HSR is not set
+# CONFIG_NET_SWITCHDEV is not set
+CONFIG_NET_L3_MASTER_DEV=y
+# CONFIG_QRTR is not set
+# CONFIG_NET_NCSI is not set
+CONFIG_PCPU_DEV_REFCNT=y
+CONFIG_RPS=y
+CONFIG_RFS_ACCEL=y
+CONFIG_SOCK_RX_QUEUE_MAPPING=y
+CONFIG_XPS=y
+CONFIG_CGROUP_NET_PRIO=y
+CONFIG_CGROUP_NET_CLASSID=y
+CONFIG_NET_RX_BUSY_POLL=y
+CONFIG_BQL=y
+# CONFIG_BPF_STREAM_PARSER is not set
+CONFIG_NET_FLOW_LIMIT=y
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_NET_DROP_MONITOR is not set
+# end of Network testing
+# end of Networking options
+
+# CONFIG_HAMRADIO is not set
+# CONFIG_CAN is not set
+CONFIG_BT=m
+CONFIG_BT_BREDR=y
+CONFIG_BT_RFCOMM=m
+CONFIG_BT_RFCOMM_TTY=y
+# CONFIG_BT_BNEP is not set
+CONFIG_BT_HIDP=m
+CONFIG_BT_HS=y
+CONFIG_BT_LE=y
+# CONFIG_BT_LEDS is not set
+# CONFIG_BT_MSFTEXT is not set
+# CONFIG_BT_AOSPEXT is not set
+# CONFIG_BT_DEBUGFS is not set
+# CONFIG_BT_SELFTEST is not set
+
+#
+# Bluetooth device drivers
+#
+CONFIG_BT_INTEL=m
+CONFIG_BT_BCM=m
+CONFIG_BT_RTL=m
+CONFIG_BT_MTK=m
+CONFIG_BT_HCIBTUSB=m
+# CONFIG_BT_HCIBTUSB_AUTOSUSPEND is not set
+CONFIG_BT_HCIBTUSB_BCM=y
+CONFIG_BT_HCIBTUSB_MTK=y
+CONFIG_BT_HCIBTUSB_RTL=y
+# CONFIG_BT_HCIBTSDIO is not set
+CONFIG_BT_HCIUART=m
+CONFIG_BT_HCIUART_SERDEV=y
+CONFIG_BT_HCIUART_H4=y
+# CONFIG_BT_HCIUART_NOKIA is not set
+# CONFIG_BT_HCIUART_BCSP is not set
+# CONFIG_BT_HCIUART_ATH3K is not set
+# CONFIG_BT_HCIUART_LL is not set
+CONFIG_BT_HCIUART_3WIRE=y
+# CONFIG_BT_HCIUART_INTEL is not set
+CONFIG_BT_HCIUART_BCM=y
+# CONFIG_BT_HCIUART_RTL is not set
+# CONFIG_BT_HCIUART_QCA is not set
+# CONFIG_BT_HCIUART_AG6XX is not set
+# CONFIG_BT_HCIUART_MRVL is not set
+CONFIG_BT_HCIBCM203X=m
+# CONFIG_BT_HCIBPA10X is not set
+CONFIG_BT_HCIBFUSB=m
+# CONFIG_BT_HCIVHCI is not set
+# CONFIG_BT_MRVL is not set
+CONFIG_BT_ATH3K=m
+# CONFIG_BT_MTKSDIO is not set
+# CONFIG_BT_MTKUART is not set
+# end of Bluetooth device drivers
+
+# CONFIG_AF_RXRPC is not set
+# CONFIG_AF_KCM is not set
+# CONFIG_MCTP is not set
+CONFIG_FIB_RULES=y
+CONFIG_WIRELESS=y
+CONFIG_WIRELESS_EXT=y
+CONFIG_WEXT_CORE=y
+CONFIG_WEXT_PROC=y
+CONFIG_WEXT_PRIV=y
+CONFIG_CFG80211=m
+# CONFIG_NL80211_TESTMODE is not set
+# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set
+# CONFIG_CFG80211_CERTIFICATION_ONUS is not set
+CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y
+CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y
+CONFIG_CFG80211_DEFAULT_PS=y
+# CONFIG_CFG80211_DEBUGFS is not set
+# CONFIG_CFG80211_CRDA_SUPPORT is not set
+CONFIG_CFG80211_WEXT=y
+CONFIG_LIB80211=m
+CONFIG_LIB80211_CRYPT_WEP=m
+CONFIG_LIB80211_CRYPT_CCMP=m
+# CONFIG_LIB80211_DEBUG is not set
+CONFIG_MAC80211=m
+CONFIG_MAC80211_HAS_RC=y
+CONFIG_MAC80211_RC_MINSTREL=y
+CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y
+CONFIG_MAC80211_RC_DEFAULT="minstrel_ht"
+# CONFIG_MAC80211_MESH is not set
+CONFIG_MAC80211_LEDS=y
+# CONFIG_MAC80211_DEBUGFS is not set
+# CONFIG_MAC80211_MESSAGE_TRACING is not set
+# CONFIG_MAC80211_DEBUG_MENU is not set
+CONFIG_MAC80211_STA_HASH_MAX_SIZE=0
+CONFIG_RFKILL=m
+CONFIG_RFKILL_LEDS=y
+CONFIG_RFKILL_INPUT=y
+# CONFIG_RFKILL_GPIO is not set
+# CONFIG_NET_9P is not set
+# CONFIG_CAIF is not set
+# CONFIG_CEPH_LIB is not set
+# CONFIG_NFC is not set
+# CONFIG_PSAMPLE is not set
+# CONFIG_NET_IFE is not set
+# CONFIG_LWTUNNEL is not set
+CONFIG_DST_CACHE=y
+CONFIG_GRO_CELLS=y
+CONFIG_NET_SELFTESTS=y
+CONFIG_NET_SOCK_MSG=y
+CONFIG_PAGE_POOL=y
+# CONFIG_PAGE_POOL_STATS is not set
+# CONFIG_FAILOVER is not set
+CONFIG_ETHTOOL_NETLINK=y
+
+#
+# Device Drivers
+#
+CONFIG_ARM_AMBA=y
+CONFIG_HAVE_PCI=y
+CONFIG_PCI=y
+CONFIG_PCI_DOMAINS=y
+CONFIG_PCI_DOMAINS_GENERIC=y
+CONFIG_PCI_SYSCALL=y
+CONFIG_PCIEPORTBUS=y
+CONFIG_PCIEAER=y
+# CONFIG_PCIEAER_INJECT is not set
+# CONFIG_PCIE_ECRC is not set
+CONFIG_PCIEASPM=y
+# CONFIG_PCIEASPM_DEFAULT is not set
+CONFIG_PCIEASPM_POWERSAVE=y
+# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set
+# CONFIG_PCIEASPM_PERFORMANCE is not set
+CONFIG_PCIE_PME=y
+CONFIG_PCIE_DPC=y
+# CONFIG_PCIE_PTM is not set
+CONFIG_PCI_MSI=y
+CONFIG_PCI_MSI_IRQ_DOMAIN=y
+CONFIG_PCI_QUIRKS=y
+# CONFIG_PCI_DEBUG is not set
+# CONFIG_PCI_STUB is not set
+# CONFIG_PCI_IOV is not set
+# CONFIG_PCI_PRI is not set
+# CONFIG_PCI_PASID is not set
+CONFIG_PCI_LABEL=y
+# CONFIG_PCIE_BUS_TUNE_OFF is not set
+CONFIG_PCIE_BUS_DEFAULT=y
+# CONFIG_PCIE_BUS_SAFE is not set
+# CONFIG_PCIE_BUS_PERFORMANCE is not set
+# CONFIG_PCIE_BUS_PEER2PEER is not set
+CONFIG_VGA_ARB=y
+CONFIG_VGA_ARB_MAX_GPUS=16
+# CONFIG_HOTPLUG_PCI is not set
+
+#
+# PCI controller drivers
+#
+# CONFIG_PCI_FTPCI100 is not set
+# CONFIG_PCI_HOST_GENERIC is not set
+# CONFIG_PCIE_XILINX is not set
+# CONFIG_PCI_XGENE is not set
+# CONFIG_PCIE_ALTERA is not set
+# CONFIG_PCI_HOST_THUNDER_PEM is not set
+# CONFIG_PCI_HOST_THUNDER_ECAM is not set
+CONFIG_PCIE_BRCMSTB=y
+# CONFIG_PCIE_MICROCHIP_HOST is not set
+
+#
+# DesignWare PCI Core Support
+#
+# CONFIG_PCIE_DW_PLAT_HOST is not set
+# CONFIG_PCI_HISI is not set
+# CONFIG_PCIE_KIRIN is not set
+# CONFIG_PCI_MESON is not set
+# CONFIG_PCIE_AL is not set
+# end of DesignWare PCI Core Support
+
+#
+# Mobiveil PCIe Core Support
+#
+# end of Mobiveil PCIe Core Support
+
+#
+# Cadence PCIe controllers support
+#
+# CONFIG_PCIE_CADENCE_PLAT_HOST is not set
+# CONFIG_PCI_J721E_HOST is not set
+# end of Cadence PCIe controllers support
+# end of PCI controller drivers
+
+#
+# PCI Endpoint
+#
+# CONFIG_PCI_ENDPOINT is not set
+# end of PCI Endpoint
+
+#
+# PCI switch controller drivers
+#
+# CONFIG_PCI_SW_SWITCHTEC is not set
+# end of PCI switch controller drivers
+
+# CONFIG_CXL_BUS is not set
+# CONFIG_PCCARD is not set
+# CONFIG_RAPIDIO is not set
+
+#
+# Generic Driver Options
+#
+# CONFIG_UEVENT_HELPER is not set
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+# CONFIG_DEVTMPFS_SAFE is not set
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+
+#
+# Firmware loader
+#
+CONFIG_FW_LOADER=y
+CONFIG_EXTRA_FIRMWARE=""
+# CONFIG_FW_LOADER_USER_HELPER is not set
+# CONFIG_FW_LOADER_COMPRESS is not set
+# CONFIG_FW_UPLOAD is not set
+# end of Firmware loader
+
+CONFIG_WANT_DEV_COREDUMP=y
+# CONFIG_ALLOW_DEV_COREDUMP is not set
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
+# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set
+# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set
+CONFIG_GENERIC_CPU_AUTOPROBE=y
+CONFIG_GENERIC_CPU_VULNERABILITIES=y
+CONFIG_SOC_BUS=y
+CONFIG_REGMAP=y
+CONFIG_REGMAP_I2C=y
+CONFIG_REGMAP_SPI=y
+CONFIG_REGMAP_MMIO=y
+CONFIG_REGMAP_IRQ=y
+CONFIG_DMA_SHARED_BUFFER=y
+# CONFIG_DMA_FENCE_TRACE is not set
+CONFIG_GENERIC_ARCH_TOPOLOGY=y
+# end of Generic Driver Options
+
+#
+# Bus devices
+#
+# CONFIG_BRCMSTB_GISB_ARB is not set
+# CONFIG_MOXTET is not set
+# CONFIG_VEXPRESS_CONFIG is not set
+# CONFIG_MHI_BUS is not set
+# CONFIG_MHI_BUS_EP is not set
+# end of Bus devices
+
+# CONFIG_CONNECTOR is not set
+
+#
+# Firmware Drivers
+#
+
+#
+# ARM System Control and Management Interface Protocol
+#
+# CONFIG_ARM_SCMI_PROTOCOL is not set
+# end of ARM System Control and Management Interface Protocol
+
+# CONFIG_ARM_SCPI_PROTOCOL is not set
+# CONFIG_FIRMWARE_MEMMAP is not set
+CONFIG_DMIID=y
+# CONFIG_DMI_SYSFS is not set
+CONFIG_RASPBERRYPI_FIRMWARE=y
+# CONFIG_FW_CFG_SYSFS is not set
+# CONFIG_SYSFB_SIMPLEFB is not set
+# CONFIG_ARM_FFA_TRANSPORT is not set
+CONFIG_CS_DSP=m
+# CONFIG_GOOGLE_FIRMWARE is not set
+
+#
+# EFI (Extensible Firmware Interface) Support
+#
+CONFIG_EFI_ESRT=y
+# CONFIG_EFI_VARS_PSTORE is not set
+CONFIG_EFI_PARAMS_FROM_FDT=y
+CONFIG_EFI_RUNTIME_WRAPPERS=y
+CONFIG_EFI_GENERIC_STUB=y
+# CONFIG_EFI_ZBOOT is not set
+CONFIG_EFI_ARMSTUB_DTB_LOADER=y
+CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y
+# CONFIG_EFI_BOOTLOADER_CONTROL is not set
+# CONFIG_EFI_CAPSULE_LOADER is not set
+# CONFIG_EFI_TEST is not set
+# CONFIG_RESET_ATTACK_MITIGATION is not set
+# CONFIG_EFI_DISABLE_PCI_DMA is not set
+CONFIG_EFI_EARLYCON=y
+# CONFIG_EFI_DISABLE_RUNTIME is not set
+# CONFIG_EFI_COCO_SECRET is not set
+# end of EFI (Extensible Firmware Interface) Support
+
+CONFIG_ARM_PSCI_FW=y
+CONFIG_HAVE_ARM_SMCCC=y
+CONFIG_HAVE_ARM_SMCCC_DISCOVERY=y
+CONFIG_ARM_SMCCC_SOC_ID=y
+
+#
+# Tegra firmware driver
+#
+# end of Tegra firmware driver
+# end of Firmware Drivers
+
+# CONFIG_GNSS is not set
+# CONFIG_MTD is not set
+CONFIG_DTC=y
+CONFIG_OF=y
+# CONFIG_OF_UNITTEST is not set
+CONFIG_OF_FLATTREE=y
+CONFIG_OF_EARLY_FLATTREE=y
+CONFIG_OF_KOBJ=y
+CONFIG_OF_DYNAMIC=y
+CONFIG_OF_ADDRESS=y
+CONFIG_OF_IRQ=y
+CONFIG_OF_RESERVED_MEM=y
+CONFIG_OF_RESOLVE=y
+CONFIG_OF_OVERLAY=y
+CONFIG_OF_CONFIGFS=y
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_NULL_BLK is not set
+CONFIG_CDROM=y
+# CONFIG_BLK_DEV_PCIESSD_MTIP32XX is not set
+# CONFIG_ZRAM is not set
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_LOOP_MIN_COUNT=0
+# CONFIG_BLK_DEV_DRBD is not set
+CONFIG_BLK_DEV_NBD=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=4096
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_ATA_OVER_ETH is not set
+# CONFIG_BLK_DEV_RBD is not set
+# CONFIG_BLK_DEV_UBLK is not set
+
+#
+# NVME Support
+#
+CONFIG_NVME_CORE=y
+CONFIG_BLK_DEV_NVME=y
+# CONFIG_NVME_MULTIPATH is not set
+# CONFIG_NVME_VERBOSE_ERRORS is not set
+# CONFIG_NVME_HWMON is not set
+# CONFIG_NVME_FC is not set
+# CONFIG_NVME_TCP is not set
+# CONFIG_NVME_AUTH is not set
+# CONFIG_NVME_TARGET is not set
+# end of NVME Support
+
+#
+# Misc devices
+#
+CONFIG_BCM2835_SMI=m
+# CONFIG_AD525X_DPOT is not set
+# CONFIG_DUMMY_IRQ is not set
+# CONFIG_PHANTOM is not set
+# CONFIG_TIFM_CORE is not set
+# CONFIG_ICS932S401 is not set
+# CONFIG_ENCLOSURE_SERVICES is not set
+# CONFIG_HP_ILO is not set
+# CONFIG_APDS9802ALS is not set
+# CONFIG_ISL29003 is not set
+# CONFIG_ISL29020 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_SENSORS_BH1770 is not set
+# CONFIG_SENSORS_APDS990X is not set
+# CONFIG_HMC6352 is not set
+# CONFIG_DS1682 is not set
+# CONFIG_LATTICE_ECP3_CONFIG is not set
+# CONFIG_SRAM is not set
+# CONFIG_DW_XDATA_PCIE is not set
+# CONFIG_PCI_ENDPOINT_TEST is not set
+# CONFIG_XILINX_SDFEC is not set
+CONFIG_MISC_RTSX=y
+# CONFIG_HISI_HIKEY_USB is not set
+# CONFIG_OPEN_DICE is not set
+# CONFIG_VCPU_STALL_DETECTOR is not set
+# CONFIG_C2PORT is not set
+
+#
+# EEPROM support
+#
+# CONFIG_EEPROM_AT24 is not set
+# CONFIG_EEPROM_AT25 is not set
+# CONFIG_EEPROM_LEGACY is not set
+# CONFIG_EEPROM_MAX6875 is not set
+CONFIG_EEPROM_93CX6=m
+# CONFIG_EEPROM_93XX46 is not set
+# CONFIG_EEPROM_IDT_89HPESX is not set
+# CONFIG_EEPROM_EE1004 is not set
+# end of EEPROM support
+
+# CONFIG_CB710_CORE is not set
+
+#
+# Texas Instruments shared transport line discipline
+#
+# CONFIG_TI_ST is not set
+# end of Texas Instruments shared transport line discipline
+
+# CONFIG_SENSORS_LIS3_SPI is not set
+# CONFIG_SENSORS_LIS3_I2C is not set
+# CONFIG_ALTERA_STAPL is not set
+# CONFIG_VMWARE_VMCI is not set
+# CONFIG_GENWQE is not set
+# CONFIG_ECHO is not set
+# CONFIG_BCM_VK is not set
+# CONFIG_MISC_ALCOR_PCI is not set
+# CONFIG_MISC_RTSX_PCI is not set
+CONFIG_MISC_RTSX_USB=y
+# CONFIG_HABANA_AI is not set
+# CONFIG_UACCE is not set
+# CONFIG_PVPANIC is not set
+# CONFIG_GP_PCI1XXXX is not set
+# end of Misc devices
+
+#
+# SCSI device support
+#
+CONFIG_SCSI_MOD=y
+# CONFIG_RAID_ATTRS is not set
+CONFIG_SCSI_COMMON=y
+CONFIG_SCSI=y
+CONFIG_SCSI_DMA=y
+# CONFIG_SCSI_PROC_FS is not set
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=y
+# CONFIG_CHR_DEV_ST is not set
+CONFIG_BLK_DEV_SR=y
+CONFIG_CHR_DEV_SG=m
+CONFIG_BLK_DEV_BSG=y
+# CONFIG_CHR_DEV_SCH is not set
+# CONFIG_SCSI_CONSTANTS is not set
+# CONFIG_SCSI_LOGGING is not set
+# CONFIG_SCSI_SCAN_ASYNC is not set
+
+#
+# SCSI Transports
+#
+# CONFIG_SCSI_SPI_ATTRS is not set
+# CONFIG_SCSI_FC_ATTRS is not set
+CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_SAS_ATTRS is not set
+# CONFIG_SCSI_SAS_LIBSAS is not set
+# CONFIG_SCSI_SRP_ATTRS is not set
+# end of SCSI Transports
+
+CONFIG_SCSI_LOWLEVEL=y
+CONFIG_ISCSI_TCP=y
+CONFIG_ISCSI_BOOT_SYSFS=y
+# CONFIG_SCSI_CXGB3_ISCSI is not set
+# CONFIG_SCSI_CXGB4_ISCSI is not set
+# CONFIG_SCSI_BNX2_ISCSI is not set
+# CONFIG_BE2ISCSI is not set
+# CONFIG_BLK_DEV_3W_XXXX_RAID is not set
+# CONFIG_SCSI_HPSA is not set
+# CONFIG_SCSI_3W_9XXX is not set
+# CONFIG_SCSI_3W_SAS is not set
+# CONFIG_SCSI_ACARD is not set
+# CONFIG_SCSI_AACRAID is not set
+# CONFIG_SCSI_AIC7XXX is not set
+# CONFIG_SCSI_AIC79XX is not set
+# CONFIG_SCSI_AIC94XX is not set
+# CONFIG_SCSI_MVSAS is not set
+# CONFIG_SCSI_MVUMI is not set
+# CONFIG_SCSI_ADVANSYS is not set
+# CONFIG_SCSI_ARCMSR is not set
+# CONFIG_SCSI_ESAS2R is not set
+# CONFIG_MEGARAID_NEWGEN is not set
+# CONFIG_MEGARAID_LEGACY is not set
+# CONFIG_MEGARAID_SAS is not set
+# CONFIG_SCSI_MPT3SAS is not set
+# CONFIG_SCSI_MPT2SAS is not set
+# CONFIG_SCSI_MPI3MR is not set
+# CONFIG_SCSI_SMARTPQI is not set
+# CONFIG_SCSI_HPTIOP is not set
+# CONFIG_SCSI_BUSLOGIC is not set
+# CONFIG_SCSI_MYRB is not set
+# CONFIG_SCSI_MYRS is not set
+# CONFIG_SCSI_SNIC is not set
+# CONFIG_SCSI_DMX3191D is not set
+# CONFIG_SCSI_FDOMAIN_PCI is not set
+# CONFIG_SCSI_IPS is not set
+# CONFIG_SCSI_INITIO is not set
+# CONFIG_SCSI_INIA100 is not set
+# CONFIG_SCSI_STEX is not set
+# CONFIG_SCSI_SYM53C8XX_2 is not set
+# CONFIG_SCSI_QLOGIC_1280 is not set
+# CONFIG_SCSI_QLA_ISCSI is not set
+# CONFIG_SCSI_DC395x is not set
+# CONFIG_SCSI_AM53C974 is not set
+# CONFIG_SCSI_WD719X is not set
+# CONFIG_SCSI_DEBUG is not set
+# CONFIG_SCSI_PMCRAID is not set
+# CONFIG_SCSI_PM8001 is not set
+# CONFIG_SCSI_DH is not set
+# end of SCSI device support
+
+# CONFIG_ATA is not set
+CONFIG_MD=y
+# CONFIG_BLK_DEV_MD is not set
+# CONFIG_BCACHE is not set
+CONFIG_BLK_DEV_DM_BUILTIN=y
+CONFIG_BLK_DEV_DM=m
+# CONFIG_DM_DEBUG is not set
+CONFIG_DM_BUFIO=m
+# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set
+CONFIG_DM_BIO_PRISON=m
+CONFIG_DM_PERSISTENT_DATA=m
+# CONFIG_DM_UNSTRIPED is not set
+# CONFIG_DM_CRYPT is not set
+# CONFIG_DM_SNAPSHOT is not set
+CONFIG_DM_THIN_PROVISIONING=m
+# CONFIG_DM_CACHE is not set
+# CONFIG_DM_WRITECACHE is not set
+# CONFIG_DM_EBS is not set
+# CONFIG_DM_ERA is not set
+# CONFIG_DM_CLONE is not set
+# CONFIG_DM_MIRROR is not set
+# CONFIG_DM_RAID is not set
+# CONFIG_DM_ZERO is not set
+# CONFIG_DM_MULTIPATH is not set
+# CONFIG_DM_DELAY is not set
+# CONFIG_DM_DUST is not set
+# CONFIG_DM_UEVENT is not set
+# CONFIG_DM_FLAKEY is not set
+# CONFIG_DM_VERITY is not set
+# CONFIG_DM_SWITCH is not set
+# CONFIG_DM_LOG_WRITES is not set
+# CONFIG_DM_INTEGRITY is not set
+# CONFIG_TARGET_CORE is not set
+# CONFIG_FUSION is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+# CONFIG_FIREWIRE is not set
+# CONFIG_FIREWIRE_NOSY is not set
+# end of IEEE 1394 (FireWire) support
+
+CONFIG_NETDEVICES=y
+CONFIG_MII=y
+CONFIG_NET_CORE=y
+# CONFIG_BONDING is not set
+CONFIG_DUMMY=m
+CONFIG_WIREGUARD=m
+# CONFIG_WIREGUARD_DEBUG is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_NET_FC is not set
+# CONFIG_NET_TEAM is not set
+CONFIG_MACVLAN=m
+# CONFIG_MACVTAP is not set
+CONFIG_IPVLAN_L3S=y
+CONFIG_IPVLAN=m
+# CONFIG_IPVTAP is not set
+CONFIG_VXLAN=m
+# CONFIG_GENEVE is not set
+# CONFIG_BAREUDP is not set
+# CONFIG_GTP is not set
+# CONFIG_AMT is not set
+# CONFIG_MACSEC is not set
+CONFIG_NETCONSOLE=y
+CONFIG_NETCONSOLE_DYNAMIC=y
+CONFIG_NETPOLL=y
+CONFIG_NET_POLL_CONTROLLER=y
+CONFIG_TUN=y
+# CONFIG_TUN_VNET_CROSS_LE is not set
+CONFIG_VETH=m
+CONFIG_NLMON=m
+# CONFIG_ARCNET is not set
+CONFIG_ETHERNET=y
+CONFIG_NET_VENDOR_3COM=y
+# CONFIG_VORTEX is not set
+# CONFIG_TYPHOON is not set
+CONFIG_NET_VENDOR_ADAPTEC=y
+# CONFIG_ADAPTEC_STARFIRE is not set
+CONFIG_NET_VENDOR_AGERE=y
+# CONFIG_ET131X is not set
+CONFIG_NET_VENDOR_ALACRITECH=y
+# CONFIG_SLICOSS is not set
+CONFIG_NET_VENDOR_ALTEON=y
+# CONFIG_ACENIC is not set
+# CONFIG_ALTERA_TSE is not set
+CONFIG_NET_VENDOR_AMAZON=y
+# CONFIG_ENA_ETHERNET is not set
+CONFIG_NET_VENDOR_AMD=y
+# CONFIG_AMD8111_ETH is not set
+# CONFIG_PCNET32 is not set
+# CONFIG_AMD_XGBE is not set
+CONFIG_NET_VENDOR_AQUANTIA=y
+# CONFIG_AQTION is not set
+CONFIG_NET_VENDOR_ARC=y
+CONFIG_NET_VENDOR_ASIX=y
+# CONFIG_SPI_AX88796C is not set
+CONFIG_NET_VENDOR_ATHEROS=y
+# CONFIG_ATL2 is not set
+# CONFIG_ATL1 is not set
+# CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
+# CONFIG_ALX is not set
+CONFIG_NET_VENDOR_BROADCOM=y
+# CONFIG_B44 is not set
+CONFIG_BCMGENET=y
+# CONFIG_BNX2 is not set
+# CONFIG_CNIC is not set
+# CONFIG_TIGON3 is not set
+# CONFIG_BNX2X is not set
+# CONFIG_SYSTEMPORT is not set
+# CONFIG_BNXT is not set
+CONFIG_NET_VENDOR_CADENCE=y
+CONFIG_MACB=y
+# CONFIG_MACB_PCI is not set
+CONFIG_NET_VENDOR_CAVIUM=y
+# CONFIG_THUNDER_NIC_PF is not set
+# CONFIG_THUNDER_NIC_VF is not set
+# CONFIG_THUNDER_NIC_BGX is not set
+# CONFIG_THUNDER_NIC_RGX is not set
+# CONFIG_LIQUIDIO is not set
+# CONFIG_LIQUIDIO_VF is not set
+CONFIG_NET_VENDOR_CHELSIO=y
+# CONFIG_CHELSIO_T1 is not set
+# CONFIG_CHELSIO_T3 is not set
+# CONFIG_CHELSIO_T4 is not set
+# CONFIG_CHELSIO_T4VF is not set
+CONFIG_NET_VENDOR_CISCO=y
+# CONFIG_ENIC is not set
+CONFIG_NET_VENDOR_CORTINA=y
+# CONFIG_GEMINI_ETHERNET is not set
+CONFIG_NET_VENDOR_DAVICOM=y
+# CONFIG_DM9051 is not set
+# CONFIG_DNET is not set
+CONFIG_NET_VENDOR_DEC=y
+# CONFIG_NET_TULIP is not set
+CONFIG_NET_VENDOR_DLINK=y
+# CONFIG_DL2K is not set
+# CONFIG_SUNDANCE is not set
+CONFIG_NET_VENDOR_EMULEX=y
+# CONFIG_BE2NET is not set
+CONFIG_NET_VENDOR_ENGLEDER=y
+# CONFIG_TSNEP is not set
+CONFIG_NET_VENDOR_EZCHIP=y
+# CONFIG_EZCHIP_NPS_MANAGEMENT_ENET is not set
+CONFIG_NET_VENDOR_FUNGIBLE=y
+# CONFIG_FUN_ETH is not set
+CONFIG_NET_VENDOR_GOOGLE=y
+# CONFIG_GVE is not set
+CONFIG_NET_VENDOR_HISILICON=y
+# CONFIG_HIX5HD2_GMAC is not set
+# CONFIG_HISI_FEMAC is not set
+# CONFIG_HIP04_ETH is not set
+# CONFIG_HNS_DSAF is not set
+# CONFIG_HNS_ENET is not set
+# CONFIG_HNS3 is not set
+CONFIG_NET_VENDOR_HUAWEI=y
+# CONFIG_HINIC is not set
+CONFIG_NET_VENDOR_I825XX=y
+CONFIG_NET_VENDOR_INTEL=y
+# CONFIG_E100 is not set
+# CONFIG_E1000 is not set
+# CONFIG_E1000E is not set
+# CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
+# CONFIG_IXGB is not set
+# CONFIG_IXGBE is not set
+# CONFIG_IXGBEVF is not set
+# CONFIG_I40E is not set
+# CONFIG_I40EVF is not set
+# CONFIG_ICE is not set
+# CONFIG_FM10K is not set
+# CONFIG_IGC is not set
+CONFIG_NET_VENDOR_WANGXUN=y
+# CONFIG_NGBE is not set
+# CONFIG_TXGBE is not set
+# CONFIG_JME is not set
+CONFIG_NET_VENDOR_ADI=y
+CONFIG_NET_VENDOR_LITEX=y
+# CONFIG_LITEX_LITEETH is not set
+CONFIG_NET_VENDOR_MARVELL=y
+# CONFIG_MVMDIO is not set
+# CONFIG_SKGE is not set
+# CONFIG_SKY2 is not set
+# CONFIG_OCTEONTX2_AF is not set
+# CONFIG_OCTEONTX2_PF is not set
+# CONFIG_OCTEON_EP is not set
+CONFIG_NET_VENDOR_MELLANOX=y
+# CONFIG_MLX4_EN is not set
+# CONFIG_MLX5_CORE is not set
+# CONFIG_MLXSW_CORE is not set
+# CONFIG_MLXFW is not set
+CONFIG_NET_VENDOR_MICREL=y
+# CONFIG_KS8842 is not set
+# CONFIG_KS8851 is not set
+# CONFIG_KS8851_MLL is not set
+# CONFIG_KSZ884X_PCI is not set
+CONFIG_NET_VENDOR_MICROCHIP=y
+# CONFIG_ENC28J60 is not set
+# CONFIG_ENCX24J600 is not set
+# CONFIG_LAN743X is not set
+CONFIG_NET_VENDOR_MICROSEMI=y
+CONFIG_NET_VENDOR_MICROSOFT=y
+CONFIG_NET_VENDOR_MYRI=y
+# CONFIG_MYRI10GE is not set
+# CONFIG_FEALNX is not set
+CONFIG_NET_VENDOR_NI=y
+# CONFIG_NI_XGE_MANAGEMENT_ENET is not set
+CONFIG_NET_VENDOR_NATSEMI=y
+# CONFIG_NATSEMI is not set
+# CONFIG_NS83820 is not set
+CONFIG_NET_VENDOR_NETERION=y
+# CONFIG_S2IO is not set
+CONFIG_NET_VENDOR_NETRONOME=y
+# CONFIG_NFP is not set
+CONFIG_NET_VENDOR_8390=y
+# CONFIG_NE2K_PCI is not set
+CONFIG_NET_VENDOR_NVIDIA=y
+# CONFIG_FORCEDETH is not set
+CONFIG_NET_VENDOR_OKI=y
+# CONFIG_ETHOC is not set
+CONFIG_NET_VENDOR_PACKET_ENGINES=y
+# CONFIG_HAMACHI is not set
+# CONFIG_YELLOWFIN is not set
+# CONFIG_NET_VENDOR_PENSANDO is not set
+CONFIG_NET_VENDOR_QLOGIC=y
+# CONFIG_QLA3XXX is not set
+# CONFIG_QLCNIC is not set
+# CONFIG_NETXEN_NIC is not set
+# CONFIG_QED is not set
+CONFIG_NET_VENDOR_BROCADE=y
+# CONFIG_BNA is not set
+CONFIG_NET_VENDOR_QUALCOMM=y
+# CONFIG_QCA7000_SPI is not set
+# CONFIG_QCA7000_UART is not set
+# CONFIG_QCOM_EMAC is not set
+# CONFIG_RMNET is not set
+CONFIG_NET_VENDOR_RDC=y
+# CONFIG_R6040 is not set
+CONFIG_NET_VENDOR_REALTEK=y
+# CONFIG_8139CP is not set
+# CONFIG_8139TOO is not set
+# CONFIG_R8169 is not set
+CONFIG_NET_VENDOR_RENESAS=y
+CONFIG_NET_VENDOR_ROCKER=y
+CONFIG_NET_VENDOR_SAMSUNG=y
+# CONFIG_SXGBE_ETH is not set
+CONFIG_NET_VENDOR_SEEQ=y
+CONFIG_NET_VENDOR_SILAN=y
+# CONFIG_SC92031 is not set
+CONFIG_NET_VENDOR_SIS=y
+# CONFIG_SIS900 is not set
+# CONFIG_SIS190 is not set
+CONFIG_NET_VENDOR_SOLARFLARE=y
+# CONFIG_SFC is not set
+# CONFIG_SFC_FALCON is not set
+CONFIG_NET_VENDOR_SMSC=y
+# CONFIG_SMC91X is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SMSC911X is not set
+# CONFIG_SMSC9420 is not set
+CONFIG_NET_VENDOR_SOCIONEXT=y
+CONFIG_NET_VENDOR_STMICRO=y
+# CONFIG_STMMAC_ETH is not set
+CONFIG_NET_VENDOR_SUN=y
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+# CONFIG_CASSINI is not set
+# CONFIG_NIU is not set
+CONFIG_NET_VENDOR_SYNOPSYS=y
+# CONFIG_DWC_XLGMAC is not set
+CONFIG_NET_VENDOR_TEHUTI=y
+# CONFIG_TEHUTI is not set
+CONFIG_NET_VENDOR_TI=y
+# CONFIG_TI_CPSW_PHY_SEL is not set
+# CONFIG_TLAN is not set
+CONFIG_NET_VENDOR_VERTEXCOM=y
+# CONFIG_MSE102X is not set
+CONFIG_NET_VENDOR_VIA=y
+# CONFIG_VIA_RHINE is not set
+# CONFIG_VIA_VELOCITY is not set
+CONFIG_NET_VENDOR_WIZNET=y
+# CONFIG_WIZNET_W5100 is not set
+# CONFIG_WIZNET_W5300 is not set
+CONFIG_NET_VENDOR_XILINX=y
+# CONFIG_XILINX_EMACLITE is not set
+# CONFIG_XILINX_AXI_EMAC is not set
+# CONFIG_XILINX_LL_TEMAC is not set
+# CONFIG_FDDI is not set
+# CONFIG_HIPPI is not set
+CONFIG_PHYLINK=y
+CONFIG_PHYLIB=y
+CONFIG_SWPHY=y
+# CONFIG_LED_TRIGGER_PHY is not set
+CONFIG_FIXED_PHY=y
+# CONFIG_SFP is not set
+
+#
+# MII PHY device drivers
+#
+# CONFIG_AMD_PHY is not set
+# CONFIG_ADIN_PHY is not set
+# CONFIG_ADIN1100_PHY is not set
+# CONFIG_AQUANTIA_PHY is not set
+CONFIG_AX88796B_PHY=m
+CONFIG_BROADCOM_PHY=y
+# CONFIG_BCM54140_PHY is not set
+CONFIG_BCM7XXX_PHY=y
+# CONFIG_BCM84881_PHY is not set
+# CONFIG_BCM87XX_PHY is not set
+CONFIG_BCM_NET_PHYLIB=y
+# CONFIG_CICADA_PHY is not set
+# CONFIG_CORTINA_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_INTEL_XWAY_PHY is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_MARVELL_10G_PHY is not set
+# CONFIG_MARVELL_88X2222_PHY is not set
+# CONFIG_MAXLINEAR_GPHY is not set
+# CONFIG_MEDIATEK_GE_PHY is not set
+CONFIG_MICREL_PHY=y
+CONFIG_MICROCHIP_PHY=y
+# CONFIG_MICROCHIP_T1_PHY is not set
+# CONFIG_MICROSEMI_PHY is not set
+# CONFIG_MOTORCOMM_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_NXP_C45_TJA11XX_PHY is not set
+# CONFIG_NXP_TJA11XX_PHY is not set
+# CONFIG_AT803X_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_RENESAS_PHY is not set
+# CONFIG_ROCKCHIP_PHY is not set
+CONFIG_SMSC_PHY=y
+# CONFIG_STE10XP is not set
+# CONFIG_TERANETICS_PHY is not set
+# CONFIG_DP83822_PHY is not set
+# CONFIG_DP83TC811_PHY is not set
+# CONFIG_DP83848_PHY is not set
+# CONFIG_DP83867_PHY is not set
+# CONFIG_DP83869_PHY is not set
+# CONFIG_DP83TD510_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+# CONFIG_XILINX_GMII2RGMII is not set
+# CONFIG_MICREL_KS8995MA is not set
+# CONFIG_PSE_CONTROLLER is not set
+CONFIG_MDIO_DEVICE=y
+CONFIG_MDIO_BUS=y
+CONFIG_FWNODE_MDIO=y
+CONFIG_OF_MDIO=y
+CONFIG_MDIO_DEVRES=y
+# CONFIG_MDIO_BITBANG is not set
+CONFIG_MDIO_BCM_UNIMAC=y
+# CONFIG_MDIO_HISI_FEMAC is not set
+# CONFIG_MDIO_MVUSB is not set
+# CONFIG_MDIO_MSCC_MIIM is not set
+# CONFIG_MDIO_OCTEON is not set
+# CONFIG_MDIO_IPQ4019 is not set
+# CONFIG_MDIO_IPQ8064 is not set
+# CONFIG_MDIO_THUNDER is not set
+
+#
+# MDIO Multiplexers
+#
+# CONFIG_MDIO_BUS_MUX_GPIO is not set
+# CONFIG_MDIO_BUS_MUX_MULTIPLEXER is not set
+# CONFIG_MDIO_BUS_MUX_MMIOREG is not set
+
+#
+# PCS device drivers
+#
+# end of PCS device drivers
+
+CONFIG_PPP=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+# CONFIG_PPP_FILTER is not set
+CONFIG_PPP_MPPE=m
+# CONFIG_PPP_MULTILINK is not set
+CONFIG_PPPOE=m
+CONFIG_PPP_ASYNC=m
+# CONFIG_PPP_SYNC_TTY is not set
+# CONFIG_SLIP is not set
+CONFIG_SLHC=m
+CONFIG_USB_NET_DRIVERS=y
+# CONFIG_USB_CATC is not set
+# CONFIG_USB_KAWETH is not set
+# CONFIG_USB_PEGASUS is not set
+# CONFIG_USB_RTL8150 is not set
+CONFIG_USB_RTL8152=m
+CONFIG_USB_LAN78XX=y
+CONFIG_USB_USBNET=y
+CONFIG_USB_NET_AX8817X=m
+CONFIG_USB_NET_AX88179_178A=m
+CONFIG_USB_NET_CDCETHER=m
+# CONFIG_USB_NET_CDC_EEM is not set
+# CONFIG_USB_NET_CDC_NCM is not set
+# CONFIG_USB_NET_HUAWEI_CDC_NCM is not set
+# CONFIG_USB_NET_CDC_MBIM is not set
+CONFIG_USB_NET_DM9601=y
+# CONFIG_USB_NET_SR9700 is not set
+# CONFIG_USB_NET_SR9800 is not set
+CONFIG_USB_NET_SMSC75XX=m
+CONFIG_USB_NET_SMSC95XX=y
+# CONFIG_USB_NET_GL620A is not set
+# CONFIG_USB_NET_NET1080 is not set
+# CONFIG_USB_NET_PLUSB is not set
+CONFIG_USB_NET_MCS7830=m
+CONFIG_USB_NET_RNDIS_HOST=m
+# CONFIG_USB_NET_CDC_SUBSET is not set
+# CONFIG_USB_NET_ZAURUS is not set
+# CONFIG_USB_NET_CX82310_ETH is not set
+# CONFIG_USB_NET_KALMIA is not set
+# CONFIG_USB_NET_QMI_WWAN is not set
+CONFIG_USB_HSO=m
+# CONFIG_USB_NET_INT51X1 is not set
+CONFIG_USB_IPHETH=m
+# CONFIG_USB_SIERRA_NET is not set
+# CONFIG_USB_VL600 is not set
+# CONFIG_USB_NET_CH9200 is not set
+# CONFIG_USB_NET_AQC111 is not set
+CONFIG_USB_RTL8153_ECM=m
+CONFIG_WLAN=y
+CONFIG_WLAN_VENDOR_ADMTEK=y
+# CONFIG_ADM8211 is not set
+CONFIG_ATH_COMMON=m
+CONFIG_WLAN_VENDOR_ATH=y
+# CONFIG_ATH_DEBUG is not set
+# CONFIG_ATH5K is not set
+# CONFIG_ATH5K_PCI is not set
+CONFIG_ATH9K_HW=m
+CONFIG_ATH9K_COMMON=m
+CONFIG_ATH9K_BTCOEX_SUPPORT=y
+CONFIG_ATH9K=m
+CONFIG_ATH9K_PCI=y
+CONFIG_ATH9K_AHB=y
+# CONFIG_ATH9K_DEBUGFS is not set
+# CONFIG_ATH9K_DYNACK is not set
+# CONFIG_ATH9K_WOW is not set
+CONFIG_ATH9K_RFKILL=y
+CONFIG_ATH9K_CHANNEL_CONTEXT=y
+CONFIG_ATH9K_PCOEM=y
+# CONFIG_ATH9K_PCI_NO_EEPROM is not set
+CONFIG_ATH9K_HTC=m
+# CONFIG_ATH9K_HTC_DEBUGFS is not set
+CONFIG_ATH9K_HWRNG=y
+CONFIG_CARL9170=m
+CONFIG_CARL9170_LEDS=y
+CONFIG_CARL9170_WPC=y
+CONFIG_CARL9170_HWRNG=y
+CONFIG_ATH6KL=m
+# CONFIG_ATH6KL_SDIO is not set
+CONFIG_ATH6KL_USB=m
+# CONFIG_ATH6KL_DEBUG is not set
+# CONFIG_ATH6KL_TRACING is not set
+CONFIG_AR5523=m
+# CONFIG_WIL6210 is not set
+# CONFIG_ATH10K is not set
+CONFIG_WCN36XX=m
+# CONFIG_WCN36XX_DEBUGFS is not set
+CONFIG_WLAN_VENDOR_ATMEL=y
+# CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
+CONFIG_WLAN_VENDOR_BROADCOM=y
+CONFIG_B43=m
+CONFIG_B43_BCMA=y
+CONFIG_B43_SSB=y
+CONFIG_B43_BUSES_BCMA_AND_SSB=y
+# CONFIG_B43_BUSES_BCMA is not set
+# CONFIG_B43_BUSES_SSB is not set
+CONFIG_B43_PCI_AUTOSELECT=y
+CONFIG_B43_PCICORE_AUTOSELECT=y
+# CONFIG_B43_SDIO is not set
+CONFIG_B43_BCMA_PIO=y
+CONFIG_B43_PIO=y
+CONFIG_B43_PHY_G=y
+CONFIG_B43_PHY_N=y
+CONFIG_B43_PHY_LP=y
+CONFIG_B43_PHY_HT=y
+CONFIG_B43_LEDS=y
+CONFIG_B43_HWRNG=y
+# CONFIG_B43_DEBUG is not set
+# CONFIG_B43LEGACY is not set
+CONFIG_BRCMUTIL=m
+# CONFIG_BRCMSMAC is not set
+CONFIG_BRCMFMAC=m
+CONFIG_BRCMFMAC_PROTO_BCDC=y
+CONFIG_BRCMFMAC_SDIO=y
+CONFIG_BRCMFMAC_USB=y
+# CONFIG_BRCMFMAC_PCIE is not set
+# CONFIG_BRCM_TRACING is not set
+CONFIG_BRCMDBG=y
+CONFIG_WLAN_VENDOR_CISCO=y
+CONFIG_WLAN_VENDOR_INTEL=y
+# CONFIG_IPW2100 is not set
+# CONFIG_IPW2200 is not set
+# CONFIG_IWL4965 is not set
+# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
+CONFIG_WLAN_VENDOR_INTERSIL=y
+# CONFIG_HOSTAP is not set
+# CONFIG_HERMES is not set
+CONFIG_P54_COMMON=m
+CONFIG_P54_USB=m
+# CONFIG_P54_PCI is not set
+# CONFIG_P54_SPI is not set
+CONFIG_P54_LEDS=y
+CONFIG_WLAN_VENDOR_MARVELL=y
+# CONFIG_LIBERTAS is not set
+# CONFIG_LIBERTAS_THINFIRM is not set
+# CONFIG_MWIFIEX is not set
+# CONFIG_MWL8K is not set
+CONFIG_WLAN_VENDOR_MEDIATEK=y
+CONFIG_MT7601U=m
+CONFIG_MT76_CORE=m
+CONFIG_MT76_LEDS=y
+CONFIG_MT76_USB=m
+CONFIG_MT76x02_LIB=m
+CONFIG_MT76x02_USB=m
+CONFIG_MT76_CONNAC_LIB=m
+CONFIG_MT76x0_COMMON=m
+CONFIG_MT76x0U=m
+# CONFIG_MT76x0E is not set
+CONFIG_MT76x2_COMMON=m
+# CONFIG_MT76x2E is not set
+CONFIG_MT76x2U=m
+# CONFIG_MT7603E is not set
+CONFIG_MT7615_COMMON=m
+# CONFIG_MT7615E is not set
+CONFIG_MT7663_USB_SDIO_COMMON=m
+CONFIG_MT7663U=m
+# CONFIG_MT7663S is not set
+# CONFIG_MT7915E is not set
+# CONFIG_MT7921E is not set
+# CONFIG_MT7921S is not set
+# CONFIG_MT7921U is not set
+CONFIG_WLAN_VENDOR_MICROCHIP=y
+# CONFIG_WILC1000_SDIO is not set
+# CONFIG_WILC1000_SPI is not set
+CONFIG_WLAN_VENDOR_PURELIFI=y
+# CONFIG_PLFXLC is not set
+CONFIG_WLAN_VENDOR_RALINK=y
+CONFIG_RT2X00=m
+# CONFIG_RT2400PCI is not set
+# CONFIG_RT2500PCI is not set
+# CONFIG_RT61PCI is not set
+# CONFIG_RT2800PCI is not set
+CONFIG_RT2500USB=m
+CONFIG_RT73USB=m
+CONFIG_RT2800USB=m
+CONFIG_RT2800USB_RT33XX=y
+CONFIG_RT2800USB_RT35XX=y
+CONFIG_RT2800USB_RT3573=y
+CONFIG_RT2800USB_RT53XX=y
+CONFIG_RT2800USB_RT55XX=y
+CONFIG_RT2800USB_UNKNOWN=y
+CONFIG_RT2800_LIB=m
+CONFIG_RT2X00_LIB_USB=m
+CONFIG_RT2X00_LIB=m
+CONFIG_RT2X00_LIB_FIRMWARE=y
+CONFIG_RT2X00_LIB_CRYPTO=y
+CONFIG_RT2X00_LIB_LEDS=y
+# CONFIG_RT2X00_DEBUG is not set
+CONFIG_WLAN_VENDOR_REALTEK=y
+# CONFIG_RTL8180 is not set
+CONFIG_RTL8187=m
+CONFIG_RTL8187_LEDS=y
+CONFIG_RTL_CARDS=m
+# CONFIG_RTL8192CE is not set
+# CONFIG_RTL8192SE is not set
+# CONFIG_RTL8192DE is not set
+# CONFIG_RTL8723AE is not set
+# CONFIG_RTL8723BE is not set
+# CONFIG_RTL8188EE is not set
+# CONFIG_RTL8192EE is not set
+# CONFIG_RTL8821AE is not set
+# CONFIG_RTL8192CU is not set
+CONFIG_RTL8XXXU=m
+CONFIG_RTL8XXXU_UNTESTED=y
+CONFIG_RTW88=m
+# CONFIG_RTW88_8822BE is not set
+# CONFIG_RTW88_8822CE is not set
+# CONFIG_RTW88_8723DE is not set
+# CONFIG_RTW88_8821CE is not set
+# CONFIG_RTW89 is not set
+CONFIG_WLAN_VENDOR_RSI=y
+# CONFIG_RSI_91X is not set
+CONFIG_WLAN_VENDOR_SILABS=y
+# CONFIG_WFX is not set
+CONFIG_WLAN_VENDOR_ST=y
+# CONFIG_CW1200 is not set
+CONFIG_WLAN_VENDOR_TI=y
+# CONFIG_WL1251 is not set
+# CONFIG_WL12XX is not set
+# CONFIG_WL18XX is not set
+# CONFIG_WLCORE is not set
+CONFIG_WLAN_VENDOR_ZYDAS=y
+CONFIG_USB_ZD1201=m
+CONFIG_ZD1211RW=m
+# CONFIG_ZD1211RW_DEBUG is not set
+# CONFIG_WLAN_VENDOR_QUANTENNA is not set
+# CONFIG_MAC80211_HWSIM is not set
+CONFIG_USB_NET_RNDIS_WLAN=m
+# CONFIG_VIRT_WIFI is not set
+# CONFIG_WAN is not set
+
+#
+# Wireless WAN
+#
+# CONFIG_WWAN is not set
+# end of Wireless WAN
+
+# CONFIG_VMXNET3 is not set
+# CONFIG_NETDEVSIM is not set
+# CONFIG_NET_FAILOVER is not set
+# CONFIG_ISDN is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+CONFIG_INPUT_LEDS=y
+CONFIG_INPUT_FF_MEMLESS=y
+# CONFIG_INPUT_SPARSEKMAP is not set
+# CONFIG_INPUT_MATRIXKMAP is not set
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+CONFIG_INPUT_JOYDEV=y
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+# CONFIG_KEYBOARD_ADP5588 is not set
+# CONFIG_KEYBOARD_ADP5589 is not set
+# CONFIG_KEYBOARD_ATKBD is not set
+# CONFIG_KEYBOARD_QT1050 is not set
+# CONFIG_KEYBOARD_QT1070 is not set
+# CONFIG_KEYBOARD_QT2160 is not set
+# CONFIG_KEYBOARD_DLINK_DIR685 is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+CONFIG_KEYBOARD_GPIO=m
+# CONFIG_KEYBOARD_GPIO_POLLED is not set
+# CONFIG_KEYBOARD_TCA6416 is not set
+# CONFIG_KEYBOARD_TCA8418 is not set
+# CONFIG_KEYBOARD_MATRIX is not set
+# CONFIG_KEYBOARD_LM8323 is not set
+# CONFIG_KEYBOARD_LM8333 is not set
+# CONFIG_KEYBOARD_MAX7359 is not set
+# CONFIG_KEYBOARD_MCS is not set
+# CONFIG_KEYBOARD_MPR121 is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_OPENCORES is not set
+# CONFIG_KEYBOARD_PINEPHONE is not set
+# CONFIG_KEYBOARD_SAMSUNG is not set
+# CONFIG_KEYBOARD_STOWAWAY is not set
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_OMAP4 is not set
+# CONFIG_KEYBOARD_TM2_TOUCHKEY is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_CAP11XX is not set
+# CONFIG_KEYBOARD_BCM is not set
+# CONFIG_KEYBOARD_CYPRESS_SF is not set
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+# CONFIG_JOYSTICK_ANALOG is not set
+# CONFIG_JOYSTICK_A3D is not set
+# CONFIG_JOYSTICK_ADI is not set
+# CONFIG_JOYSTICK_COBRA is not set
+# CONFIG_JOYSTICK_GF2K is not set
+# CONFIG_JOYSTICK_GRIP is not set
+# CONFIG_JOYSTICK_GRIP_MP is not set
+# CONFIG_JOYSTICK_GUILLEMOT is not set
+# CONFIG_JOYSTICK_INTERACT is not set
+# CONFIG_JOYSTICK_SIDEWINDER is not set
+# CONFIG_JOYSTICK_TMDC is not set
+# CONFIG_JOYSTICK_IFORCE is not set
+# CONFIG_JOYSTICK_WARRIOR is not set
+# CONFIG_JOYSTICK_MAGELLAN is not set
+# CONFIG_JOYSTICK_SPACEORB is not set
+# CONFIG_JOYSTICK_SPACEBALL is not set
+# CONFIG_JOYSTICK_STINGER is not set
+# CONFIG_JOYSTICK_TWIDJOY is not set
+# CONFIG_JOYSTICK_ZHENHUA is not set
+# CONFIG_JOYSTICK_AS5011 is not set
+# CONFIG_JOYSTICK_JOYDUMP is not set
+CONFIG_JOYSTICK_XPAD=m
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_JOYSTICK_PSXPAD_SPI=m
+CONFIG_JOYSTICK_PSXPAD_SPI_FF=y
+# CONFIG_JOYSTICK_PXRC is not set
+# CONFIG_JOYSTICK_QWIIC is not set
+# CONFIG_JOYSTICK_FSIA6B is not set
+# CONFIG_JOYSTICK_SENSEHAT is not set
+# CONFIG_JOYSTICK_RPISENSE is not set
+# CONFIG_INPUT_TABLET is not set
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_TOUCHSCREEN_ADS7846=m
+# CONFIG_TOUCHSCREEN_AD7877 is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
+# CONFIG_TOUCHSCREEN_AR1021_I2C is not set
+# CONFIG_TOUCHSCREEN_ATMEL_MXT is not set
+# CONFIG_TOUCHSCREEN_AUO_PIXCIR is not set
+# CONFIG_TOUCHSCREEN_BU21013 is not set
+# CONFIG_TOUCHSCREEN_BU21029 is not set
+# CONFIG_TOUCHSCREEN_CHIPONE_ICN8318 is not set
+# CONFIG_TOUCHSCREEN_CY8CTMA140 is not set
+# CONFIG_TOUCHSCREEN_CY8CTMG110 is not set
+# CONFIG_TOUCHSCREEN_CYTTSP_CORE is not set
+# CONFIG_TOUCHSCREEN_CYTTSP4_CORE is not set
+# CONFIG_TOUCHSCREEN_DYNAPRO is not set
+# CONFIG_TOUCHSCREEN_HAMPSHIRE is not set
+# CONFIG_TOUCHSCREEN_EETI is not set
+CONFIG_TOUCHSCREEN_EGALAX=m
+# CONFIG_TOUCHSCREEN_EGALAX_SERIAL is not set
+# CONFIG_TOUCHSCREEN_EXC3000 is not set
+# CONFIG_TOUCHSCREEN_FUJITSU is not set
+# CONFIG_TOUCHSCREEN_GOODIX is not set
+# CONFIG_TOUCHSCREEN_HIDEEP is not set
+# CONFIG_TOUCHSCREEN_HYCON_HY46XX is not set
+# CONFIG_TOUCHSCREEN_ILI210X is not set
+# CONFIG_TOUCHSCREEN_ILITEK is not set
+# CONFIG_TOUCHSCREEN_S6SY761 is not set
+# CONFIG_TOUCHSCREEN_GUNZE is not set
+# CONFIG_TOUCHSCREEN_EKTF2127 is not set
+# CONFIG_TOUCHSCREEN_ELAN is not set
+# CONFIG_TOUCHSCREEN_ELO is not set
+# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
+# CONFIG_TOUCHSCREEN_WACOM_I2C is not set
+# CONFIG_TOUCHSCREEN_MAX11801 is not set
+# CONFIG_TOUCHSCREEN_MCS5000 is not set
+# CONFIG_TOUCHSCREEN_MMS114 is not set
+# CONFIG_TOUCHSCREEN_MELFAS_MIP4 is not set
+# CONFIG_TOUCHSCREEN_MSG2638 is not set
+# CONFIG_TOUCHSCREEN_MTOUCH is not set
+# CONFIG_TOUCHSCREEN_IMAGIS is not set
+# CONFIG_TOUCHSCREEN_IMX6UL_TSC is not set
+# CONFIG_TOUCHSCREEN_INEXIO is not set
+# CONFIG_TOUCHSCREEN_MK712 is not set
+# CONFIG_TOUCHSCREEN_PENMOUNT is not set
+CONFIG_TOUCHSCREEN_EDT_FT5X06=m
+CONFIG_TOUCHSCREEN_RASPBERRYPI_FW=m
+# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
+# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
+# CONFIG_TOUCHSCREEN_PIXCIR is not set
+# CONFIG_TOUCHSCREEN_WDT87XX_I2C is not set
+CONFIG_TOUCHSCREEN_USB_COMPOSITE=m
+CONFIG_TOUCHSCREEN_USB_EGALAX=y
+# CONFIG_TOUCHSCREEN_USB_PANJIT is not set
+CONFIG_TOUCHSCREEN_USB_3M=y
+# CONFIG_TOUCHSCREEN_USB_ITM is not set
+# CONFIG_TOUCHSCREEN_USB_ETURBO is not set
+# CONFIG_TOUCHSCREEN_USB_GUNZE is not set
+# CONFIG_TOUCHSCREEN_USB_DMC_TSC10 is not set
+# CONFIG_TOUCHSCREEN_USB_IRTOUCH is not set
+# CONFIG_TOUCHSCREEN_USB_IDEALTEK is not set
+# CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH is not set
+# CONFIG_TOUCHSCREEN_USB_GOTOP is not set
+# CONFIG_TOUCHSCREEN_USB_JASTEC is not set
+# CONFIG_TOUCHSCREEN_USB_ELO is not set
+# CONFIG_TOUCHSCREEN_USB_E2I is not set
+# CONFIG_TOUCHSCREEN_USB_ZYTRONIC is not set
+# CONFIG_TOUCHSCREEN_USB_ETT_TC45USB is not set
+# CONFIG_TOUCHSCREEN_USB_NEXIO is not set
+# CONFIG_TOUCHSCREEN_USB_EASYTOUCH is not set
+# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
+# CONFIG_TOUCHSCREEN_TSC_SERIO is not set
+# CONFIG_TOUCHSCREEN_TSC2004 is not set
+# CONFIG_TOUCHSCREEN_TSC2005 is not set
+# CONFIG_TOUCHSCREEN_TSC2007 is not set
+# CONFIG_TOUCHSCREEN_RM_TS is not set
+# CONFIG_TOUCHSCREEN_SILEAD is not set
+# CONFIG_TOUCHSCREEN_SIS_I2C is not set
+CONFIG_TOUCHSCREEN_ST1232=m
+# CONFIG_TOUCHSCREEN_STMFTS is not set
+# CONFIG_TOUCHSCREEN_SUR40 is not set
+# CONFIG_TOUCHSCREEN_SURFACE3_SPI is not set
+# CONFIG_TOUCHSCREEN_SX8654 is not set
+# CONFIG_TOUCHSCREEN_TPS6507X is not set
+# CONFIG_TOUCHSCREEN_ZET6223 is not set
+# CONFIG_TOUCHSCREEN_ZFORCE is not set
+# CONFIG_TOUCHSCREEN_ROHM_BU21023 is not set
+# CONFIG_TOUCHSCREEN_IQS5XX is not set
+# CONFIG_TOUCHSCREEN_ZINITIX is not set
+CONFIG_INPUT_MISC=y
+# CONFIG_INPUT_AD714X is not set
+# CONFIG_INPUT_ARIZONA_HAPTICS is not set
+# CONFIG_INPUT_ATMEL_CAPTOUCH is not set
+# CONFIG_INPUT_BMA150 is not set
+# CONFIG_INPUT_E3X0_BUTTON is not set
+# CONFIG_INPUT_MMA8450 is not set
+# CONFIG_INPUT_GPIO_BEEPER is not set
+# CONFIG_INPUT_GPIO_DECODER is not set
+# CONFIG_INPUT_GPIO_VIBRA is not set
+# CONFIG_INPUT_ATI_REMOTE2 is not set
+# CONFIG_INPUT_KEYSPAN_REMOTE is not set
+# CONFIG_INPUT_KXTJ9 is not set
+# CONFIG_INPUT_POWERMATE is not set
+# CONFIG_INPUT_YEALINK is not set
+# CONFIG_INPUT_CM109 is not set
+# CONFIG_INPUT_REGULATOR_HAPTIC is not set
+CONFIG_INPUT_UINPUT=y
+# CONFIG_INPUT_PCF8574 is not set
+# CONFIG_INPUT_PWM_BEEPER is not set
+# CONFIG_INPUT_PWM_VIBRA is not set
+CONFIG_INPUT_GPIO_ROTARY_ENCODER=m
+# CONFIG_INPUT_DA7280_HAPTICS is not set
+# CONFIG_INPUT_ADXL34X is not set
+# CONFIG_INPUT_IMS_PCU is not set
+# CONFIG_INPUT_IQS269A is not set
+# CONFIG_INPUT_IQS626A is not set
+# CONFIG_INPUT_IQS7222 is not set
+# CONFIG_INPUT_CMA3000 is not set
+# CONFIG_INPUT_DRV260X_HAPTICS is not set
+# CONFIG_INPUT_DRV2665_HAPTICS is not set
+# CONFIG_INPUT_DRV2667_HAPTICS is not set
+CONFIG_INPUT_RASPBERRYPI_BUTTON=y
+CONFIG_RMI4_CORE=y
+# CONFIG_RMI4_I2C is not set
+# CONFIG_RMI4_SPI is not set
+# CONFIG_RMI4_SMB is not set
+CONFIG_RMI4_F03=y
+CONFIG_RMI4_F03_SERIO=y
+CONFIG_RMI4_2D_SENSOR=y
+CONFIG_RMI4_F11=y
+CONFIG_RMI4_F12=y
+CONFIG_RMI4_F30=y
+# CONFIG_RMI4_F34 is not set
+# CONFIG_RMI4_F3A is not set
+# CONFIG_RMI4_F55 is not set
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_SERPORT=y
+# CONFIG_SERIO_AMBAKMI is not set
+# CONFIG_SERIO_PCIPS2 is not set
+# CONFIG_SERIO_LIBPS2 is not set
+# CONFIG_SERIO_RAW is not set
+# CONFIG_SERIO_ALTERA_PS2 is not set
+# CONFIG_SERIO_PS2MULT is not set
+# CONFIG_SERIO_ARC_PS2 is not set
+# CONFIG_SERIO_APBPS2 is not set
+# CONFIG_SERIO_GPIO_PS2 is not set
+# CONFIG_USERIO is not set
+# CONFIG_GAMEPORT is not set
+# end of Hardware I/O ports
+# end of Input device support
+
+#
+# Character devices
+#
+CONFIG_BRCM_CHAR_DRIVERS=y
+# CONFIG_BCM2708_VCMEM is not set
+CONFIG_BCM_VCIO=y
+CONFIG_BCM2835_SMI_DEV=m
+# CONFIG_RPIVID_MEM is not set
+CONFIG_TTY=y
+CONFIG_VT=y
+CONFIG_CONSOLE_TRANSLATIONS=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+CONFIG_UNIX98_PTYS=y
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_LDISC_AUTOLOAD=y
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_EARLYCON=y
+CONFIG_SERIAL_8250=y
+# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set
+CONFIG_SERIAL_8250_16550A_VARIANTS=y
+# CONFIG_SERIAL_8250_FINTEK is not set
+CONFIG_SERIAL_8250_CONSOLE=y
+# CONFIG_SERIAL_8250_DMA is not set
+CONFIG_SERIAL_8250_PCI=y
+CONFIG_SERIAL_8250_EXAR=y
+CONFIG_SERIAL_8250_NR_UARTS=1
+CONFIG_SERIAL_8250_RUNTIME_UARTS=0
+CONFIG_SERIAL_8250_EXTENDED=y
+# CONFIG_SERIAL_8250_MANY_PORTS is not set
+CONFIG_SERIAL_8250_SHARE_IRQ=y
+# CONFIG_SERIAL_8250_DETECT_IRQ is not set
+# CONFIG_SERIAL_8250_RSA is not set
+CONFIG_SERIAL_8250_BCM2835AUX=y
+CONFIG_SERIAL_8250_FSL=y
+# CONFIG_SERIAL_8250_DW is not set
+# CONFIG_SERIAL_8250_RT288X is not set
+CONFIG_SERIAL_8250_PERICOM=y
+CONFIG_SERIAL_8250_BCM7271=y
+CONFIG_SERIAL_OF_PLATFORM=y
+
+#
+# Non-8250 serial port support
+#
+# CONFIG_SERIAL_AMBA_PL010 is not set
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+# CONFIG_SERIAL_EARLYCON_ARM_SEMIHOST is not set
+# CONFIG_SERIAL_MAX3100 is not set
+# CONFIG_SERIAL_MAX310X is not set
+# CONFIG_SERIAL_UARTLITE is not set
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_JSM is not set
+# CONFIG_SERIAL_SIFIVE is not set
+# CONFIG_SERIAL_SCCNXP is not set
+# CONFIG_SERIAL_SC16IS7XX is not set
+# CONFIG_SERIAL_ALTERA_JTAGUART is not set
+# CONFIG_SERIAL_ALTERA_UART is not set
+# CONFIG_SERIAL_XILINX_PS_UART is not set
+# CONFIG_SERIAL_ARC is not set
+# CONFIG_SERIAL_RP2 is not set
+# CONFIG_SERIAL_FSL_LPUART is not set
+# CONFIG_SERIAL_FSL_LINFLEXUART is not set
+# CONFIG_SERIAL_CONEXANT_DIGICOLOR is not set
+# CONFIG_SERIAL_SPRD is not set
+# end of Serial drivers
+
+CONFIG_SERIAL_MCTRL_GPIO=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+# CONFIG_N_GSM is not set
+# CONFIG_NOZOMI is not set
+# CONFIG_NULL_TTY is not set
+# CONFIG_HVC_DCC is not set
+CONFIG_SERIAL_DEV_BUS=y
+CONFIG_SERIAL_DEV_CTRL_TTYPORT=y
+# CONFIG_TTY_PRINTK is not set
+# CONFIG_VIRTIO_CONSOLE is not set
+# CONFIG_IPMI_HANDLER is not set
+CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
+# CONFIG_HW_RANDOM_BA431 is not set
+CONFIG_HW_RANDOM_BCM2835=y
+CONFIG_HW_RANDOM_IPROC_RNG200=y
+# CONFIG_HW_RANDOM_CCTRNG is not set
+# CONFIG_HW_RANDOM_XIPHERA is not set
+CONFIG_HW_RANDOM_ARM_SMCCC_TRNG=y
+CONFIG_HW_RANDOM_CN10K=y
+# CONFIG_APPLICOM is not set
+CONFIG_DEVMEM=y
+CONFIG_DEVPORT=y
+# CONFIG_TCG_TPM is not set
+# CONFIG_XILLYBUS is not set
+# CONFIG_XILLYUSB is not set
+CONFIG_RANDOM_TRUST_CPU=y
+CONFIG_RANDOM_TRUST_BOOTLOADER=y
+CONFIG_RASPBERRYPI_GPIOMEM=y
+# end of Character devices
+
+#
+# I2C support
+#
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+# CONFIG_I2C_COMPAT is not set
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_MUX=y
+
+#
+# Multiplexer I2C Chip support
+#
+# CONFIG_I2C_ARB_GPIO_CHALLENGE is not set
+# CONFIG_I2C_MUX_GPIO is not set
+# CONFIG_I2C_MUX_GPMUX is not set
+# CONFIG_I2C_MUX_LTC4306 is not set
+# CONFIG_I2C_MUX_PCA9541 is not set
+# CONFIG_I2C_MUX_PCA954x is not set
+CONFIG_I2C_MUX_PINCTRL=y
+# CONFIG_I2C_MUX_REG is not set
+# CONFIG_I2C_DEMUX_PINCTRL is not set
+# CONFIG_I2C_MUX_MLXCPLD is not set
+# end of Multiplexer I2C Chip support
+
+CONFIG_I2C_HELPER_AUTO=y
+CONFIG_I2C_ALGOBIT=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# PC SMBus host controller drivers
+#
+CONFIG_I2C_BCM2708=y
+CONFIG_I2C_BCM2708_BAUDRATE=100000
+# CONFIG_I2C_ALI1535 is not set
+# CONFIG_I2C_ALI1563 is not set
+# CONFIG_I2C_ALI15X3 is not set
+# CONFIG_I2C_AMD756 is not set
+# CONFIG_I2C_AMD8111 is not set
+# CONFIG_I2C_I801 is not set
+# CONFIG_I2C_ISCH is not set
+# CONFIG_I2C_PIIX4 is not set
+# CONFIG_I2C_NFORCE2 is not set
+# CONFIG_I2C_NVIDIA_GPU is not set
+# CONFIG_I2C_SIS5595 is not set
+# CONFIG_I2C_SIS630 is not set
+# CONFIG_I2C_SIS96X is not set
+# CONFIG_I2C_VIA is not set
+# CONFIG_I2C_VIAPRO is not set
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+CONFIG_I2C_BCM2835=y
+CONFIG_I2C_BRCMSTB=y
+# CONFIG_I2C_CADENCE is not set
+# CONFIG_I2C_CBUS_GPIO is not set
+CONFIG_I2C_DESIGNWARE_CORE=y
+# CONFIG_I2C_DESIGNWARE_SLAVE is not set
+CONFIG_I2C_DESIGNWARE_PLATFORM=y
+# CONFIG_I2C_DESIGNWARE_PCI is not set
+# CONFIG_I2C_EMEV2 is not set
+CONFIG_I2C_GPIO=y
+# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set
+# CONFIG_I2C_NOMADIK is not set
+# CONFIG_I2C_OCORES is not set
+# CONFIG_I2C_PCA_PLATFORM is not set
+# CONFIG_I2C_RK3X is not set
+# CONFIG_I2C_SIMTEC is not set
+# CONFIG_I2C_THUNDERX is not set
+# CONFIG_I2C_XILINX is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_DIOLAN_U2C is not set
+# CONFIG_I2C_CP2615 is not set
+# CONFIG_I2C_PCI1XXXX is not set
+# CONFIG_I2C_ROBOTFUZZ_OSIF is not set
+# CONFIG_I2C_TAOS_EVM is not set
+# CONFIG_I2C_TINY_USB is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_VIRTIO is not set
+# end of I2C Hardware Bus support
+
+# CONFIG_I2C_STUB is not set
+# CONFIG_I2C_SLAVE is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# end of I2C support
+
+# CONFIG_I3C is not set
+CONFIG_SPI=y
+# CONFIG_SPI_DEBUG is not set
+CONFIG_SPI_MASTER=y
+# CONFIG_SPI_MEM is not set
+
+#
+# SPI Master Controller Drivers
+#
+# CONFIG_SPI_ALTERA is not set
+# CONFIG_SPI_AXI_SPI_ENGINE is not set
+CONFIG_SPI_BCM2835=m
+CONFIG_SPI_BCM2835AUX=m
+# CONFIG_SPI_BCM_QSPI is not set
+# CONFIG_SPI_BITBANG is not set
+# CONFIG_SPI_CADENCE is not set
+# CONFIG_SPI_CADENCE_QUADSPI is not set
+CONFIG_SPI_DESIGNWARE=m
+CONFIG_SPI_DW_DMA=y
+# CONFIG_SPI_DW_PCI is not set
+CONFIG_SPI_DW_MMIO=m
+# CONFIG_SPI_NXP_FLEXSPI is not set
+# CONFIG_SPI_GPIO is not set
+# CONFIG_SPI_FSL_SPI is not set
+# CONFIG_SPI_MICROCHIP_CORE is not set
+# CONFIG_SPI_MICROCHIP_CORE_QSPI is not set
+# CONFIG_SPI_OC_TINY is not set
+# CONFIG_SPI_PL022 is not set
+# CONFIG_SPI_PXA2XX is not set
+# CONFIG_SPI_ROCKCHIP is not set
+# CONFIG_SPI_SC18IS602 is not set
+# CONFIG_SPI_SIFIVE is not set
+# CONFIG_SPI_MXIC is not set
+# CONFIG_SPI_THUNDERX is not set
+# CONFIG_SPI_XCOMM is not set
+# CONFIG_SPI_XILINX is not set
+# CONFIG_SPI_ZYNQMP_GQSPI is not set
+# CONFIG_SPI_AMD is not set
+
+#
+# SPI Multiplexer support
+#
+# CONFIG_SPI_MUX is not set
+
+#
+# SPI Protocol Masters
+#
+CONFIG_SPI_SPIDEV=y
+# CONFIG_SPI_LOOPBACK_TEST is not set
+# CONFIG_SPI_TLE62X0 is not set
+# CONFIG_SPI_SLAVE is not set
+CONFIG_SPI_DYNAMIC=y
+# CONFIG_SPMI is not set
+# CONFIG_HSI is not set
+# CONFIG_PPS is not set
+
+#
+# PTP clock support
+#
+# CONFIG_PTP_1588_CLOCK is not set
+CONFIG_PTP_1588_CLOCK_OPTIONAL=y
+
+#
+# Enable PHYLIB and NETWORK_PHY_TIMESTAMPING to see the additional clocks.
+#
+# end of PTP clock support
+
+CONFIG_PINCTRL=y
+CONFIG_PINMUX=y
+CONFIG_PINCONF=y
+CONFIG_GENERIC_PINCONF=y
+# CONFIG_DEBUG_PINCTRL is not set
+# CONFIG_PINCTRL_CY8C95X0 is not set
+# CONFIG_PINCTRL_MCP23S08 is not set
+# CONFIG_PINCTRL_MICROCHIP_SGPIO is not set
+# CONFIG_PINCTRL_OCELOT is not set
+# CONFIG_PINCTRL_SINGLE is not set
+# CONFIG_PINCTRL_STMFX is not set
+# CONFIG_PINCTRL_SX150X is not set
+CONFIG_PINCTRL_RP1=y
+CONFIG_PINCTRL_BCM2712=y
+CONFIG_PINCTRL_BCM2835=y
+
+#
+# Renesas pinctrl drivers
+#
+# end of Renesas pinctrl drivers
+
+CONFIG_GPIOLIB=y
+CONFIG_GPIOLIB_FASTPATH_LIMIT=512
+CONFIG_OF_GPIO=y
+CONFIG_GPIOLIB_IRQCHIP=y
+# CONFIG_DEBUG_GPIO is not set
+CONFIG_GPIO_SYSFS=y
+CONFIG_GPIO_CDEV=y
+CONFIG_GPIO_CDEV_V1=y
+CONFIG_GPIO_GENERIC=y
+
+#
+# Memory mapped GPIO drivers
+#
+# CONFIG_GPIO_74XX_MMIO is not set
+# CONFIG_GPIO_ALTERA is not set
+CONFIG_GPIO_RASPBERRYPI_EXP=y
+CONFIG_GPIO_BCM_VIRT=y
+CONFIG_GPIO_BRCMSTB=y
+# CONFIG_GPIO_CADENCE is not set
+# CONFIG_GPIO_DWAPB is not set
+# CONFIG_GPIO_EXAR is not set
+# CONFIG_GPIO_FTGPIO010 is not set
+# CONFIG_GPIO_GENERIC_PLATFORM is not set
+# CONFIG_GPIO_GRGPIO is not set
+# CONFIG_GPIO_HLWD is not set
+# CONFIG_GPIO_LOGICVC is not set
+# CONFIG_GPIO_MB86S7X is not set
+# CONFIG_GPIO_PL061 is not set
+# CONFIG_GPIO_PWM is not set
+# CONFIG_GPIO_SIFIVE is not set
+# CONFIG_GPIO_SYSCON is not set
+# CONFIG_GPIO_XGENE is not set
+# CONFIG_GPIO_XILINX is not set
+# CONFIG_GPIO_AMD_FCH is not set
+# end of Memory mapped GPIO drivers
+
+#
+# I2C GPIO expanders
+#
+# CONFIG_GPIO_ADNP is not set
+# CONFIG_GPIO_GW_PLD is not set
+# CONFIG_GPIO_MAX7300 is not set
+# CONFIG_GPIO_MAX732X is not set
+# CONFIG_GPIO_PCA953X is not set
+# CONFIG_GPIO_PCA9570 is not set
+# CONFIG_GPIO_PCF857X is not set
+# CONFIG_GPIO_TPIC2810 is not set
+# end of I2C GPIO expanders
+
+#
+# MFD GPIO expanders
+#
+CONFIG_GPIO_ARIZONA=m
+CONFIG_GPIO_FSM=y
+# end of MFD GPIO expanders
+
+#
+# PCI GPIO expanders
+#
+# CONFIG_GPIO_BT8XX is not set
+# CONFIG_GPIO_PCI_IDIO_16 is not set
+# CONFIG_GPIO_PCIE_IDIO_24 is not set
+# CONFIG_GPIO_RDC321X is not set
+# end of PCI GPIO expanders
+
+#
+# SPI GPIO expanders
+#
+# CONFIG_GPIO_74X164 is not set
+# CONFIG_GPIO_MAX3191X is not set
+# CONFIG_GPIO_MAX7301 is not set
+# CONFIG_GPIO_MC33880 is not set
+# CONFIG_GPIO_PISOSR is not set
+# CONFIG_GPIO_XRA1403 is not set
+# end of SPI GPIO expanders
+
+#
+# USB GPIO expanders
+#
+# end of USB GPIO expanders
+
+#
+# Virtual GPIO drivers
+#
+# CONFIG_GPIO_AGGREGATOR is not set
+# CONFIG_GPIO_MOCKUP is not set
+# CONFIG_GPIO_SIM is not set
+# end of Virtual GPIO drivers
+
+CONFIG_W1=m
+
+#
+# 1-wire Bus Masters
+#
+# CONFIG_W1_MASTER_MATROX is not set
+# CONFIG_W1_MASTER_DS2490 is not set
+# CONFIG_W1_MASTER_DS2482 is not set
+# CONFIG_W1_MASTER_DS1WM is not set
+CONFIG_W1_MASTER_GPIO=m
+# CONFIG_W1_MASTER_SGI is not set
+# end of 1-wire Bus Masters
+
+#
+# 1-wire Slaves
+#
+CONFIG_W1_SLAVE_THERM=m
+# CONFIG_W1_SLAVE_SMEM is not set
+# CONFIG_W1_SLAVE_DS2405 is not set
+# CONFIG_W1_SLAVE_DS2408 is not set
+# CONFIG_W1_SLAVE_DS2413 is not set
+# CONFIG_W1_SLAVE_DS2406 is not set
+# CONFIG_W1_SLAVE_DS2423 is not set
+# CONFIG_W1_SLAVE_DS2805 is not set
+# CONFIG_W1_SLAVE_DS2430 is not set
+# CONFIG_W1_SLAVE_DS2431 is not set
+# CONFIG_W1_SLAVE_DS2433 is not set
+# CONFIG_W1_SLAVE_DS2438 is not set
+# CONFIG_W1_SLAVE_DS250X is not set
+# CONFIG_W1_SLAVE_DS2780 is not set
+# CONFIG_W1_SLAVE_DS2781 is not set
+# CONFIG_W1_SLAVE_DS28E04 is not set
+# CONFIG_W1_SLAVE_DS28E17 is not set
+# end of 1-wire Slaves
+
+CONFIG_POWER_RESET=y
+# CONFIG_POWER_RESET_BRCMSTB is not set
+CONFIG_POWER_RESET_GPIO=y
+CONFIG_POWER_RESET_GPIO_RESTART=y
+# CONFIG_POWER_RESET_LTC2952 is not set
+# CONFIG_POWER_RESET_REGULATOR is not set
+CONFIG_POWER_RESET_RESTART=y
+# CONFIG_POWER_RESET_XGENE is not set
+# CONFIG_POWER_RESET_SYSCON is not set
+# CONFIG_POWER_RESET_SYSCON_POWEROFF is not set
+# CONFIG_SYSCON_REBOOT_MODE is not set
+# CONFIG_NVMEM_REBOOT_MODE is not set
+CONFIG_POWER_SUPPLY=y
+# CONFIG_POWER_SUPPLY_DEBUG is not set
+CONFIG_POWER_SUPPLY_HWMON=y
+CONFIG_RPI_POE_POWER=m
+# CONFIG_PDA_POWER is not set
+# CONFIG_IP5XXX_POWER is not set
+# CONFIG_TEST_POWER is not set
+# CONFIG_CHARGER_ADP5061 is not set
+# CONFIG_BATTERY_CW2015 is not set
+# CONFIG_BATTERY_DS2760 is not set
+# CONFIG_BATTERY_DS2780 is not set
+# CONFIG_BATTERY_DS2781 is not set
+# CONFIG_BATTERY_DS2782 is not set
+# CONFIG_BATTERY_SAMSUNG_SDI is not set
+# CONFIG_BATTERY_SBS is not set
+# CONFIG_CHARGER_SBS is not set
+# CONFIG_MANAGER_SBS is not set
+# CONFIG_BATTERY_BQ27XXX is not set
+# CONFIG_BATTERY_MAX17040 is not set
+# CONFIG_BATTERY_MAX17042 is not set
+# CONFIG_BATTERY_MAX1721X is not set
+# CONFIG_CHARGER_ISP1704 is not set
+# CONFIG_CHARGER_MAX8903 is not set
+# CONFIG_CHARGER_LP8727 is not set
+# CONFIG_CHARGER_GPIO is not set
+# CONFIG_CHARGER_MANAGER is not set
+# CONFIG_CHARGER_LT3651 is not set
+# CONFIG_CHARGER_LTC4162L is not set
+# CONFIG_CHARGER_DETECTOR_MAX14656 is not set
+# CONFIG_CHARGER_MAX77976 is not set
+# CONFIG_CHARGER_BQ2415X is not set
+# CONFIG_CHARGER_BQ24190 is not set
+# CONFIG_CHARGER_BQ24257 is not set
+# CONFIG_CHARGER_BQ24735 is not set
+# CONFIG_CHARGER_BQ2515X is not set
+# CONFIG_CHARGER_BQ25890 is not set
+# CONFIG_CHARGER_BQ25980 is not set
+# CONFIG_CHARGER_BQ256XX is not set
+# CONFIG_CHARGER_SMB347 is not set
+# CONFIG_BATTERY_GAUGE_LTC2941 is not set
+# CONFIG_BATTERY_GOLDFISH is not set
+# CONFIG_BATTERY_RT5033 is not set
+# CONFIG_CHARGER_RT9455 is not set
+# CONFIG_CHARGER_UCS1002 is not set
+# CONFIG_CHARGER_BD99954 is not set
+# CONFIG_BATTERY_UG3105 is not set
+CONFIG_HWMON=y
+# CONFIG_HWMON_DEBUG_CHIP is not set
+
+#
+# Native drivers
+#
+# CONFIG_SENSORS_AD7314 is not set
+# CONFIG_SENSORS_AD7414 is not set
+# CONFIG_SENSORS_AD7418 is not set
+# CONFIG_SENSORS_ADM1021 is not set
+# CONFIG_SENSORS_ADM1025 is not set
+# CONFIG_SENSORS_ADM1026 is not set
+# CONFIG_SENSORS_ADM1029 is not set
+# CONFIG_SENSORS_ADM1031 is not set
+# CONFIG_SENSORS_ADM1177 is not set
+# CONFIG_SENSORS_ADM9240 is not set
+# CONFIG_SENSORS_ADT7310 is not set
+# CONFIG_SENSORS_ADT7410 is not set
+# CONFIG_SENSORS_ADT7411 is not set
+# CONFIG_SENSORS_ADT7462 is not set
+# CONFIG_SENSORS_ADT7470 is not set
+# CONFIG_SENSORS_ADT7475 is not set
+# CONFIG_SENSORS_AHT10 is not set
+# CONFIG_SENSORS_AQUACOMPUTER_D5NEXT is not set
+# CONFIG_SENSORS_AS370 is not set
+# CONFIG_SENSORS_ASC7621 is not set
+# CONFIG_SENSORS_AXI_FAN_CONTROL is not set
+# CONFIG_SENSORS_ATXP1 is not set
+# CONFIG_SENSORS_CORSAIR_CPRO is not set
+# CONFIG_SENSORS_CORSAIR_PSU is not set
+# CONFIG_SENSORS_DS620 is not set
+# CONFIG_SENSORS_DS1621 is not set
+# CONFIG_SENSORS_I5K_AMB is not set
+# CONFIG_SENSORS_F71805F is not set
+# CONFIG_SENSORS_F71882FG is not set
+# CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_FTSTEUTATES is not set
+# CONFIG_SENSORS_GL518SM is not set
+# CONFIG_SENSORS_GL520SM is not set
+# CONFIG_SENSORS_G760A is not set
+# CONFIG_SENSORS_G762 is not set
+CONFIG_SENSORS_GPIO_FAN=m
+# CONFIG_SENSORS_HIH6130 is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_JC42 is not set
+# CONFIG_SENSORS_POWR1220 is not set
+# CONFIG_SENSORS_LINEAGE is not set
+# CONFIG_SENSORS_LTC2945 is not set
+# CONFIG_SENSORS_LTC2947_I2C is not set
+# CONFIG_SENSORS_LTC2947_SPI is not set
+# CONFIG_SENSORS_LTC2990 is not set
+# CONFIG_SENSORS_LTC2992 is not set
+# CONFIG_SENSORS_LTC4151 is not set
+# CONFIG_SENSORS_LTC4215 is not set
+# CONFIG_SENSORS_LTC4222 is not set
+# CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LTC4260 is not set
+# CONFIG_SENSORS_LTC4261 is not set
+# CONFIG_SENSORS_MAX1111 is not set
+# CONFIG_SENSORS_MAX127 is not set
+# CONFIG_SENSORS_MAX16065 is not set
+# CONFIG_SENSORS_MAX1619 is not set
+# CONFIG_SENSORS_MAX1668 is not set
+# CONFIG_SENSORS_MAX197 is not set
+# CONFIG_SENSORS_MAX31722 is not set
+# CONFIG_SENSORS_MAX31730 is not set
+# CONFIG_SENSORS_MAX31760 is not set
+# CONFIG_SENSORS_MAX6620 is not set
+# CONFIG_SENSORS_MAX6621 is not set
+# CONFIG_SENSORS_MAX6639 is not set
+# CONFIG_SENSORS_MAX6642 is not set
+# CONFIG_SENSORS_MAX6650 is not set
+# CONFIG_SENSORS_MAX6697 is not set
+# CONFIG_SENSORS_MAX31790 is not set
+# CONFIG_SENSORS_MCP3021 is not set
+# CONFIG_SENSORS_TC654 is not set
+# CONFIG_SENSORS_TPS23861 is not set
+# CONFIG_SENSORS_MR75203 is not set
+# CONFIG_SENSORS_ADCXX is not set
+# CONFIG_SENSORS_LM63 is not set
+# CONFIG_SENSORS_LM70 is not set
+# CONFIG_SENSORS_LM73 is not set
+# CONFIG_SENSORS_LM75 is not set
+# CONFIG_SENSORS_LM77 is not set
+# CONFIG_SENSORS_LM78 is not set
+# CONFIG_SENSORS_LM80 is not set
+# CONFIG_SENSORS_LM83 is not set
+# CONFIG_SENSORS_LM85 is not set
+# CONFIG_SENSORS_LM87 is not set
+# CONFIG_SENSORS_LM90 is not set
+# CONFIG_SENSORS_LM92 is not set
+# CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LM95234 is not set
+# CONFIG_SENSORS_LM95241 is not set
+# CONFIG_SENSORS_LM95245 is not set
+# CONFIG_SENSORS_PC87360 is not set
+# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_NCT6683 is not set
+# CONFIG_SENSORS_NCT6775 is not set
+# CONFIG_SENSORS_NCT6775_I2C is not set
+# CONFIG_SENSORS_NCT7802 is not set
+# CONFIG_SENSORS_NCT7904 is not set
+# CONFIG_SENSORS_NPCM7XX is not set
+# CONFIG_SENSORS_NZXT_KRAKEN2 is not set
+# CONFIG_SENSORS_NZXT_SMART2 is not set
+# CONFIG_SENSORS_OCC_P8_I2C is not set
+# CONFIG_SENSORS_PCF8591 is not set
+# CONFIG_PMBUS is not set
+CONFIG_SENSORS_PWM_FAN=m
+CONFIG_SENSORS_RASPBERRYPI_HWMON=y
+# CONFIG_SENSORS_SBTSI is not set
+# CONFIG_SENSORS_SBRMI is not set
+# CONFIG_SENSORS_SHT15 is not set
+# CONFIG_SENSORS_SHT21 is not set
+# CONFIG_SENSORS_SHT3x is not set
+# CONFIG_SENSORS_SHT4x is not set
+# CONFIG_SENSORS_SHTC1 is not set
+# CONFIG_SENSORS_SIS5595 is not set
+# CONFIG_SENSORS_DME1737 is not set
+# CONFIG_SENSORS_EMC1403 is not set
+# CONFIG_SENSORS_EMC2103 is not set
+# CONFIG_SENSORS_EMC2305 is not set
+# CONFIG_SENSORS_EMC6W201 is not set
+# CONFIG_SENSORS_SMSC47M1 is not set
+# CONFIG_SENSORS_SMSC47M192 is not set
+# CONFIG_SENSORS_SMSC47B397 is not set
+# CONFIG_SENSORS_SCH5627 is not set
+# CONFIG_SENSORS_SCH5636 is not set
+# CONFIG_SENSORS_STTS751 is not set
+# CONFIG_SENSORS_SMM665 is not set
+# CONFIG_SENSORS_ADC128D818 is not set
+# CONFIG_SENSORS_ADS7828 is not set
+# CONFIG_SENSORS_ADS7871 is not set
+# CONFIG_SENSORS_AMC6821 is not set
+# CONFIG_SENSORS_INA209 is not set
+# CONFIG_SENSORS_INA2XX is not set
+# CONFIG_SENSORS_INA238 is not set
+# CONFIG_SENSORS_INA3221 is not set
+# CONFIG_SENSORS_TC74 is not set
+# CONFIG_SENSORS_THMC50 is not set
+# CONFIG_SENSORS_TMP102 is not set
+# CONFIG_SENSORS_TMP103 is not set
+# CONFIG_SENSORS_TMP108 is not set
+# CONFIG_SENSORS_TMP401 is not set
+# CONFIG_SENSORS_TMP421 is not set
+# CONFIG_SENSORS_TMP464 is not set
+# CONFIG_SENSORS_TMP513 is not set
+# CONFIG_SENSORS_VIA686A is not set
+# CONFIG_SENSORS_VT1211 is not set
+# CONFIG_SENSORS_VT8231 is not set
+# CONFIG_SENSORS_W83773G is not set
+# CONFIG_SENSORS_W83781D is not set
+# CONFIG_SENSORS_W83791D is not set
+# CONFIG_SENSORS_W83792D is not set
+# CONFIG_SENSORS_W83793 is not set
+# CONFIG_SENSORS_W83795 is not set
+# CONFIG_SENSORS_W83L785TS is not set
+# CONFIG_SENSORS_W83L786NG is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
+CONFIG_SENSORS_RP1_ADC=m
+CONFIG_THERMAL=y
+# CONFIG_THERMAL_NETLINK is not set
+# CONFIG_THERMAL_STATISTICS is not set
+CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0
+CONFIG_THERMAL_HWMON=y
+CONFIG_THERMAL_OF=y
+# CONFIG_THERMAL_WRITABLE_TRIPS is not set
+CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y
+# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set
+# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set
+# CONFIG_THERMAL_GOV_FAIR_SHARE is not set
+CONFIG_THERMAL_GOV_STEP_WISE=y
+# CONFIG_THERMAL_GOV_BANG_BANG is not set
+# CONFIG_THERMAL_GOV_USER_SPACE is not set
+# CONFIG_CPU_THERMAL is not set
+# CONFIG_THERMAL_EMULATION is not set
+# CONFIG_THERMAL_MMIO is not set
+
+#
+# Broadcom thermal drivers
+#
+CONFIG_BCM2711_THERMAL=y
+CONFIG_BCM2835_THERMAL=y
+# CONFIG_BRCMSTB_THERMAL is not set
+# end of Broadcom thermal drivers
+
+CONFIG_WATCHDOG=y
+CONFIG_WATCHDOG_CORE=y
+# CONFIG_WATCHDOG_NOWAYOUT is not set
+CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y
+CONFIG_WATCHDOG_OPEN_TIMEOUT=0
+# CONFIG_WATCHDOG_SYSFS is not set
+# CONFIG_WATCHDOG_HRTIMER_PRETIMEOUT is not set
+
+#
+# Watchdog Pretimeout Governors
+#
+# CONFIG_WATCHDOG_PRETIMEOUT_GOV is not set
+
+#
+# Watchdog Device Drivers
+#
+# CONFIG_SOFT_WATCHDOG is not set
+# CONFIG_GPIO_WATCHDOG is not set
+# CONFIG_XILINX_WATCHDOG is not set
+# CONFIG_ZIIRAVE_WATCHDOG is not set
+# CONFIG_ARM_SP805_WATCHDOG is not set
+# CONFIG_ARM_SBSA_WATCHDOG is not set
+# CONFIG_CADENCE_WATCHDOG is not set
+# CONFIG_DW_WATCHDOG is not set
+# CONFIG_MAX63XX_WATCHDOG is not set
+# CONFIG_ARM_SMC_WATCHDOG is not set
+# CONFIG_ALIM7101_WDT is not set
+# CONFIG_I6300ESB_WDT is not set
+# CONFIG_HP_WATCHDOG is not set
+CONFIG_BCM2835_WDT=y
+# CONFIG_BCM7038_WDT is not set
+# CONFIG_MEN_A21_WDT is not set
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+
+#
+# USB-based Watchdog Cards
+#
+# CONFIG_USBPCWATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
+CONFIG_SSB=m
+CONFIG_SSB_SPROM=y
+CONFIG_SSB_BLOCKIO=y
+CONFIG_SSB_PCIHOST_POSSIBLE=y
+CONFIG_SSB_PCIHOST=y
+CONFIG_SSB_B43_PCI_BRIDGE=y
+CONFIG_SSB_SDIOHOST_POSSIBLE=y
+# CONFIG_SSB_SDIOHOST is not set
+CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y
+CONFIG_SSB_DRIVER_PCICORE=y
+# CONFIG_SSB_DRIVER_GPIO is not set
+CONFIG_BCMA_POSSIBLE=y
+CONFIG_BCMA=m
+CONFIG_BCMA_BLOCKIO=y
+CONFIG_BCMA_HOST_PCI_POSSIBLE=y
+CONFIG_BCMA_HOST_PCI=y
+# CONFIG_BCMA_HOST_SOC is not set
+CONFIG_BCMA_DRIVER_PCI=y
+CONFIG_BCMA_DRIVER_GMAC_CMN=y
+# CONFIG_BCMA_DRIVER_GPIO is not set
+# CONFIG_BCMA_DEBUG is not set
+
+#
+# Multifunction device drivers
+#
+CONFIG_MFD_CORE=y
+# CONFIG_MFD_RPISENSE_CORE is not set
+# CONFIG_MFD_ACT8945A is not set
+# CONFIG_MFD_AS3711 is not set
+# CONFIG_MFD_AS3722 is not set
+# CONFIG_PMIC_ADP5520 is not set
+# CONFIG_MFD_AAT2870_CORE is not set
+# CONFIG_MFD_ATMEL_FLEXCOM is not set
+# CONFIG_MFD_ATMEL_HLCDC is not set
+# CONFIG_MFD_BCM590XX is not set
+# CONFIG_MFD_BD9571MWV is not set
+# CONFIG_MFD_AXP20X_I2C is not set
+# CONFIG_MFD_MADERA is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_DA9052_SPI is not set
+# CONFIG_MFD_DA9052_I2C is not set
+# CONFIG_MFD_DA9055 is not set
+# CONFIG_MFD_DA9062 is not set
+# CONFIG_MFD_DA9063 is not set
+# CONFIG_MFD_DA9150 is not set
+# CONFIG_MFD_DLN2 is not set
+# CONFIG_MFD_GATEWORKS_GSC is not set
+# CONFIG_MFD_MC13XXX_SPI is not set
+# CONFIG_MFD_MC13XXX_I2C is not set
+# CONFIG_MFD_MP2629 is not set
+# CONFIG_MFD_HI6421_PMIC is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_HTC_I2CPLD is not set
+# CONFIG_LPC_ICH is not set
+# CONFIG_LPC_SCH is not set
+# CONFIG_MFD_IQS62X is not set
+# CONFIG_MFD_JANZ_CMODIO is not set
+# CONFIG_MFD_KEMPLD is not set
+# CONFIG_MFD_88PM800 is not set
+# CONFIG_MFD_88PM805 is not set
+# CONFIG_MFD_88PM860X is not set
+# CONFIG_MFD_MAX14577 is not set
+# CONFIG_MFD_MAX77620 is not set
+# CONFIG_MFD_MAX77650 is not set
+# CONFIG_MFD_MAX77686 is not set
+# CONFIG_MFD_MAX77693 is not set
+# CONFIG_MFD_MAX77714 is not set
+# CONFIG_MFD_MAX77843 is not set
+# CONFIG_MFD_MAX8907 is not set
+# CONFIG_MFD_MAX8925 is not set
+# CONFIG_MFD_MAX8997 is not set
+# CONFIG_MFD_MAX8998 is not set
+# CONFIG_MFD_MT6360 is not set
+# CONFIG_MFD_MT6370 is not set
+# CONFIG_MFD_MT6397 is not set
+# CONFIG_MFD_MENF21BMC is not set
+# CONFIG_MFD_OCELOT is not set
+# CONFIG_EZX_PCAP is not set
+# CONFIG_MFD_CPCAP is not set
+# CONFIG_MFD_VIPERBOARD is not set
+# CONFIG_MFD_NTXEC is not set
+# CONFIG_MFD_RETU is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_MFD_SY7636A is not set
+CONFIG_MFD_RASPBERRYPI_POE_HAT=m
+# CONFIG_MFD_RDC321X is not set
+# CONFIG_MFD_RT4831 is not set
+# CONFIG_MFD_RT5033 is not set
+# CONFIG_MFD_RT5120 is not set
+# CONFIG_MFD_RC5T583 is not set
+# CONFIG_MFD_RK808 is not set
+# CONFIG_MFD_RN5T618 is not set
+# CONFIG_MFD_SEC_CORE is not set
+# CONFIG_MFD_SI476X_CORE is not set
+CONFIG_MFD_SIMPLE_MFD_I2C=m
+# CONFIG_MFD_SM501 is not set
+# CONFIG_MFD_SKY81452 is not set
+# CONFIG_MFD_STMPE is not set
+CONFIG_MFD_SYSCON=y
+# CONFIG_MFD_TI_AM335X_TSCADC is not set
+# CONFIG_MFD_LP3943 is not set
+# CONFIG_MFD_LP8788 is not set
+# CONFIG_MFD_TI_LMU is not set
+# CONFIG_MFD_PALMAS is not set
+# CONFIG_TPS6105X is not set
+# CONFIG_TPS65010 is not set
+# CONFIG_TPS6507X is not set
+# CONFIG_MFD_TPS65086 is not set
+# CONFIG_MFD_TPS65090 is not set
+# CONFIG_MFD_TPS65217 is not set
+# CONFIG_MFD_TI_LP873X is not set
+# CONFIG_MFD_TI_LP87565 is not set
+# CONFIG_MFD_TPS65218 is not set
+# CONFIG_MFD_TPS6586X is not set
+# CONFIG_MFD_TPS65910 is not set
+# CONFIG_MFD_TPS65912_I2C is not set
+# CONFIG_MFD_TPS65912_SPI is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_TWL6040_CORE is not set
+# CONFIG_MFD_WL1273_CORE is not set
+# CONFIG_MFD_LM3533 is not set
+# CONFIG_MFD_TC3589X is not set
+# CONFIG_MFD_TQMX86 is not set
+# CONFIG_MFD_VX855 is not set
+# CONFIG_MFD_LOCHNAGAR is not set
+CONFIG_MFD_ARIZONA=m
+CONFIG_MFD_ARIZONA_I2C=m
+CONFIG_MFD_ARIZONA_SPI=m
+# CONFIG_MFD_CS47L24 is not set
+CONFIG_MFD_WM5102=y
+# CONFIG_MFD_WM5110 is not set
+# CONFIG_MFD_WM8997 is not set
+# CONFIG_MFD_WM8998 is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM831X_I2C is not set
+# CONFIG_MFD_WM831X_SPI is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_WM8994 is not set
+# CONFIG_MFD_ROHM_BD718XX is not set
+# CONFIG_MFD_ROHM_BD71828 is not set
+# CONFIG_MFD_ROHM_BD957XMUF is not set
+# CONFIG_MFD_STPMIC1 is not set
+# CONFIG_MFD_STMFX is not set
+# CONFIG_MFD_ATC260X_I2C is not set
+# CONFIG_MFD_QCOM_PM8008 is not set
+# CONFIG_RAVE_SP_CORE is not set
+# CONFIG_MFD_INTEL_M10_BMC is not set
+CONFIG_MFD_RP1=y
+# CONFIG_MFD_RSMU_I2C is not set
+# CONFIG_MFD_RSMU_SPI is not set
+# end of Multifunction device drivers
+
+CONFIG_REGULATOR=y
+# CONFIG_REGULATOR_DEBUG is not set
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
+# CONFIG_REGULATOR_VIRTUAL_CONSUMER is not set
+# CONFIG_REGULATOR_USERSPACE_CONSUMER is not set
+# CONFIG_REGULATOR_88PG86X is not set
+# CONFIG_REGULATOR_ACT8865 is not set
+# CONFIG_REGULATOR_AD5398 is not set
+CONFIG_REGULATOR_ARIZONA_LDO1=m
+CONFIG_REGULATOR_ARIZONA_MICSUPP=m
+# CONFIG_REGULATOR_DA9121 is not set
+# CONFIG_REGULATOR_DA9210 is not set
+# CONFIG_REGULATOR_DA9211 is not set
+# CONFIG_REGULATOR_FAN53555 is not set
+# CONFIG_REGULATOR_FAN53880 is not set
+CONFIG_REGULATOR_GPIO=y
+# CONFIG_REGULATOR_ISL9305 is not set
+# CONFIG_REGULATOR_ISL6271A is not set
+# CONFIG_REGULATOR_LP3971 is not set
+# CONFIG_REGULATOR_LP3972 is not set
+# CONFIG_REGULATOR_LP872X is not set
+# CONFIG_REGULATOR_LP8755 is not set
+# CONFIG_REGULATOR_LTC3589 is not set
+# CONFIG_REGULATOR_LTC3676 is not set
+# CONFIG_REGULATOR_MAX1586 is not set
+# CONFIG_REGULATOR_MAX8649 is not set
+# CONFIG_REGULATOR_MAX8660 is not set
+# CONFIG_REGULATOR_MAX8893 is not set
+# CONFIG_REGULATOR_MAX8952 is not set
+# CONFIG_REGULATOR_MAX8973 is not set
+# CONFIG_REGULATOR_MAX20086 is not set
+# CONFIG_REGULATOR_MAX77826 is not set
+# CONFIG_REGULATOR_MCP16502 is not set
+# CONFIG_REGULATOR_MP5416 is not set
+# CONFIG_REGULATOR_MP8859 is not set
+# CONFIG_REGULATOR_MP886X is not set
+# CONFIG_REGULATOR_MPQ7920 is not set
+# CONFIG_REGULATOR_MT6311 is not set
+# CONFIG_REGULATOR_PCA9450 is not set
+# CONFIG_REGULATOR_PF8X00 is not set
+# CONFIG_REGULATOR_PFUZE100 is not set
+# CONFIG_REGULATOR_PV88060 is not set
+# CONFIG_REGULATOR_PV88080 is not set
+# CONFIG_REGULATOR_PV88090 is not set
+# CONFIG_REGULATOR_PWM is not set
+CONFIG_REGULATOR_RASPBERRYPI_TOUCHSCREEN_ATTINY=y
+# CONFIG_REGULATOR_RT4801 is not set
+# CONFIG_REGULATOR_RT5190A is not set
+# CONFIG_REGULATOR_RT5759 is not set
+# CONFIG_REGULATOR_RT6160 is not set
+# CONFIG_REGULATOR_RT6245 is not set
+# CONFIG_REGULATOR_RTQ2134 is not set
+# CONFIG_REGULATOR_RTMV20 is not set
+# CONFIG_REGULATOR_RTQ6752 is not set
+# CONFIG_REGULATOR_SLG51000 is not set
+# CONFIG_REGULATOR_SY8106A is not set
+# CONFIG_REGULATOR_SY8824X is not set
+# CONFIG_REGULATOR_SY8827N is not set
+# CONFIG_REGULATOR_TPS51632 is not set
+# CONFIG_REGULATOR_TPS62360 is not set
+# CONFIG_REGULATOR_TPS6286X is not set
+# CONFIG_REGULATOR_TPS65023 is not set
+# CONFIG_REGULATOR_TPS6507X is not set
+# CONFIG_REGULATOR_TPS65132 is not set
+# CONFIG_REGULATOR_TPS6524X is not set
+# CONFIG_REGULATOR_VCTRL is not set
+CONFIG_RC_CORE=y
+CONFIG_BPF_LIRC_MODE2=y
+CONFIG_LIRC=y
+CONFIG_RC_MAP=m
+CONFIG_RC_DECODERS=y
+CONFIG_IR_IMON_DECODER=m
+CONFIG_IR_JVC_DECODER=m
+CONFIG_IR_MCE_KBD_DECODER=m
+CONFIG_IR_NEC_DECODER=m
+CONFIG_IR_RC5_DECODER=m
+CONFIG_IR_RC6_DECODER=m
+CONFIG_IR_RCMM_DECODER=m
+CONFIG_IR_SANYO_DECODER=m
+CONFIG_IR_SHARP_DECODER=m
+CONFIG_IR_SONY_DECODER=m
+CONFIG_IR_XMP_DECODER=m
+CONFIG_RC_DEVICES=y
+CONFIG_IR_GPIO_CIR=m
+CONFIG_IR_GPIO_TX=m
+# CONFIG_IR_HIX5HD2 is not set
+CONFIG_IR_IGORPLUGUSB=m
+CONFIG_IR_IGUANA=m
+CONFIG_IR_IMON=m
+CONFIG_IR_IMON_RAW=m
+CONFIG_IR_MCEUSB=m
+CONFIG_IR_PWM_TX=m
+CONFIG_IR_REDRAT3=m
+# CONFIG_IR_SERIAL is not set
+# CONFIG_IR_SPI is not set
+CONFIG_IR_STREAMZAP=m
+CONFIG_IR_TOY=m
+CONFIG_IR_TTUSBIR=m
+CONFIG_RC_ATI_REMOTE=m
+# CONFIG_RC_LOOPBACK is not set
+CONFIG_RC_XBOX_DVD=m
+CONFIG_CEC_CORE=y
+
+#
+# CEC support
+#
+# CONFIG_MEDIA_CEC_RC is not set
+# CONFIG_MEDIA_CEC_SUPPORT is not set
+# end of CEC support
+
+CONFIG_MEDIA_SUPPORT=m
+# CONFIG_MEDIA_SUPPORT_FILTER is not set
+CONFIG_MEDIA_SUBDRV_AUTOSELECT=y
+
+#
+# Media device types
+#
+CONFIG_MEDIA_CAMERA_SUPPORT=y
+CONFIG_MEDIA_ANALOG_TV_SUPPORT=y
+CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y
+CONFIG_MEDIA_RADIO_SUPPORT=y
+CONFIG_MEDIA_SDR_SUPPORT=y
+CONFIG_MEDIA_PLATFORM_SUPPORT=y
+CONFIG_MEDIA_TEST_SUPPORT=y
+# end of Media device types
+
+#
+# Media core support
+#
+CONFIG_VIDEO_DEV=m
+CONFIG_MEDIA_CONTROLLER=y
+CONFIG_DVB_CORE=m
+# end of Media core support
+
+#
+# Video4Linux options
+#
+CONFIG_VIDEO_V4L2_I2C=y
+CONFIG_VIDEO_V4L2_SUBDEV_API=y
+# CONFIG_VIDEO_ADV_DEBUG is not set
+# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set
+CONFIG_VIDEO_TUNER=m
+CONFIG_V4L2_MEM2MEM_DEV=m
+# CONFIG_V4L2_FLASH_LED_CLASS is not set
+CONFIG_V4L2_FWNODE=m
+CONFIG_V4L2_ASYNC=m
+# end of Video4Linux options
+
+#
+# Media controller options
+#
+CONFIG_MEDIA_CONTROLLER_DVB=y
+CONFIG_MEDIA_CONTROLLER_REQUEST_API=y
+# end of Media controller options
+
+#
+# Digital TV options
+#
+# CONFIG_DVB_MMAP is not set
+CONFIG_DVB_NET=y
+CONFIG_DVB_MAX_ADAPTERS=8
+# CONFIG_DVB_DYNAMIC_MINORS is not set
+# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set
+# CONFIG_DVB_ULE_DEBUG is not set
+# end of Digital TV options
+
+#
+# Media drivers
+#
+
+#
+# Media drivers
+#
+CONFIG_MEDIA_USB_SUPPORT=y
+
+#
+# Webcam devices
+#
+# CONFIG_USB_GSPCA is not set
+# CONFIG_USB_PWC is not set
+# CONFIG_USB_S2255 is not set
+CONFIG_VIDEO_USBTV=m
+CONFIG_USB_VIDEO_CLASS=m
+# CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV is not set
+
+#
+# Analog TV USB devices
+#
+# CONFIG_VIDEO_GO7007 is not set
+CONFIG_VIDEO_HDPVR=m
+CONFIG_VIDEO_PVRUSB2=m
+CONFIG_VIDEO_PVRUSB2_SYSFS=y
+CONFIG_VIDEO_PVRUSB2_DVB=y
+# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set
+CONFIG_VIDEO_STK1160_COMMON=m
+CONFIG_VIDEO_STK1160=m
+
+#
+# Analog/digital TV USB devices
+#
+CONFIG_VIDEO_AU0828=m
+CONFIG_VIDEO_AU0828_V4L2=y
+CONFIG_VIDEO_AU0828_RC=y
+CONFIG_VIDEO_CX231XX=m
+CONFIG_VIDEO_CX231XX_RC=y
+# CONFIG_VIDEO_CX231XX_ALSA is not set
+CONFIG_VIDEO_CX231XX_DVB=m
+
+#
+# Digital TV USB devices
+#
+CONFIG_DVB_AS102=m
+CONFIG_DVB_B2C2_FLEXCOP_USB=m
+# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set
+CONFIG_DVB_USB_V2=m
+CONFIG_DVB_USB_AF9015=m
+CONFIG_DVB_USB_AF9035=m
+CONFIG_DVB_USB_ANYSEE=m
+CONFIG_DVB_USB_AU6610=m
+CONFIG_DVB_USB_AZ6007=m
+CONFIG_DVB_USB_CE6230=m
+CONFIG_DVB_USB_DVBSKY=m
+CONFIG_DVB_USB_EC168=m
+CONFIG_DVB_USB_GL861=m
+CONFIG_DVB_USB_LME2510=m
+CONFIG_DVB_USB_MXL111SF=m
+CONFIG_DVB_USB_RTL28XXU=m
+CONFIG_DVB_USB_ZD1301=m
+CONFIG_DVB_USB=m
+# CONFIG_DVB_USB_DEBUG is not set
+CONFIG_DVB_USB_A800=m
+CONFIG_DVB_USB_AF9005=m
+CONFIG_DVB_USB_AF9005_REMOTE=m
+CONFIG_DVB_USB_AZ6027=m
+CONFIG_DVB_USB_CINERGY_T2=m
+CONFIG_DVB_USB_CXUSB=m
+# CONFIG_DVB_USB_CXUSB_ANALOG is not set
+CONFIG_DVB_USB_DIB0700=m
+CONFIG_DVB_USB_DIB3000MC=m
+CONFIG_DVB_USB_DIBUSB_MB=m
+CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y
+CONFIG_DVB_USB_DIBUSB_MC=m
+CONFIG_DVB_USB_DIGITV=m
+CONFIG_DVB_USB_DTT200U=m
+CONFIG_DVB_USB_DTV5100=m
+CONFIG_DVB_USB_DW2102=m
+CONFIG_DVB_USB_GP8PSK=m
+CONFIG_DVB_USB_M920X=m
+CONFIG_DVB_USB_NOVA_T_USB2=m
+CONFIG_DVB_USB_OPERA1=m
+CONFIG_DVB_USB_PCTV452E=m
+CONFIG_DVB_USB_TECHNISAT_USB2=m
+CONFIG_DVB_USB_TTUSB2=m
+CONFIG_DVB_USB_UMT_010=m
+CONFIG_DVB_USB_VP702X=m
+CONFIG_DVB_USB_VP7045=m
+CONFIG_SMS_USB_DRV=m
+# CONFIG_DVB_TTUSB_BUDGET is not set
+# CONFIG_DVB_TTUSB_DEC is not set
+
+#
+# Webcam, TV (analog/digital) USB devices
+#
+CONFIG_VIDEO_EM28XX=m
+# CONFIG_VIDEO_EM28XX_V4L2 is not set
+# CONFIG_VIDEO_EM28XX_ALSA is not set
+CONFIG_VIDEO_EM28XX_DVB=m
+CONFIG_VIDEO_EM28XX_RC=m
+
+#
+# Software defined radio USB devices
+#
+# CONFIG_USB_AIRSPY is not set
+# CONFIG_USB_HACKRF is not set
+# CONFIG_USB_MSI2500 is not set
+# CONFIG_MEDIA_PCI_SUPPORT is not set
+# CONFIG_RADIO_ADAPTERS is not set
+CONFIG_MEDIA_PLATFORM_DRIVERS=y
+# CONFIG_V4L_PLATFORM_DRIVERS is not set
+# CONFIG_SDR_PLATFORM_DRIVERS is not set
+# CONFIG_DVB_PLATFORM_DRIVERS is not set
+# CONFIG_V4L_MEM2MEM_DRIVERS is not set
+
+#
+# Allegro DVT media platform drivers
+#
+
+#
+# Amlogic media platform drivers
+#
+
+#
+# Amphion drivers
+#
+
+#
+# Aspeed media platform drivers
+#
+
+#
+# Atmel media platform drivers
+#
+# CONFIG_VIDEO_BCM2835_UNICAM is not set
+
+#
+# Cadence media platform drivers
+#
+# CONFIG_VIDEO_CADENCE_CSI2RX is not set
+# CONFIG_VIDEO_CADENCE_CSI2TX is not set
+
+#
+# Chips&Media media platform drivers
+#
+
+#
+# Intel media platform drivers
+#
+
+#
+# Marvell media platform drivers
+#
+
+#
+# Mediatek media platform drivers
+#
+
+#
+# NVidia media platform drivers
+#
+
+#
+# NXP media platform drivers
+#
+
+#
+# Qualcomm media platform drivers
+#
+
+#
+# Raspberry Pi media platform drivers
+#
+CONFIG_VIDEO_RASPBERRYPI_PISP_BE=m
+CONFIG_VIDEO_RP1_CFE=m
+
+#
+# Renesas media platform drivers
+#
+
+#
+# Rockchip media platform drivers
+#
+
+#
+# Samsung media platform drivers
+#
+
+#
+# STMicroelectronics media platform drivers
+#
+
+#
+# Sunxi media platform drivers
+#
+
+#
+# Texas Instruments drivers
+#
+
+#
+# Verisilicon media platform drivers
+#
+
+#
+# VIA media platform drivers
+#
+
+#
+# Xilinx media platform drivers
+#
+
+#
+# MMC/SDIO DVB adapters
+#
+CONFIG_SMS_SDIO_DRV=m
+# CONFIG_V4L_TEST_DRIVERS is not set
+# CONFIG_DVB_TEST_DRIVERS is not set
+CONFIG_MEDIA_COMMON_OPTIONS=y
+
+#
+# common driver options
+#
+CONFIG_CYPRESS_FIRMWARE=m
+CONFIG_TTPCI_EEPROM=m
+CONFIG_VIDEO_CX2341X=m
+CONFIG_VIDEO_TVEEPROM=m
+CONFIG_DVB_B2C2_FLEXCOP=m
+CONFIG_SMS_SIANO_MDTV=m
+CONFIG_SMS_SIANO_RC=y
+# CONFIG_SMS_SIANO_DEBUGFS is not set
+CONFIG_VIDEOBUF2_CORE=m
+CONFIG_VIDEOBUF2_V4L2=m
+CONFIG_VIDEOBUF2_MEMOPS=m
+CONFIG_VIDEOBUF2_DMA_CONTIG=m
+CONFIG_VIDEOBUF2_VMALLOC=m
+# end of Media drivers
+
+#
+# Media ancillary drivers
+#
+CONFIG_MEDIA_ATTACH=y
+
+#
+# IR I2C driver auto-selected by 'Autoselect ancillary drivers'
+#
+CONFIG_VIDEO_IR_I2C=m
+CONFIG_VIDEO_CAMERA_SENSOR=y
+# CONFIG_VIDEO_AR0521 is not set
+# CONFIG_VIDEO_ARDUCAM_64MP is not set
+# CONFIG_VIDEO_ARDUCAM_PIVARIETY is not set
+# CONFIG_VIDEO_HI556 is not set
+# CONFIG_VIDEO_HI846 is not set
+# CONFIG_VIDEO_HI847 is not set
+# CONFIG_VIDEO_IMX208 is not set
+# CONFIG_VIDEO_IMX214 is not set
+# CONFIG_VIDEO_IMX219 is not set
+# CONFIG_VIDEO_IMX258 is not set
+# CONFIG_VIDEO_IMX274 is not set
+# CONFIG_VIDEO_IMX290 is not set
+# CONFIG_VIDEO_IMX296 is not set
+# CONFIG_VIDEO_IMX319 is not set
+# CONFIG_VIDEO_IMX334 is not set
+# CONFIG_VIDEO_IMX335 is not set
+# CONFIG_VIDEO_IMX355 is not set
+# CONFIG_VIDEO_IMX412 is not set
+# CONFIG_VIDEO_IMX477 is not set
+# CONFIG_VIDEO_IMX519 is not set
+# CONFIG_VIDEO_IMX708 is not set
+# CONFIG_VIDEO_MT9M001 is not set
+# CONFIG_VIDEO_MT9M032 is not set
+# CONFIG_VIDEO_MT9M111 is not set
+# CONFIG_VIDEO_MT9P031 is not set
+# CONFIG_VIDEO_MT9T001 is not set
+# CONFIG_VIDEO_MT9T112 is not set
+# CONFIG_VIDEO_MT9V011 is not set
+# CONFIG_VIDEO_MT9V032 is not set
+# CONFIG_VIDEO_MT9V111 is not set
+# CONFIG_VIDEO_NOON010PC30 is not set
+# CONFIG_VIDEO_OG01A1B is not set
+# CONFIG_VIDEO_OV02A10 is not set
+# CONFIG_VIDEO_OV08D10 is not set
+# CONFIG_VIDEO_OV13858 is not set
+# CONFIG_VIDEO_OV13B10 is not set
+# CONFIG_VIDEO_OV2311 is not set
+# CONFIG_VIDEO_OV2640 is not set
+# CONFIG_VIDEO_OV2659 is not set
+# CONFIG_VIDEO_OV2680 is not set
+# CONFIG_VIDEO_OV2685 is not set
+# CONFIG_VIDEO_OV5640 is not set
+# CONFIG_VIDEO_OV5645 is not set
+# CONFIG_VIDEO_OV5647 is not set
+# CONFIG_VIDEO_OV5648 is not set
+# CONFIG_VIDEO_OV5670 is not set
+# CONFIG_VIDEO_OV5675 is not set
+# CONFIG_VIDEO_OV5693 is not set
+# CONFIG_VIDEO_OV5695 is not set
+# CONFIG_VIDEO_OV6650 is not set
+# CONFIG_VIDEO_OV7251 is not set
+# CONFIG_VIDEO_OV7640 is not set
+# CONFIG_VIDEO_OV7670 is not set
+# CONFIG_VIDEO_OV772X is not set
+# CONFIG_VIDEO_OV7740 is not set
+# CONFIG_VIDEO_OV8856 is not set
+# CONFIG_VIDEO_OV8865 is not set
+# CONFIG_VIDEO_OV9282 is not set
+# CONFIG_VIDEO_OV9640 is not set
+# CONFIG_VIDEO_OV9650 is not set
+# CONFIG_VIDEO_RDACM20 is not set
+# CONFIG_VIDEO_RDACM21 is not set
+# CONFIG_VIDEO_RJ54N1 is not set
+# CONFIG_VIDEO_S5C73M3 is not set
+# CONFIG_VIDEO_S5K4ECGX is not set
+# CONFIG_VIDEO_S5K5BAF is not set
+# CONFIG_VIDEO_S5K6A3 is not set
+# CONFIG_VIDEO_S5K6AA is not set
+# CONFIG_VIDEO_SR030PC30 is not set
+# CONFIG_VIDEO_VS6624 is not set
+# CONFIG_VIDEO_CCS is not set
+# CONFIG_VIDEO_ET8EK8 is not set
+# CONFIG_VIDEO_M5MOLS is not set
+
+#
+# Lens drivers
+#
+# CONFIG_VIDEO_AD5398 is not set
+# CONFIG_VIDEO_AD5820 is not set
+# CONFIG_VIDEO_AK7375 is not set
+# CONFIG_VIDEO_DW9714 is not set
+# CONFIG_VIDEO_DW9768 is not set
+# CONFIG_VIDEO_DW9807_VCM is not set
+# end of Lens drivers
+
+#
+# Flash devices
+#
+# CONFIG_VIDEO_ADP1653 is not set
+# CONFIG_VIDEO_LM3560 is not set
+# CONFIG_VIDEO_LM3646 is not set
+# end of Flash devices
+
+#
+# Audio decoders, processors and mixers
+#
+# CONFIG_VIDEO_CS3308 is not set
+# CONFIG_VIDEO_CS5345 is not set
+CONFIG_VIDEO_CS53L32A=m
+CONFIG_VIDEO_MSP3400=m
+# CONFIG_VIDEO_SONY_BTF_MPX is not set
+# CONFIG_VIDEO_TDA1997X is not set
+# CONFIG_VIDEO_TDA7432 is not set
+# CONFIG_VIDEO_TDA9840 is not set
+# CONFIG_VIDEO_TEA6415C is not set
+# CONFIG_VIDEO_TEA6420 is not set
+# CONFIG_VIDEO_TLV320AIC23B is not set
+# CONFIG_VIDEO_TVAUDIO is not set
+# CONFIG_VIDEO_UDA1342 is not set
+# CONFIG_VIDEO_VP27SMPX is not set
+# CONFIG_VIDEO_WM8739 is not set
+CONFIG_VIDEO_WM8775=m
+# end of Audio decoders, processors and mixers
+
+#
+# RDS decoders
+#
+# CONFIG_VIDEO_SAA6588 is not set
+# end of RDS decoders
+
+#
+# Video decoders
+#
+# CONFIG_VIDEO_ADV7180 is not set
+# CONFIG_VIDEO_ADV7183 is not set
+# CONFIG_VIDEO_ADV748X is not set
+# CONFIG_VIDEO_ADV7604 is not set
+# CONFIG_VIDEO_ADV7842 is not set
+# CONFIG_VIDEO_BT819 is not set
+# CONFIG_VIDEO_BT856 is not set
+# CONFIG_VIDEO_BT866 is not set
+# CONFIG_VIDEO_ISL7998X is not set
+# CONFIG_VIDEO_KS0127 is not set
+# CONFIG_VIDEO_MAX9286 is not set
+# CONFIG_VIDEO_ML86V7667 is not set
+# CONFIG_VIDEO_SAA7110 is not set
+CONFIG_VIDEO_SAA711X=m
+# CONFIG_VIDEO_TC358743 is not set
+# CONFIG_VIDEO_TVP514X is not set
+# CONFIG_VIDEO_TVP5150 is not set
+# CONFIG_VIDEO_TVP7002 is not set
+# CONFIG_VIDEO_TW2804 is not set
+# CONFIG_VIDEO_OV9281 is not set
+# CONFIG_VIDEO_TW9903 is not set
+# CONFIG_VIDEO_TW9906 is not set
+# CONFIG_VIDEO_TW9910 is not set
+# CONFIG_VIDEO_IRS1125 is not set
+# CONFIG_VIDEO_VPX3220 is not set
+
+#
+# Video and audio decoders
+#
+# CONFIG_VIDEO_SAA717X is not set
+CONFIG_VIDEO_CX25840=m
+# end of Video decoders
+
+#
+# Video encoders
+#
+# CONFIG_VIDEO_AD9389B is not set
+# CONFIG_VIDEO_ADV7170 is not set
+# CONFIG_VIDEO_ADV7175 is not set
+# CONFIG_VIDEO_ADV7343 is not set
+# CONFIG_VIDEO_ADV7393 is not set
+# CONFIG_VIDEO_ADV7511 is not set
+# CONFIG_VIDEO_AK881X is not set
+# CONFIG_VIDEO_SAA7127 is not set
+# CONFIG_VIDEO_SAA7185 is not set
+# CONFIG_VIDEO_THS8200 is not set
+# end of Video encoders
+
+#
+# Video improvement chips
+#
+# CONFIG_VIDEO_UPD64031A is not set
+# CONFIG_VIDEO_UPD64083 is not set
+# end of Video improvement chips
+
+#
+# Audio/Video compression chips
+#
+# CONFIG_VIDEO_SAA6752HS is not set
+# end of Audio/Video compression chips
+
+#
+# SDR tuner chips
+#
+# CONFIG_SDR_MAX2175 is not set
+# end of SDR tuner chips
+
+#
+# Miscellaneous helper chips
+#
+# CONFIG_VIDEO_I2C is not set
+# CONFIG_VIDEO_M52790 is not set
+# CONFIG_VIDEO_ST_MIPID02 is not set
+# CONFIG_VIDEO_THS7303 is not set
+# end of Miscellaneous helper chips
+
+#
+# Media SPI Adapters
+#
+CONFIG_CXD2880_SPI_DRV=m
+# CONFIG_VIDEO_GS1662 is not set
+# end of Media SPI Adapters
+
+CONFIG_MEDIA_TUNER=m
+
+#
+# Customize TV tuners
+#
+CONFIG_MEDIA_TUNER_E4000=m
+CONFIG_MEDIA_TUNER_FC0011=m
+CONFIG_MEDIA_TUNER_FC0012=m
+CONFIG_MEDIA_TUNER_FC0013=m
+CONFIG_MEDIA_TUNER_FC2580=m
+CONFIG_MEDIA_TUNER_IT913X=m
+# CONFIG_MEDIA_TUNER_M88RS6000T is not set
+CONFIG_MEDIA_TUNER_MAX2165=m
+CONFIG_MEDIA_TUNER_MC44S803=m
+# CONFIG_MEDIA_TUNER_MSI001 is not set
+CONFIG_MEDIA_TUNER_MT2060=m
+CONFIG_MEDIA_TUNER_MT2063=m
+CONFIG_MEDIA_TUNER_MT20XX=m
+# CONFIG_MEDIA_TUNER_MT2131 is not set
+CONFIG_MEDIA_TUNER_MT2266=m
+# CONFIG_MEDIA_TUNER_MXL301RF is not set
+CONFIG_MEDIA_TUNER_MXL5005S=m
+CONFIG_MEDIA_TUNER_MXL5007T=m
+# CONFIG_MEDIA_TUNER_QM1D1B0004 is not set
+CONFIG_MEDIA_TUNER_QM1D1C0042=m
+CONFIG_MEDIA_TUNER_QT1010=m
+CONFIG_MEDIA_TUNER_R820T=m
+CONFIG_MEDIA_TUNER_SI2157=m
+CONFIG_MEDIA_TUNER_SIMPLE=m
+CONFIG_MEDIA_TUNER_TDA18212=m
+CONFIG_MEDIA_TUNER_TDA18218=m
+CONFIG_MEDIA_TUNER_TDA18250=m
+CONFIG_MEDIA_TUNER_TDA18271=m
+CONFIG_MEDIA_TUNER_TDA827X=m
+CONFIG_MEDIA_TUNER_TDA8290=m
+CONFIG_MEDIA_TUNER_TDA9887=m
+CONFIG_MEDIA_TUNER_TEA5761=m
+CONFIG_MEDIA_TUNER_TEA5767=m
+CONFIG_MEDIA_TUNER_TUA9001=m
+CONFIG_MEDIA_TUNER_XC2028=m
+CONFIG_MEDIA_TUNER_XC4000=m
+CONFIG_MEDIA_TUNER_XC5000=m
+# end of Customize TV tuners
+
+#
+# Customise DVB Frontends
+#
+
+#
+# Multistandard (satellite) frontends
+#
+CONFIG_DVB_M88DS3103=m
+# CONFIG_DVB_MXL5XX is not set
+CONFIG_DVB_STB0899=m
+CONFIG_DVB_STB6100=m
+CONFIG_DVB_STV090x=m
+# CONFIG_DVB_STV0910 is not set
+CONFIG_DVB_STV6110x=m
+# CONFIG_DVB_STV6111 is not set
+
+#
+# Multistandard (cable + terrestrial) frontends
+#
+CONFIG_DVB_DRXK=m
+CONFIG_DVB_MN88472=m
+CONFIG_DVB_MN88473=m
+CONFIG_DVB_SI2165=m
+CONFIG_DVB_TDA18271C2DD=m
+
+#
+# DVB-S (satellite) frontends
+#
+# CONFIG_DVB_CX24110 is not set
+CONFIG_DVB_CX24116=m
+# CONFIG_DVB_CX24117 is not set
+CONFIG_DVB_CX24120=m
+CONFIG_DVB_CX24123=m
+CONFIG_DVB_DS3000=m
+# CONFIG_DVB_MB86A16 is not set
+CONFIG_DVB_MT312=m
+CONFIG_DVB_S5H1420=m
+CONFIG_DVB_SI21XX=m
+CONFIG_DVB_STB6000=m
+CONFIG_DVB_STV0288=m
+CONFIG_DVB_STV0299=m
+CONFIG_DVB_STV0900=m
+CONFIG_DVB_STV6110=m
+CONFIG_DVB_TDA10071=m
+CONFIG_DVB_TDA10086=m
+# CONFIG_DVB_TDA8083 is not set
+# CONFIG_DVB_TDA8261 is not set
+CONFIG_DVB_TDA826X=m
+CONFIG_DVB_TS2020=m
+# CONFIG_DVB_TUA6100 is not set
+CONFIG_DVB_TUNER_CX24113=m
+CONFIG_DVB_TUNER_ITD1000=m
+# CONFIG_DVB_VES1X93 is not set
+# CONFIG_DVB_ZL10036 is not set
+CONFIG_DVB_ZL10039=m
+
+#
+# DVB-T (terrestrial) frontends
+#
+CONFIG_DVB_AF9013=m
+CONFIG_DVB_AS102_FE=m
+# CONFIG_DVB_CX22700 is not set
+CONFIG_DVB_CX22702=m
+CONFIG_DVB_CXD2820R=m
+CONFIG_DVB_CXD2841ER=m
+CONFIG_DVB_DIB3000MB=m
+CONFIG_DVB_DIB3000MC=m
+CONFIG_DVB_DIB7000M=m
+CONFIG_DVB_DIB7000P=m
+# CONFIG_DVB_DIB9000 is not set
+CONFIG_DVB_DRXD=m
+CONFIG_DVB_EC100=m
+CONFIG_DVB_GP8PSK_FE=m
+# CONFIG_DVB_L64781 is not set
+CONFIG_DVB_MT352=m
+CONFIG_DVB_NXT6000=m
+CONFIG_DVB_RTL2830=m
+CONFIG_DVB_RTL2832=m
+CONFIG_DVB_RTL2832_SDR=m
+# CONFIG_DVB_S5H1432 is not set
+CONFIG_DVB_SI2168=m
+# CONFIG_DVB_SP887X is not set
+# CONFIG_DVB_STV0367 is not set
+CONFIG_DVB_TDA10048=m
+CONFIG_DVB_TDA1004X=m
+CONFIG_DVB_ZD1301_DEMOD=m
+CONFIG_DVB_ZL10353=m
+CONFIG_DVB_CXD2880=m
+
+#
+# DVB-C (cable) frontends
+#
+CONFIG_DVB_STV0297=m
+# CONFIG_DVB_TDA10021 is not set
+CONFIG_DVB_TDA10023=m
+# CONFIG_DVB_VES1820 is not set
+
+#
+# ATSC (North American/Korean Terrestrial/Cable DTV) frontends
+#
+CONFIG_DVB_AU8522=m
+CONFIG_DVB_AU8522_DTV=m
+CONFIG_DVB_AU8522_V4L=m
+CONFIG_DVB_BCM3510=m
+CONFIG_DVB_LG2160=m
+CONFIG_DVB_LGDT3305=m
+CONFIG_DVB_LGDT3306A=m
+CONFIG_DVB_LGDT330X=m
+CONFIG_DVB_MXL692=m
+CONFIG_DVB_NXT200X=m
+# CONFIG_DVB_OR51132 is not set
+# CONFIG_DVB_OR51211 is not set
+CONFIG_DVB_S5H1409=m
+CONFIG_DVB_S5H1411=m
+
+#
+# ISDB-T (terrestrial) frontends
+#
+CONFIG_DVB_DIB8000=m
+CONFIG_DVB_MB86A20S=m
+CONFIG_DVB_S921=m
+
+#
+# ISDB-S (satellite) & ISDB-T (terrestrial) frontends
+#
+# CONFIG_DVB_MN88443X is not set
+CONFIG_DVB_TC90522=m
+
+#
+# Digital terrestrial only tuners/PLL
+#
+CONFIG_DVB_PLL=m
+CONFIG_DVB_TUNER_DIB0070=m
+CONFIG_DVB_TUNER_DIB0090=m
+
+#
+# SEC control devices for DVB-S
+#
+CONFIG_DVB_A8293=m
+CONFIG_DVB_AF9033=m
+# CONFIG_DVB_ASCOT2E is not set
+CONFIG_DVB_ATBM8830=m
+# CONFIG_DVB_HELENE is not set
+# CONFIG_DVB_HORUS3A is not set
+# CONFIG_DVB_ISL6405 is not set
+CONFIG_DVB_ISL6421=m
+CONFIG_DVB_ISL6423=m
+CONFIG_DVB_IX2505V=m
+# CONFIG_DVB_LGS8GL5 is not set
+CONFIG_DVB_LGS8GXX=m
+# CONFIG_DVB_LNBH25 is not set
+# CONFIG_DVB_LNBH29 is not set
+CONFIG_DVB_LNBP21=m
+CONFIG_DVB_LNBP22=m
+CONFIG_DVB_M88RS2000=m
+# CONFIG_DVB_TDA665x is not set
+CONFIG_DVB_DRX39XYJ=m
+
+#
+# Common Interface (EN50221) controller drivers
+#
+# CONFIG_DVB_CXD2099 is not set
+CONFIG_DVB_SP2=m
+# end of Customise DVB Frontends
+
+#
+# Tools to develop new frontends
+#
+# CONFIG_DVB_DUMMY_FE is not set
+# end of Media ancillary drivers
+
+#
+# Graphics support
+#
+CONFIG_DRM=y
+CONFIG_DRM_MIPI_DSI=y
+# CONFIG_DRM_DEBUG_MM is not set
+CONFIG_DRM_KMS_HELPER=y
+# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set
+# CONFIG_DRM_DEBUG_MODESET_LOCK is not set
+CONFIG_DRM_FBDEV_EMULATION=y
+CONFIG_DRM_FBDEV_OVERALLOC=100
+# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set
+CONFIG_DRM_LOAD_EDID_FIRMWARE=y
+CONFIG_DRM_DISPLAY_HELPER=y
+CONFIG_DRM_DISPLAY_HDMI_HELPER=y
+# CONFIG_DRM_DP_AUX_CHARDEV is not set
+# CONFIG_DRM_DP_CEC is not set
+CONFIG_DRM_TTM=y
+CONFIG_DRM_VRAM_HELPER=y
+CONFIG_DRM_TTM_HELPER=y
+CONFIG_DRM_GEM_DMA_HELPER=y
+CONFIG_DRM_GEM_SHMEM_HELPER=y
+CONFIG_DRM_SCHED=y
+
+#
+# I2C encoder or helper chips
+#
+# CONFIG_DRM_I2C_CH7006 is not set
+# CONFIG_DRM_I2C_SIL164 is not set
+# CONFIG_DRM_I2C_NXP_TDA998X is not set
+# CONFIG_DRM_I2C_NXP_TDA9950 is not set
+# end of I2C encoder or helper chips
+
+#
+# ARM devices
+#
+# CONFIG_DRM_HDLCD is not set
+# CONFIG_DRM_MALI_DISPLAY is not set
+# CONFIG_DRM_KOMEDA is not set
+# end of ARM devices
+
+# CONFIG_DRM_RADEON is not set
+# CONFIG_DRM_AMDGPU is not set
+# CONFIG_DRM_NOUVEAU is not set
+# CONFIG_DRM_VGEM is not set
+# CONFIG_DRM_VKMS is not set
+# CONFIG_DRM_VMWGFX is not set
+# CONFIG_DRM_UDL is not set
+# CONFIG_DRM_AST is not set
+# CONFIG_DRM_MGAG200 is not set
+# CONFIG_DRM_RCAR_DW_HDMI is not set
+# CONFIG_DRM_RCAR_USE_LVDS is not set
+# CONFIG_DRM_RCAR_USE_MIPI_DSI is not set
+# CONFIG_DRM_QXL is not set
+CONFIG_DRM_PANEL=y
+
+#
+# Display Panels
+#
+# CONFIG_DRM_PANEL_ABT_Y030XX067A is not set
+# CONFIG_DRM_PANEL_ARM_VERSATILE is not set
+# CONFIG_DRM_PANEL_ASUS_Z00T_TM5P5_NT35596 is not set
+# CONFIG_DRM_PANEL_BOE_BF060Y8M_AJ0 is not set
+# CONFIG_DRM_PANEL_BOE_HIMAX8279D is not set
+# CONFIG_DRM_PANEL_BOE_TV101WUM_NL6 is not set
+# CONFIG_DRM_PANEL_DSI_CM is not set
+# CONFIG_DRM_PANEL_LVDS is not set
+CONFIG_DRM_PANEL_SIMPLE=y
+# CONFIG_DRM_PANEL_EDP is not set
+# CONFIG_DRM_PANEL_EBBG_FT8719 is not set
+# CONFIG_DRM_PANEL_ELIDA_KD35T133 is not set
+# CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02 is not set
+# CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D is not set
+# CONFIG_DRM_PANEL_ILITEK_IL9322 is not set
+# CONFIG_DRM_PANEL_ILITEK_ILI9341 is not set
+# CONFIG_DRM_PANEL_ILITEK_ILI9806E is not set
+# CONFIG_DRM_PANEL_ILITEK_ILI9881C is not set
+# CONFIG_DRM_PANEL_INNOLUX_EJ030NA is not set
+# CONFIG_DRM_PANEL_INNOLUX_P079ZCA is not set
+# CONFIG_DRM_PANEL_JDI_LT070ME05000 is not set
+# CONFIG_DRM_PANEL_JDI_R63452 is not set
+# CONFIG_DRM_PANEL_KHADAS_TS050 is not set
+# CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04 is not set
+# CONFIG_DRM_PANEL_LEADTEK_LTK050H3146W is not set
+# CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_LD9040 is not set
+# CONFIG_DRM_PANEL_LG_LB035Q02 is not set
+# CONFIG_DRM_PANEL_LG_LG4573 is not set
+# CONFIG_DRM_PANEL_NEC_NL8048HL11 is not set
+# CONFIG_DRM_PANEL_NEWVISION_NV3052C is not set
+# CONFIG_DRM_PANEL_NOVATEK_NT35510 is not set
+# CONFIG_DRM_PANEL_NOVATEK_NT35560 is not set
+# CONFIG_DRM_PANEL_NOVATEK_NT35950 is not set
+# CONFIG_DRM_PANEL_NOVATEK_NT36672A is not set
+# CONFIG_DRM_PANEL_NOVATEK_NT39016 is not set
+# CONFIG_DRM_PANEL_MANTIX_MLAF057WE51 is not set
+# CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO is not set
+# CONFIG_DRM_PANEL_ORISETECH_OTM8009A is not set
+# CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS is not set
+# CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00 is not set
+# CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN is not set
+# CONFIG_DRM_PANEL_RAYDIUM_RM67191 is not set
+# CONFIG_DRM_PANEL_RAYDIUM_RM68200 is not set
+# CONFIG_DRM_PANEL_RONBO_RB070D30 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_ATNA33XC20 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_DB7430 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_S6D16D0 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_S6D27A1 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_S6E63M0 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0 is not set
+# CONFIG_DRM_PANEL_SAMSUNG_SOFEF00 is not set
+# CONFIG_DRM_PANEL_SEIKO_43WVF1G is not set
+# CONFIG_DRM_PANEL_SHARP_LQ101R1SX01 is not set
+# CONFIG_DRM_PANEL_SHARP_LS037V7DW01 is not set
+# CONFIG_DRM_PANEL_SHARP_LS043T1LE01 is not set
+# CONFIG_DRM_PANEL_SHARP_LS060T1SX01 is not set
+# CONFIG_DRM_PANEL_SITRONIX_ST7701 is not set
+# CONFIG_DRM_PANEL_SITRONIX_ST7703 is not set
+# CONFIG_DRM_PANEL_SITRONIX_ST7789V is not set
+# CONFIG_DRM_PANEL_SONY_ACX565AKM is not set
+# CONFIG_DRM_PANEL_SONY_TULIP_TRULY_NT35521 is not set
+# CONFIG_DRM_PANEL_TDO_TL070WSH30 is not set
+# CONFIG_DRM_PANEL_TPO_Y17P is not set
+# CONFIG_DRM_PANEL_TPO_TD028TTEC1 is not set
+# CONFIG_DRM_PANEL_TPO_TD043MTEA1 is not set
+# CONFIG_DRM_PANEL_TPO_TPG110 is not set
+# CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA is not set
+# CONFIG_DRM_PANEL_VISIONOX_RM69299 is not set
+# CONFIG_DRM_PANEL_WAVESHARE_TOUCHSCREEN is not set
+# CONFIG_DRM_PANEL_WIDECHIPS_WS2401 is not set
+# CONFIG_DRM_PANEL_XINPENG_XPP055C272 is not set
+# end of Display Panels
+
+CONFIG_DRM_BRIDGE=y
+CONFIG_DRM_PANEL_BRIDGE=y
+
+#
+# Display Interface Bridges
+#
+# CONFIG_DRM_CDNS_DSI is not set
+# CONFIG_DRM_CHIPONE_ICN6211 is not set
+# CONFIG_DRM_CHRONTEL_CH7033 is not set
+# CONFIG_DRM_DISPLAY_CONNECTOR is not set
+# CONFIG_DRM_ITE_IT6505 is not set
+# CONFIG_DRM_LONTIUM_LT8912B is not set
+# CONFIG_DRM_LONTIUM_LT9211 is not set
+# CONFIG_DRM_LONTIUM_LT9611 is not set
+# CONFIG_DRM_LONTIUM_LT9611UXC is not set
+# CONFIG_DRM_ITE_IT66121 is not set
+# CONFIG_DRM_LVDS_CODEC is not set
+# CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW is not set
+# CONFIG_DRM_NWL_MIPI_DSI is not set
+# CONFIG_DRM_NXP_PTN3460 is not set
+# CONFIG_DRM_PARADE_PS8622 is not set
+# CONFIG_DRM_PARADE_PS8640 is not set
+# CONFIG_DRM_SIL_SII8620 is not set
+# CONFIG_DRM_SII902X is not set
+# CONFIG_DRM_SII9234 is not set
+CONFIG_DRM_SIMPLE_BRIDGE=y
+# CONFIG_DRM_THINE_THC63LVD1024 is not set
+CONFIG_DRM_TOSHIBA_TC358762=y
+# CONFIG_DRM_TOSHIBA_TC358764 is not set
+# CONFIG_DRM_TOSHIBA_TC358767 is not set
+# CONFIG_DRM_TOSHIBA_TC358768 is not set
+# CONFIG_DRM_TOSHIBA_TC358775 is not set
+# CONFIG_DRM_TI_DLPC3433 is not set
+# CONFIG_DRM_TI_TFP410 is not set
+# CONFIG_DRM_TI_SN65DSI83 is not set
+# CONFIG_DRM_TI_SN65DSI86 is not set
+# CONFIG_DRM_TI_TPD12S015 is not set
+# CONFIG_DRM_ANALOGIX_ANX6345 is not set
+# CONFIG_DRM_ANALOGIX_ANX78XX is not set
+# CONFIG_DRM_ANALOGIX_ANX7625 is not set
+# CONFIG_DRM_I2C_ADV7511 is not set
+# CONFIG_DRM_CDNS_MHDP8546 is not set
+# end of Display Interface Bridges
+
+CONFIG_DRM_V3D=y
+CONFIG_DRM_VC4=y
+CONFIG_DRM_VC4_HDMI_CEC=y
+CONFIG_DRM_RP1_DSI=y
+CONFIG_DRM_RP1_DPI=y
+CONFIG_DRM_RP1_VEC=y
+# CONFIG_DRM_ETNAVIV is not set
+# CONFIG_DRM_HISI_HIBMC is not set
+# CONFIG_DRM_HISI_KIRIN is not set
+# CONFIG_DRM_LOGICVC is not set
+# CONFIG_DRM_ARCPGU is not set
+# CONFIG_DRM_BOCHS is not set
+# CONFIG_DRM_CIRRUS_QEMU is not set
+# CONFIG_DRM_GM12U320 is not set
+# CONFIG_DRM_PANEL_MIPI_DBI is not set
+# CONFIG_DRM_SIMPLEDRM is not set
+# CONFIG_TINYDRM_HX8357D is not set
+# CONFIG_TINYDRM_ILI9163 is not set
+# CONFIG_TINYDRM_ILI9225 is not set
+# CONFIG_TINYDRM_ILI9341 is not set
+# CONFIG_TINYDRM_ILI9486 is not set
+# CONFIG_TINYDRM_MI0283QT is not set
+# CONFIG_TINYDRM_REPAPER is not set
+# CONFIG_TINYDRM_ST7586 is not set
+# CONFIG_TINYDRM_ST7735R is not set
+# CONFIG_DRM_PL111 is not set
+# CONFIG_DRM_LIMA is not set
+# CONFIG_DRM_PANFROST is not set
+# CONFIG_DRM_TIDSS is not set
+# CONFIG_DRM_GUD is not set
+# CONFIG_DRM_SSD130X is not set
+# CONFIG_DRM_LEGACY is not set
+CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y
+CONFIG_DRM_NOMODESET=y
+
+#
+# Frame buffer Devices
+#
+CONFIG_FB_CMDLINE=y
+CONFIG_FB_NOTIFY=y
+CONFIG_FB=y
+# CONFIG_FIRMWARE_EDID is not set
+CONFIG_FB_CFB_FILLRECT=y
+CONFIG_FB_CFB_COPYAREA=y
+CONFIG_FB_CFB_IMAGEBLIT=y
+CONFIG_FB_SYS_FILLRECT=y
+CONFIG_FB_SYS_COPYAREA=y
+CONFIG_FB_SYS_IMAGEBLIT=y
+# CONFIG_FB_FOREIGN_ENDIAN is not set
+CONFIG_FB_SYS_FOPS=y
+CONFIG_FB_DEFERRED_IO=y
+# CONFIG_FB_MODE_HELPERS is not set
+# CONFIG_FB_TILEBLITTING is not set
+
+#
+# Frame buffer hardware drivers
+#
+# CONFIG_FB_BCM2708 is not set
+# CONFIG_FB_CIRRUS is not set
+# CONFIG_FB_PM2 is not set
+# CONFIG_FB_ARMCLCD is not set
+# CONFIG_FB_CYBER2000 is not set
+# CONFIG_FB_ASILIANT is not set
+# CONFIG_FB_IMSTT is not set
+# CONFIG_FB_EFI is not set
+# CONFIG_FB_OPENCORES is not set
+# CONFIG_FB_S1D13XXX is not set
+# CONFIG_FB_NVIDIA is not set
+# CONFIG_FB_RIVA is not set
+# CONFIG_FB_I740 is not set
+# CONFIG_FB_MATROX is not set
+# CONFIG_FB_RADEON is not set
+# CONFIG_FB_ATY128 is not set
+# CONFIG_FB_ATY is not set
+# CONFIG_FB_S3 is not set
+# CONFIG_FB_SAVAGE is not set
+# CONFIG_FB_SIS is not set
+# CONFIG_FB_NEOMAGIC is not set
+# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set
+# CONFIG_FB_VOODOO1 is not set
+# CONFIG_FB_VT8623 is not set
+# CONFIG_FB_TRIDENT is not set
+# CONFIG_FB_ARK is not set
+# CONFIG_FB_PM3 is not set
+# CONFIG_FB_CARMINE is not set
+# CONFIG_FB_SMSCUFX is not set
+# CONFIG_FB_UDL is not set
+# CONFIG_FB_IBM_GXT4500 is not set
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FB_METRONOME is not set
+# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_SIMPLE is not set
+# CONFIG_FB_SSD1307 is not set
+# CONFIG_FB_SM712 is not set
+# CONFIG_FB_RPISENSE is not set
+# end of Frame buffer Devices
+
+#
+# Backlight & LCD device support
+#
+CONFIG_LCD_CLASS_DEVICE=m
+# CONFIG_LCD_L4F00242T03 is not set
+# CONFIG_LCD_LMS283GF05 is not set
+# CONFIG_LCD_LTV350QV is not set
+# CONFIG_LCD_ILI922X is not set
+# CONFIG_LCD_ILI9320 is not set
+# CONFIG_LCD_TDO24M is not set
+# CONFIG_LCD_VGG2432A4 is not set
+# CONFIG_LCD_PLATFORM is not set
+# CONFIG_LCD_AMS369FG06 is not set
+# CONFIG_LCD_LMS501KF03 is not set
+# CONFIG_LCD_HX8357 is not set
+# CONFIG_LCD_OTM3225A is not set
+CONFIG_BACKLIGHT_CLASS_DEVICE=y
+# CONFIG_BACKLIGHT_KTD253 is not set
+# CONFIG_BACKLIGHT_PWM is not set
+CONFIG_BACKLIGHT_RPI=y
+# CONFIG_BACKLIGHT_QCOM_WLED is not set
+# CONFIG_BACKLIGHT_ADP8860 is not set
+# CONFIG_BACKLIGHT_ADP8870 is not set
+# CONFIG_BACKLIGHT_LM3630A is not set
+# CONFIG_BACKLIGHT_LM3639 is not set
+# CONFIG_BACKLIGHT_LP855X is not set
+CONFIG_BACKLIGHT_GPIO=y
+# CONFIG_BACKLIGHT_LV5207LP is not set
+# CONFIG_BACKLIGHT_BD6107 is not set
+# CONFIG_BACKLIGHT_ARCXCNN is not set
+# CONFIG_BACKLIGHT_LED is not set
+# end of Backlight & LCD device support
+
+CONFIG_VIDEOMODE_HELPERS=y
+CONFIG_HDMI=y
+
+#
+# Console display driver support
+#
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_DUMMY_CONSOLE_COLUMNS=80
+CONFIG_DUMMY_CONSOLE_ROWS=25
+CONFIG_FRAMEBUFFER_CONSOLE=y
+# CONFIG_FRAMEBUFFER_CONSOLE_LEGACY_ACCELERATION is not set
+CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y
+# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set
+# CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER is not set
+# end of Console display driver support
+
+# CONFIG_LOGO is not set
+# end of Graphics support
+
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_SND_TIMER=y
+CONFIG_SND_PCM=y
+CONFIG_SND_PCM_ELD=y
+CONFIG_SND_PCM_IEC958=y
+CONFIG_SND_DMAENGINE_PCM=y
+CONFIG_SND_HWDEP=m
+CONFIG_SND_RAWMIDI=m
+CONFIG_SND_COMPRESS_OFFLOAD=y
+CONFIG_SND_JACK=y
+CONFIG_SND_JACK_INPUT_DEV=y
+# CONFIG_SND_OSSEMUL is not set
+CONFIG_SND_PCM_TIMER=y
+CONFIG_SND_HRTIMER=m
+CONFIG_SND_DYNAMIC_MINORS=y
+CONFIG_SND_MAX_CARDS=32
+# CONFIG_SND_SUPPORT_OLD_API is not set
+CONFIG_SND_PROC_FS=y
+CONFIG_SND_VERBOSE_PROCFS=y
+# CONFIG_SND_VERBOSE_PRINTK is not set
+CONFIG_SND_CTL_FAST_LOOKUP=y
+# CONFIG_SND_DEBUG is not set
+# CONFIG_SND_CTL_INPUT_VALIDATION is not set
+CONFIG_SND_VMASTER=y
+# CONFIG_SND_SEQUENCER is not set
+# CONFIG_SND_DRIVERS is not set
+CONFIG_SND_PCI=y
+# CONFIG_SND_AD1889 is not set
+# CONFIG_SND_ALS300 is not set
+# CONFIG_SND_ALI5451 is not set
+# CONFIG_SND_ATIIXP is not set
+# CONFIG_SND_ATIIXP_MODEM is not set
+# CONFIG_SND_AU8810 is not set
+# CONFIG_SND_AU8820 is not set
+# CONFIG_SND_AU8830 is not set
+# CONFIG_SND_AW2 is not set
+# CONFIG_SND_AZT3328 is not set
+# CONFIG_SND_BT87X is not set
+# CONFIG_SND_CA0106 is not set
+# CONFIG_SND_CMIPCI is not set
+# CONFIG_SND_OXYGEN is not set
+# CONFIG_SND_CS4281 is not set
+# CONFIG_SND_CS46XX is not set
+# CONFIG_SND_CTXFI is not set
+# CONFIG_SND_DARLA20 is not set
+# CONFIG_SND_GINA20 is not set
+# CONFIG_SND_LAYLA20 is not set
+# CONFIG_SND_DARLA24 is not set
+# CONFIG_SND_GINA24 is not set
+# CONFIG_SND_LAYLA24 is not set
+# CONFIG_SND_MONA is not set
+# CONFIG_SND_MIA is not set
+# CONFIG_SND_ECHO3G is not set
+# CONFIG_SND_INDIGO is not set
+# CONFIG_SND_INDIGOIO is not set
+# CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
+# CONFIG_SND_EMU10K1 is not set
+# CONFIG_SND_EMU10K1X is not set
+# CONFIG_SND_ENS1370 is not set
+# CONFIG_SND_ENS1371 is not set
+# CONFIG_SND_ES1938 is not set
+# CONFIG_SND_ES1968 is not set
+# CONFIG_SND_FM801 is not set
+# CONFIG_SND_HDSP is not set
+# CONFIG_SND_HDSPM is not set
+# CONFIG_SND_ICE1712 is not set
+# CONFIG_SND_ICE1724 is not set
+# CONFIG_SND_INTEL8X0 is not set
+# CONFIG_SND_INTEL8X0M is not set
+# CONFIG_SND_KORG1212 is not set
+# CONFIG_SND_LOLA is not set
+# CONFIG_SND_LX6464ES is not set
+# CONFIG_SND_MAESTRO3 is not set
+# CONFIG_SND_MIXART is not set
+# CONFIG_SND_NM256 is not set
+# CONFIG_SND_PCXHR is not set
+# CONFIG_SND_RIPTIDE is not set
+# CONFIG_SND_RME32 is not set
+# CONFIG_SND_RME96 is not set
+# CONFIG_SND_RME9652 is not set
+# CONFIG_SND_SE6X is not set
+# CONFIG_SND_SONICVIBES is not set
+# CONFIG_SND_TRIDENT is not set
+# CONFIG_SND_VIA82XX is not set
+# CONFIG_SND_VIA82XX_MODEM is not set
+# CONFIG_SND_VIRTUOSO is not set
+# CONFIG_SND_VX222 is not set
+# CONFIG_SND_YMFPCI is not set
+
+#
+# HD-Audio
+#
+# CONFIG_SND_HDA_INTEL is not set
+# end of HD-Audio
+
+CONFIG_SND_HDA_PREALLOC_SIZE=2048
+CONFIG_SND_SPI=y
+CONFIG_SND_USB=y
+CONFIG_SND_USB_AUDIO=m
+CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y
+CONFIG_SND_USB_UA101=m
+CONFIG_SND_USB_CAIAQ=m
+CONFIG_SND_USB_CAIAQ_INPUT=y
+CONFIG_SND_USB_6FIRE=m
+CONFIG_SND_USB_HIFACE=m
+CONFIG_SND_BCD2000=m
+CONFIG_SND_USB_LINE6=m
+CONFIG_SND_USB_POD=m
+CONFIG_SND_USB_PODHD=m
+CONFIG_SND_USB_TONEPORT=m
+CONFIG_SND_USB_VARIAX=m
+CONFIG_SND_SOC=y
+CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y
+CONFIG_SND_SOC_COMPRESS=y
+# CONFIG_SND_SOC_ADI is not set
+# CONFIG_SND_SOC_AMD_ACP is not set
+# CONFIG_SND_AMD_ACP_CONFIG is not set
+# CONFIG_SND_ATMEL_SOC is not set
+CONFIG_SND_BCM2835_SOC_I2S=m
+# CONFIG_SND_BCM63XX_I2S_WHISTLER is not set
+CONFIG_SND_BCM2708_SOC_CHIPDIP_DAC=m
+CONFIG_SND_BCM2708_SOC_GOOGLEVOICEHAT_SOUNDCARD=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_DAC=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUS=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSHD=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSADC=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSADCPRO=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_DACPLUSDSP=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_DIGI=m
+CONFIG_SND_BCM2708_SOC_HIFIBERRY_AMP=m
+CONFIG_SND_BCM2708_SOC_PIFI_40=m
+CONFIG_SND_BCM2708_SOC_RPI_CIRRUS=m
+CONFIG_SND_BCM2708_SOC_RPI_DAC=m
+CONFIG_SND_BCM2708_SOC_RPI_PROTO=m
+CONFIG_SND_BCM2708_SOC_JUSTBOOM_BOTH=m
+CONFIG_SND_BCM2708_SOC_JUSTBOOM_DAC=m
+CONFIG_SND_BCM2708_SOC_JUSTBOOM_DIGI=m
+CONFIG_SND_BCM2708_SOC_IQAUDIO_CODEC=m
+CONFIG_SND_BCM2708_SOC_IQAUDIO_DAC=m
+CONFIG_SND_BCM2708_SOC_IQAUDIO_DIGI=m
+CONFIG_SND_BCM2708_SOC_I_SABRE_Q2M=m
+CONFIG_SND_BCM2708_SOC_ADAU1977_ADC=m
+CONFIG_SND_AUDIOINJECTOR_PI_SOUNDCARD=m
+CONFIG_SND_AUDIOINJECTOR_OCTO_SOUNDCARD=m
+CONFIG_SND_AUDIOINJECTOR_ISOLATED_SOUNDCARD=m
+CONFIG_SND_AUDIOSENSE_PI=m
+CONFIG_SND_DIGIDAC1_SOUNDCARD=m
+CONFIG_SND_BCM2708_SOC_DIONAUDIO_LOCO=m
+CONFIG_SND_BCM2708_SOC_DIONAUDIO_LOCO_V2=m
+CONFIG_SND_BCM2708_SOC_ALLO_PIANO_DAC=m
+CONFIG_SND_BCM2708_SOC_ALLO_PIANO_DAC_PLUS=m
+CONFIG_SND_BCM2708_SOC_ALLO_BOSS_DAC=m
+CONFIG_SND_BCM2708_SOC_ALLO_BOSS2_DAC=m
+CONFIG_SND_BCM2708_SOC_ALLO_DIGIONE=m
+CONFIG_SND_BCM2708_SOC_ALLO_KATANA_DAC=m
+CONFIG_SND_BCM2708_SOC_FE_PI_AUDIO=m
+CONFIG_SND_PISOUND=m
+CONFIG_SND_RPI_SIMPLE_SOUNDCARD=m
+CONFIG_SND_RPI_WM8804_SOUNDCARD=m
+CONFIG_SND_DACBERRY400=m
+CONFIG_SND_DESIGNWARE_I2S=m
+CONFIG_SND_DESIGNWARE_PCM=y
+
+#
+# SoC Audio for Freescale CPUs
+#
+
+#
+# Common SoC Audio options for Freescale CPUs:
+#
+# CONFIG_SND_SOC_FSL_ASRC is not set
+# CONFIG_SND_SOC_FSL_SAI is not set
+# CONFIG_SND_SOC_FSL_AUDMIX is not set
+# CONFIG_SND_SOC_FSL_SSI is not set
+# CONFIG_SND_SOC_FSL_SPDIF is not set
+# CONFIG_SND_SOC_FSL_ESAI is not set
+# CONFIG_SND_SOC_FSL_MICFIL is not set
+# CONFIG_SND_SOC_FSL_XCVR is not set
+# CONFIG_SND_SOC_IMX_AUDMUX is not set
+# end of SoC Audio for Freescale CPUs
+
+# CONFIG_SND_I2S_HI6210_I2S is not set
+# CONFIG_SND_SOC_IMG is not set
+# CONFIG_SND_SOC_MTK_BTCVSD is not set
+# CONFIG_SND_SOC_SOF_TOPLEVEL is not set
+
+#
+# STMicroelectronics STM32 SOC audio support
+#
+# end of STMicroelectronics STM32 SOC audio support
+
+# CONFIG_SND_SOC_XILINX_I2S is not set
+# CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER is not set
+# CONFIG_SND_SOC_XILINX_SPDIF is not set
+# CONFIG_SND_SOC_XTFPGA_I2S is not set
+CONFIG_SND_SOC_I2C_AND_SPI=y
+
+#
+# CODEC drivers
+#
+CONFIG_SND_SOC_ARIZONA=m
+CONFIG_SND_SOC_WM_ADSP=m
+# CONFIG_SND_SOC_AC97_CODEC is not set
+# CONFIG_SND_SOC_AD193X_SPI is not set
+# CONFIG_SND_SOC_AD193X_I2C is not set
+# CONFIG_SND_SOC_ADAU1372_I2C is not set
+# CONFIG_SND_SOC_ADAU1372_SPI is not set
+CONFIG_SND_SOC_ADAU1701=m
+# CONFIG_SND_SOC_ADAU1761_I2C is not set
+# CONFIG_SND_SOC_ADAU1761_SPI is not set
+CONFIG_SND_SOC_ADAU1977=m
+CONFIG_SND_SOC_ADAU1977_I2C=m
+CONFIG_SND_SOC_ADAU7002=m
+# CONFIG_SND_SOC_ADAU7118_HW is not set
+# CONFIG_SND_SOC_ADAU7118_I2C is not set
+# CONFIG_SND_SOC_AK4104 is not set
+# CONFIG_SND_SOC_AK4118 is not set
+# CONFIG_SND_SOC_AK4375 is not set
+# CONFIG_SND_SOC_AK4458 is not set
+CONFIG_SND_SOC_AK4554=m
+# CONFIG_SND_SOC_AK4613 is not set
+# CONFIG_SND_SOC_AK4642 is not set
+# CONFIG_SND_SOC_AK5386 is not set
+# CONFIG_SND_SOC_AK5558 is not set
+# CONFIG_SND_SOC_ALC5623 is not set
+# CONFIG_SND_SOC_AW8738 is not set
+# CONFIG_SND_SOC_BD28623 is not set
+# CONFIG_SND_SOC_BT_SCO is not set
+# CONFIG_SND_SOC_CS35L32 is not set
+# CONFIG_SND_SOC_CS35L33 is not set
+# CONFIG_SND_SOC_CS35L34 is not set
+# CONFIG_SND_SOC_CS35L35 is not set
+# CONFIG_SND_SOC_CS35L36 is not set
+# CONFIG_SND_SOC_CS35L41_SPI is not set
+# CONFIG_SND_SOC_CS35L41_I2C is not set
+# CONFIG_SND_SOC_CS35L45_SPI is not set
+# CONFIG_SND_SOC_CS35L45_I2C is not set
+# CONFIG_SND_SOC_CS42L42 is not set
+# CONFIG_SND_SOC_CS42L51_I2C is not set
+# CONFIG_SND_SOC_CS42L52 is not set
+# CONFIG_SND_SOC_CS42L56 is not set
+# CONFIG_SND_SOC_CS42L73 is not set
+# CONFIG_SND_SOC_CS42L83 is not set
+# CONFIG_SND_SOC_CS4234 is not set
+CONFIG_SND_SOC_CS4265=m
+# CONFIG_SND_SOC_CS4270 is not set
+CONFIG_SND_SOC_CS4271=m
+CONFIG_SND_SOC_CS4271_I2C=m
+# CONFIG_SND_SOC_CS4271_SPI is not set
+CONFIG_SND_SOC_CS42XX8=m
+CONFIG_SND_SOC_CS42XX8_I2C=m
+# CONFIG_SND_SOC_CS43130 is not set
+# CONFIG_SND_SOC_CS4341 is not set
+# CONFIG_SND_SOC_CS4349 is not set
+# CONFIG_SND_SOC_CS53L30 is not set
+# CONFIG_SND_SOC_CX2072X is not set
+CONFIG_SND_SOC_DA7213=m
+CONFIG_SND_SOC_DMIC=m
+CONFIG_SND_SOC_HDMI_CODEC=y
+# CONFIG_SND_SOC_ES7134 is not set
+# CONFIG_SND_SOC_ES7241 is not set
+# CONFIG_SND_SOC_ES8316 is not set
+# CONFIG_SND_SOC_ES8326 is not set
+# CONFIG_SND_SOC_ES8328_I2C is not set
+# CONFIG_SND_SOC_ES8328_SPI is not set
+# CONFIG_SND_SOC_GTM601 is not set
+# CONFIG_SND_SOC_HDA is not set
+# CONFIG_SND_SOC_ICS43432 is not set
+# CONFIG_SND_SOC_INNO_RK3036 is not set
+CONFIG_SND_SOC_MA120X0P=m
+# CONFIG_SND_SOC_MAX98088 is not set
+# CONFIG_SND_SOC_MAX98357A is not set
+# CONFIG_SND_SOC_MAX98504 is not set
+# CONFIG_SND_SOC_MAX9867 is not set
+# CONFIG_SND_SOC_MAX98927 is not set
+# CONFIG_SND_SOC_MAX98520 is not set
+# CONFIG_SND_SOC_MAX98373_I2C is not set
+# CONFIG_SND_SOC_MAX98390 is not set
+# CONFIG_SND_SOC_MAX98396 is not set
+# CONFIG_SND_SOC_MAX9860 is not set
+# CONFIG_SND_SOC_MSM8916_WCD_DIGITAL is not set
+# CONFIG_SND_SOC_PCM1681 is not set
+# CONFIG_SND_SOC_PCM1789_I2C is not set
+CONFIG_SND_SOC_PCM179X=m
+CONFIG_SND_SOC_PCM179X_I2C=m
+# CONFIG_SND_SOC_PCM179X_SPI is not set
+CONFIG_SND_SOC_PCM186X=m
+CONFIG_SND_SOC_PCM186X_I2C=m
+# CONFIG_SND_SOC_PCM186X_SPI is not set
+# CONFIG_SND_SOC_PCM3060_I2C is not set
+# CONFIG_SND_SOC_PCM3060_SPI is not set
+# CONFIG_SND_SOC_PCM3168A_I2C is not set
+# CONFIG_SND_SOC_PCM3168A_SPI is not set
+CONFIG_SND_SOC_PCM5102A=m
+CONFIG_SND_SOC_PCM512x=m
+CONFIG_SND_SOC_PCM512x_I2C=m
+# CONFIG_SND_SOC_PCM512x_SPI is not set
+# CONFIG_SND_SOC_RK3328 is not set
+# CONFIG_SND_SOC_RT5616 is not set
+CONFIG_SND_SOC_PCM1794A=m
+# CONFIG_SND_SOC_RT5631 is not set
+# CONFIG_SND_SOC_RT5640 is not set
+# CONFIG_SND_SOC_RT5659 is not set
+# CONFIG_SND_SOC_RT9120 is not set
+CONFIG_SND_SOC_SGTL5000=m
+CONFIG_SND_SOC_SIGMADSP=m
+CONFIG_SND_SOC_SIGMADSP_I2C=m
+CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m
+# CONFIG_SND_SOC_SIMPLE_MUX is not set
+CONFIG_SND_SOC_SPDIF=m
+# CONFIG_SND_SOC_SRC4XXX_I2C is not set
+# CONFIG_SND_SOC_SSM2305 is not set
+# CONFIG_SND_SOC_SSM2518 is not set
+# CONFIG_SND_SOC_SSM2602_SPI is not set
+# CONFIG_SND_SOC_SSM2602_I2C is not set
+# CONFIG_SND_SOC_SSM4567 is not set
+CONFIG_SND_SOC_STA32X=m
+# CONFIG_SND_SOC_STA350 is not set
+# CONFIG_SND_SOC_STI_SAS is not set
+# CONFIG_SND_SOC_TAS2552 is not set
+# CONFIG_SND_SOC_TAS2562 is not set
+# CONFIG_SND_SOC_TAS2764 is not set
+# CONFIG_SND_SOC_TAS2770 is not set
+# CONFIG_SND_SOC_TAS2780 is not set
+# CONFIG_SND_SOC_TAS5086 is not set
+CONFIG_SND_SOC_TAS571X=m
+# CONFIG_SND_SOC_TAS5720 is not set
+# CONFIG_SND_SOC_TAS5805M is not set
+# CONFIG_SND_SOC_TAS6424 is not set
+# CONFIG_SND_SOC_TDA7419 is not set
+# CONFIG_SND_SOC_TFA9879 is not set
+CONFIG_SND_SOC_TAS5713=m
+# CONFIG_SND_SOC_TFA989X is not set
+# CONFIG_SND_SOC_TLV320ADC3XXX is not set
+# CONFIG_SND_SOC_TLV320AIC23_I2C is not set
+# CONFIG_SND_SOC_TLV320AIC23_SPI is not set
+# CONFIG_SND_SOC_TLV320AIC31XX is not set
+CONFIG_SND_SOC_TLV320AIC32X4=m
+CONFIG_SND_SOC_TLV320AIC32X4_I2C=m
+# CONFIG_SND_SOC_TLV320AIC32X4_SPI is not set
+CONFIG_SND_SOC_TLV320AIC3X=m
+CONFIG_SND_SOC_TLV320AIC3X_I2C=m
+# CONFIG_SND_SOC_TLV320AIC3X_SPI is not set
+# CONFIG_SND_SOC_TLV320ADCX140 is not set
+# CONFIG_SND_SOC_TS3A227E is not set
+# CONFIG_SND_SOC_TSCS42XX is not set
+# CONFIG_SND_SOC_TSCS454 is not set
+# CONFIG_SND_SOC_UDA1334 is not set
+CONFIG_SND_SOC_WM5102=m
+# CONFIG_SND_SOC_WM8510 is not set
+# CONFIG_SND_SOC_WM8523 is not set
+# CONFIG_SND_SOC_WM8524 is not set
+# CONFIG_SND_SOC_WM8580 is not set
+# CONFIG_SND_SOC_WM8711 is not set
+# CONFIG_SND_SOC_WM8728 is not set
+CONFIG_SND_SOC_WM8731=m
+CONFIG_SND_SOC_WM8731_I2C=m
+# CONFIG_SND_SOC_WM8731_SPI is not set
+# CONFIG_SND_SOC_WM8737 is not set
+CONFIG_SND_SOC_WM8741=m
+# CONFIG_SND_SOC_WM8750 is not set
+# CONFIG_SND_SOC_WM8753 is not set
+# CONFIG_SND_SOC_WM8770 is not set
+# CONFIG_SND_SOC_WM8776 is not set
+# CONFIG_SND_SOC_WM8782 is not set
+CONFIG_SND_SOC_WM8804=m
+CONFIG_SND_SOC_WM8804_I2C=m
+# CONFIG_SND_SOC_WM8804_SPI is not set
+# CONFIG_SND_SOC_WM8903 is not set
+# CONFIG_SND_SOC_WM8904 is not set
+# CONFIG_SND_SOC_WM8940 is not set
+CONFIG_SND_SOC_WM8960=m
+# CONFIG_SND_SOC_WM8962 is not set
+# CONFIG_SND_SOC_WM8974 is not set
+# CONFIG_SND_SOC_WM8978 is not set
+# CONFIG_SND_SOC_WM8985 is not set
+# CONFIG_SND_SOC_ZL38060 is not set
+# CONFIG_SND_SOC_MAX9759 is not set
+# CONFIG_SND_SOC_MT6351 is not set
+# CONFIG_SND_SOC_MT6358 is not set
+# CONFIG_SND_SOC_MT6660 is not set
+# CONFIG_SND_SOC_NAU8315 is not set
+# CONFIG_SND_SOC_NAU8540 is not set
+# CONFIG_SND_SOC_NAU8810 is not set
+# CONFIG_SND_SOC_NAU8821 is not set
+# CONFIG_SND_SOC_NAU8822 is not set
+# CONFIG_SND_SOC_NAU8824 is not set
+CONFIG_SND_SOC_TPA6130A2=m
+# CONFIG_SND_SOC_LPASS_WSA_MACRO is not set
+# CONFIG_SND_SOC_LPASS_VA_MACRO is not set
+# CONFIG_SND_SOC_LPASS_RX_MACRO is not set
+# CONFIG_SND_SOC_LPASS_TX_MACRO is not set
+CONFIG_SND_SOC_I_SABRE_CODEC=m
+# end of CODEC drivers
+
+CONFIG_SND_SIMPLE_CARD_UTILS=m
+CONFIG_SND_SIMPLE_CARD=m
+CONFIG_SND_AUDIO_GRAPH_CARD=m
+# CONFIG_SND_AUDIO_GRAPH_CARD2 is not set
+# CONFIG_SND_TEST_COMPONENT is not set
+
+#
+# HID support
+#
+CONFIG_HID=y
+# CONFIG_HID_BATTERY_STRENGTH is not set
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_GENERIC=y
+
+#
+# Special HID drivers
+#
+CONFIG_HID_A4TECH=y
+# CONFIG_HID_ACCUTOUCH is not set
+# CONFIG_HID_ACRUX is not set
+CONFIG_HID_APPLE=y
+# CONFIG_HID_APPLEIR is not set
+CONFIG_HID_ASUS=y
+CONFIG_HID_AUREAL=y
+CONFIG_HID_BELKIN=y
+# CONFIG_HID_BETOP_FF is not set
+CONFIG_HID_BIGBEN_FF=m
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+# CONFIG_HID_CORSAIR is not set
+# CONFIG_HID_COUGAR is not set
+# CONFIG_HID_MACALLY is not set
+# CONFIG_HID_PRODIKEYS is not set
+# CONFIG_HID_CMEDIA is not set
+# CONFIG_HID_CP2112 is not set
+# CONFIG_HID_CREATIVE_SB0540 is not set
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=m
+CONFIG_DRAGONRISE_FF=y
+# CONFIG_HID_EMS_FF is not set
+# CONFIG_HID_ELAN is not set
+# CONFIG_HID_ELECOM is not set
+# CONFIG_HID_ELO is not set
+CONFIG_HID_EZKEY=y
+# CONFIG_HID_FT260 is not set
+# CONFIG_HID_GEMBIRD is not set
+# CONFIG_HID_GFRM is not set
+# CONFIG_HID_GLORIOUS is not set
+# CONFIG_HID_HOLTEK is not set
+# CONFIG_HID_VIVALDI is not set
+# CONFIG_HID_GT683R is not set
+# CONFIG_HID_KEYTOUCH is not set
+CONFIG_HID_KYE=y
+# CONFIG_HID_UCLOGIC is not set
+# CONFIG_HID_WALTOP is not set
+# CONFIG_HID_VIEWSONIC is not set
+# CONFIG_HID_VRC2 is not set
+# CONFIG_HID_XIAOMI is not set
+CONFIG_HID_GYRATION=y
+# CONFIG_HID_ICADE is not set
+# CONFIG_HID_ITE is not set
+# CONFIG_HID_JABRA is not set
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+# CONFIG_HID_LED is not set
+CONFIG_HID_LENOVO=y
+# CONFIG_HID_LETSKETCH is not set
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_LOGITECH_HIDPP=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIWHEELS_FF=y
+# CONFIG_HID_MAGICMOUSE is not set
+# CONFIG_HID_MALTRON is not set
+# CONFIG_HID_MAYFLASH is not set
+# CONFIG_HID_MEGAWORLD_FF is not set
+# CONFIG_HID_REDRAGON is not set
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=m
+CONFIG_HID_NINTENDO=m
+CONFIG_NINTENDO_FF=y
+# CONFIG_HID_NTI is not set
+# CONFIG_HID_NTRIG is not set
+CONFIG_HID_ORTEK=y
+CONFIG_HID_OUYA=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PENMOUNT=y
+CONFIG_HID_PETALYNX=y
+# CONFIG_HID_PICOLCD is not set
+# CONFIG_HID_PLANTRONICS is not set
+# CONFIG_HID_PXRC is not set
+# CONFIG_HID_RAZER is not set
+# CONFIG_HID_PRIMAX is not set
+# CONFIG_HID_RETRODE is not set
+# CONFIG_HID_ROCCAT is not set
+# CONFIG_HID_SAITEK is not set
+CONFIG_HID_SAMSUNG=y
+# CONFIG_HID_SEMITEK is not set
+# CONFIG_HID_SIGMAMICRO is not set
+CONFIG_HID_SONY=y
+CONFIG_SONY_FF=y
+# CONFIG_HID_SPEEDLINK is not set
+CONFIG_HID_STEAM=m
+# CONFIG_HID_STEELSERIES is not set
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_RMI=y
+# CONFIG_HID_GREENASIA is not set
+CONFIG_HID_SMARTJOYPLUS=m
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+# CONFIG_HID_TOPRE is not set
+# CONFIG_HID_THINGM is not set
+# CONFIG_HID_THRUSTMASTER is not set
+# CONFIG_HID_UDRAW_PS3 is not set
+# CONFIG_HID_U2FZERO is not set
+# CONFIG_HID_WACOM is not set
+CONFIG_HID_WIIMOTE=m
+CONFIG_HID_XINMO=y
+# CONFIG_HID_ZEROPLUS is not set
+CONFIG_HID_ZYDACRON=y
+# CONFIG_HID_SENSOR_HUB is not set
+# CONFIG_HID_ALPS is not set
+# CONFIG_HID_MCP2221 is not set
+# end of Special HID drivers
+
+#
+# USB HID support
+#
+CONFIG_USB_HID=y
+# CONFIG_HID_PID is not set
+CONFIG_USB_HIDDEV=y
+# end of USB HID support
+
+#
+# I2C HID support
+#
+# CONFIG_I2C_HID_OF is not set
+# CONFIG_I2C_HID_OF_ELAN is not set
+# CONFIG_I2C_HID_OF_GOODIX is not set
+# end of I2C HID support
+# end of HID support
+
+CONFIG_USB_OHCI_LITTLE_ENDIAN=y
+CONFIG_USB_SUPPORT=y
+CONFIG_USB_COMMON=y
+# CONFIG_USB_LED_TRIG is not set
+# CONFIG_USB_ULPI_BUS is not set
+# CONFIG_USB_CONN_GPIO is not set
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB=y
+CONFIG_USB_PCI=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+
+#
+# Miscellaneous USB options
+#
+CONFIG_USB_DEFAULT_PERSIST=y
+# CONFIG_USB_FEW_INIT_RETRIES is not set
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_OTG is not set
+# CONFIG_USB_OTG_PRODUCTLIST is not set
+# CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB is not set
+# CONFIG_USB_LEDS_TRIGGER_USBPORT is not set
+CONFIG_USB_AUTOSUSPEND_DELAY=2
+CONFIG_USB_MON=m
+
+#
+# USB Host Controller Drivers
+#
+# CONFIG_USB_C67X00_HCD is not set
+CONFIG_USB_XHCI_HCD=y
+# CONFIG_USB_XHCI_DBGCAP is not set
+CONFIG_USB_XHCI_PCI=y
+# CONFIG_USB_XHCI_PCI_RENESAS is not set
+CONFIG_USB_XHCI_PLATFORM=y
+# CONFIG_USB_BRCMSTB is not set
+# CONFIG_USB_EHCI_HCD is not set
+# CONFIG_USB_OXU210HP_HCD is not set
+# CONFIG_USB_ISP116X_HCD is not set
+# CONFIG_USB_FOTG210_HCD is not set
+# CONFIG_USB_MAX3421_HCD is not set
+# CONFIG_USB_OHCI_HCD is not set
+# CONFIG_USB_UHCI_HCD is not set
+# CONFIG_USB_SL811_HCD is not set
+# CONFIG_USB_R8A66597_HCD is not set
+CONFIG_USB_DWCOTG=y
+# CONFIG_USB_HCD_BCMA is not set
+# CONFIG_USB_HCD_SSB is not set
+# CONFIG_USB_HCD_TEST_MODE is not set
+
+#
+# USB Device Class drivers
+#
+CONFIG_USB_ACM=m
+# CONFIG_USB_PRINTER is not set
+# CONFIG_USB_WDM is not set
+# CONFIG_USB_TMC is not set
+
+#
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
+#
+
+#
+# also be needed; see USB_STORAGE Help for more info
+#
+CONFIG_USB_STORAGE=y
+# CONFIG_USB_STORAGE_DEBUG is not set
+# CONFIG_USB_STORAGE_REALTEK is not set
+# CONFIG_USB_STORAGE_DATAFAB is not set
+# CONFIG_USB_STORAGE_FREECOM is not set
+# CONFIG_USB_STORAGE_ISD200 is not set
+# CONFIG_USB_STORAGE_USBAT is not set
+# CONFIG_USB_STORAGE_SDDR09 is not set
+# CONFIG_USB_STORAGE_SDDR55 is not set
+# CONFIG_USB_STORAGE_JUMPSHOT is not set
+# CONFIG_USB_STORAGE_ALAUDA is not set
+# CONFIG_USB_STORAGE_ONETOUCH is not set
+# CONFIG_USB_STORAGE_KARMA is not set
+# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
+# CONFIG_USB_STORAGE_ENE_UB6250 is not set
+CONFIG_USB_UAS=y
+
+#
+# USB Imaging devices
+#
+# CONFIG_USB_MDC800 is not set
+# CONFIG_USB_MICROTEK is not set
+# CONFIG_USBIP_CORE is not set
+# CONFIG_USB_CDNS_SUPPORT is not set
+# CONFIG_USB_MUSB_HDRC is not set
+CONFIG_USB_DWC3=y
+CONFIG_USB_DWC3_HOST=y
+
+#
+# Platform Glue Driver Support
+#
+CONFIG_USB_DWC3_HAPS=y
+CONFIG_USB_DWC3_OF_SIMPLE=y
+CONFIG_USB_DWC2=y
+CONFIG_USB_DWC2_HOST=y
+
+#
+# Gadget/Dual-role mode requires USB Gadget support to be enabled
+#
+# CONFIG_USB_DWC2_PCI is not set
+# CONFIG_USB_DWC2_DEBUG is not set
+# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set
+# CONFIG_USB_ISP1760 is not set
+
+#
+# USB port drivers
+#
+CONFIG_USB_SERIAL=m
+CONFIG_USB_SERIAL_GENERIC=y
+# CONFIG_USB_SERIAL_SIMPLE is not set
+# CONFIG_USB_SERIAL_AIRCABLE is not set
+# CONFIG_USB_SERIAL_ARK3116 is not set
+# CONFIG_USB_SERIAL_BELKIN is not set
+CONFIG_USB_SERIAL_CH341=m
+# CONFIG_USB_SERIAL_WHITEHEAT is not set
+# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set
+CONFIG_USB_SERIAL_CP210X=m
+# CONFIG_USB_SERIAL_CYPRESS_M8 is not set
+# CONFIG_USB_SERIAL_EMPEG is not set
+CONFIG_USB_SERIAL_FTDI_SIO=m
+# CONFIG_USB_SERIAL_VISOR is not set
+# CONFIG_USB_SERIAL_IPAQ is not set
+# CONFIG_USB_SERIAL_IR is not set
+# CONFIG_USB_SERIAL_EDGEPORT is not set
+# CONFIG_USB_SERIAL_EDGEPORT_TI is not set
+# CONFIG_USB_SERIAL_F81232 is not set
+# CONFIG_USB_SERIAL_F8153X is not set
+# CONFIG_USB_SERIAL_GARMIN is not set
+# CONFIG_USB_SERIAL_IPW is not set
+CONFIG_USB_SERIAL_IUU=m
+# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set
+# CONFIG_USB_SERIAL_KEYSPAN is not set
+# CONFIG_USB_SERIAL_KLSI is not set
+# CONFIG_USB_SERIAL_KOBIL_SCT is not set
+# CONFIG_USB_SERIAL_MCT_U232 is not set
+# CONFIG_USB_SERIAL_METRO is not set
+# CONFIG_USB_SERIAL_MOS7720 is not set
+# CONFIG_USB_SERIAL_MOS7840 is not set
+# CONFIG_USB_SERIAL_MXUPORT is not set
+# CONFIG_USB_SERIAL_NAVMAN is not set
+CONFIG_USB_SERIAL_PL2303=m
+# CONFIG_USB_SERIAL_OTI6858 is not set
+# CONFIG_USB_SERIAL_QCAUX is not set
+# CONFIG_USB_SERIAL_QUALCOMM is not set
+# CONFIG_USB_SERIAL_SPCP8X5 is not set
+# CONFIG_USB_SERIAL_SAFE is not set
+# CONFIG_USB_SERIAL_SIERRAWIRELESS is not set
+# CONFIG_USB_SERIAL_SYMBOL is not set
+# CONFIG_USB_SERIAL_TI is not set
+# CONFIG_USB_SERIAL_CYBERJACK is not set
+# CONFIG_USB_SERIAL_OPTION is not set
+# CONFIG_USB_SERIAL_OMNINET is not set
+# CONFIG_USB_SERIAL_OPTICON is not set
+# CONFIG_USB_SERIAL_XSENS_MT is not set
+# CONFIG_USB_SERIAL_WISHBONE is not set
+# CONFIG_USB_SERIAL_SSU100 is not set
+# CONFIG_USB_SERIAL_QT2 is not set
+# CONFIG_USB_SERIAL_UPD78F0730 is not set
+# CONFIG_USB_SERIAL_XR is not set
+# CONFIG_USB_SERIAL_DEBUG is not set
+
+#
+# USB Miscellaneous drivers
+#
+# CONFIG_USB_EMI62 is not set
+# CONFIG_USB_EMI26 is not set
+# CONFIG_USB_ADUTUX is not set
+# CONFIG_USB_SEVSEG is not set
+# CONFIG_USB_LEGOTOWER is not set
+# CONFIG_USB_LCD is not set
+# CONFIG_USB_CYPRESS_CY7C63 is not set
+# CONFIG_USB_CYTHERM is not set
+# CONFIG_USB_IDMOUSE is not set
+# CONFIG_USB_FTDI_ELAN is not set
+# CONFIG_USB_APPLEDISPLAY is not set
+# CONFIG_APPLE_MFI_FASTCHARGE is not set
+# CONFIG_USB_LD is not set
+# CONFIG_USB_TRANCEVIBRATOR is not set
+# CONFIG_USB_IOWARRIOR is not set
+# CONFIG_USB_TEST is not set
+# CONFIG_USB_EHSET_TEST_FIXTURE is not set
+# CONFIG_USB_ISIGHTFW is not set
+# CONFIG_USB_YUREX is not set
+# CONFIG_USB_EZUSB_FX2 is not set
+# CONFIG_USB_HUB_USB251XB is not set
+# CONFIG_USB_HSIC_USB3503 is not set
+# CONFIG_USB_HSIC_USB4604 is not set
+# CONFIG_USB_LINK_LAYER_TEST is not set
+# CONFIG_USB_CHAOSKEY is not set
+CONFIG_BRCM_USB_PINMAP=y
+# CONFIG_USB_ONBOARD_HUB is not set
+
+#
+# USB Physical Layer drivers
+#
+CONFIG_USB_PHY=y
+CONFIG_NOP_USB_XCEIV=y
+# CONFIG_USB_GPIO_VBUS is not set
+# CONFIG_USB_ISP1301 is not set
+# CONFIG_USB_ULPI is not set
+# end of USB Physical Layer drivers
+
+# CONFIG_USB_GADGET is not set
+# CONFIG_TYPEC is not set
+CONFIG_USB_ROLE_SWITCH=y
+CONFIG_MMC=y
+CONFIG_PWRSEQ_EMMC=y
+CONFIG_PWRSEQ_SIMPLE=y
+CONFIG_MMC_BLOCK=y
+CONFIG_MMC_BLOCK_MINORS=32
+# CONFIG_SDIO_UART is not set
+# CONFIG_MMC_TEST is not set
+
+#
+# MMC/SD/SDIO Host Controller Drivers
+#
+# CONFIG_MMC_BCM2835_MMC is not set
+CONFIG_MMC_BCM2835_SDHOST=y
+# CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_ARMMMCI is not set
+CONFIG_MMC_SDHCI=y
+CONFIG_MMC_SDHCI_IO_ACCESSORS=y
+# CONFIG_MMC_SDHCI_PCI is not set
+CONFIG_MMC_SDHCI_PLTFM=y
+# CONFIG_MMC_SDHCI_OF_ARASAN is not set
+# CONFIG_MMC_SDHCI_OF_AT91 is not set
+CONFIG_MMC_SDHCI_OF_DWCMSHC=m
+# CONFIG_MMC_SDHCI_CADENCE is not set
+# CONFIG_MMC_SDHCI_F_SDH30 is not set
+# CONFIG_MMC_SDHCI_MILBEAUT is not set
+CONFIG_MMC_SDHCI_IPROC=y
+# CONFIG_MMC_TIFM_SD is not set
+# CONFIG_MMC_SPI is not set
+# CONFIG_MMC_CB710 is not set
+# CONFIG_MMC_VIA_SDMMC is not set
+# CONFIG_MMC_DW is not set
+# CONFIG_MMC_VUB300 is not set
+# CONFIG_MMC_USHC is not set
+# CONFIG_MMC_USDHI6ROL0 is not set
+CONFIG_MMC_REALTEK_USB=m
+CONFIG_MMC_CQHCI=y
+# CONFIG_MMC_HSQ is not set
+# CONFIG_MMC_TOSHIBA_PCI is not set
+# CONFIG_MMC_BCM2835 is not set
+# CONFIG_MMC_MTK is not set
+CONFIG_MMC_SDHCI_BRCMSTB=y
+# CONFIG_MMC_SDHCI_XENON is not set
+# CONFIG_MMC_SDHCI_OMAP is not set
+# CONFIG_MMC_SDHCI_AM654 is not set
+# CONFIG_SCSI_UFSHCD is not set
+# CONFIG_MEMSTICK is not set
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=y
+CONFIG_LEDS_CLASS_FLASH=y
+# CONFIG_LEDS_CLASS_MULTICOLOR is not set
+# CONFIG_LEDS_BRIGHTNESS_HW_CHANGED is not set
+
+#
+# LED drivers
+#
+# CONFIG_LEDS_AN30259A is not set
+# CONFIG_LEDS_AW2013 is not set
+# CONFIG_LEDS_BCM6328 is not set
+# CONFIG_LEDS_BCM6358 is not set
+# CONFIG_LEDS_CR0014114 is not set
+# CONFIG_LEDS_EL15203000 is not set
+# CONFIG_LEDS_LM3530 is not set
+# CONFIG_LEDS_LM3532 is not set
+# CONFIG_LEDS_LM3642 is not set
+# CONFIG_LEDS_LM3692X is not set
+# CONFIG_LEDS_PCA9532 is not set
+CONFIG_LEDS_GPIO=y
+# CONFIG_LEDS_LP3944 is not set
+# CONFIG_LEDS_LP3952 is not set
+# CONFIG_LEDS_LP50XX is not set
+# CONFIG_LEDS_LP55XX_COMMON is not set
+# CONFIG_LEDS_LP8860 is not set
+# CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_PCA963X is not set
+# CONFIG_LEDS_DAC124S085 is not set
+CONFIG_LEDS_PWM=y
+# CONFIG_LEDS_REGULATOR is not set
+# CONFIG_LEDS_BD2802 is not set
+# CONFIG_LEDS_LT3593 is not set
+# CONFIG_LEDS_TCA6507 is not set
+# CONFIG_LEDS_TLC591XX is not set
+# CONFIG_LEDS_LM355x is not set
+# CONFIG_LEDS_IS31FL319X is not set
+# CONFIG_LEDS_IS31FL32XX is not set
+
+#
+# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM)
+#
+# CONFIG_LEDS_BLINKM is not set
+# CONFIG_LEDS_SYSCON is not set
+# CONFIG_LEDS_MLXREG is not set
+# CONFIG_LEDS_USER is not set
+# CONFIG_LEDS_SPI_BYTE is not set
+# CONFIG_LEDS_TI_LMU_COMMON is not set
+
+#
+# Flash and Torch LED drivers
+#
+# CONFIG_LEDS_AAT1290 is not set
+# CONFIG_LEDS_AS3645A is not set
+# CONFIG_LEDS_KTD2692 is not set
+# CONFIG_LEDS_LM3601X is not set
+# CONFIG_LEDS_RT4505 is not set
+# CONFIG_LEDS_RT8515 is not set
+# CONFIG_LEDS_SGM3140 is not set
+
+#
+# RGB LED drivers
+#
+
+#
+# LED Triggers
+#
+CONFIG_LEDS_TRIGGERS=y
+CONFIG_LEDS_TRIGGER_TIMER=y
+CONFIG_LEDS_TRIGGER_ONESHOT=y
+CONFIG_LEDS_TRIGGER_HEARTBEAT=y
+CONFIG_LEDS_TRIGGER_BACKLIGHT=y
+CONFIG_LEDS_TRIGGER_CPU=y
+# CONFIG_LEDS_TRIGGER_ACTIVITY is not set
+CONFIG_LEDS_TRIGGER_GPIO=y
+CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
+CONFIG_LEDS_TRIGGER_TRANSIENT=y
+CONFIG_LEDS_TRIGGER_CAMERA=y
+CONFIG_LEDS_TRIGGER_INPUT=y
+# CONFIG_LEDS_TRIGGER_PANIC is not set
+# CONFIG_LEDS_TRIGGER_NETDEV is not set
+# CONFIG_LEDS_TRIGGER_PATTERN is not set
+# CONFIG_LEDS_TRIGGER_AUDIO is not set
+# CONFIG_LEDS_TRIGGER_TTY is not set
+CONFIG_LEDS_TRIGGER_ACTPWR=y
+
+#
+# Simple LED drivers
+#
+# CONFIG_ACCESSIBILITY is not set
+# CONFIG_INFINIBAND is not set
+CONFIG_EDAC_SUPPORT=y
+# CONFIG_EDAC is not set
+CONFIG_RTC_LIB=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+CONFIG_RTC_SYSTOHC=y
+CONFIG_RTC_SYSTOHC_DEVICE="rtc0"
+# CONFIG_RTC_DEBUG is not set
+CONFIG_RTC_NVMEM=y
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_SYSFS=y
+CONFIG_RTC_INTF_PROC=y
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_ABB5ZES3 is not set
+# CONFIG_RTC_DRV_ABEOZ9 is not set
+CONFIG_RTC_DRV_ABX80X=m
+CONFIG_RTC_DRV_RPI=y
+CONFIG_RTC_DRV_BRCMSTB=y
+CONFIG_RTC_DRV_DS1307=m
+# CONFIG_RTC_DRV_DS1307_CENTURY is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_HYM8563 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_NCT3018Y is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_ISL12022 is not set
+# CONFIG_RTC_DRV_ISL12026 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+CONFIG_RTC_DRV_PCF8523=m
+# CONFIG_RTC_DRV_PCF85063 is not set
+# CONFIG_RTC_DRV_PCF85363 is not set
+CONFIG_RTC_DRV_PCF8563=m
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+# CONFIG_RTC_DRV_BQ32K is not set
+# CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8010 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+# CONFIG_RTC_DRV_RX8025 is not set
+# CONFIG_RTC_DRV_EM3027 is not set
+# CONFIG_RTC_DRV_RV3028 is not set
+# CONFIG_RTC_DRV_RV3032 is not set
+# CONFIG_RTC_DRV_RV8803 is not set
+# CONFIG_RTC_DRV_SD3078 is not set
+
+#
+# SPI RTC drivers
+#
+# CONFIG_RTC_DRV_M41T93 is not set
+# CONFIG_RTC_DRV_M41T94 is not set
+# CONFIG_RTC_DRV_DS1302 is not set
+# CONFIG_RTC_DRV_DS1305 is not set
+# CONFIG_RTC_DRV_DS1343 is not set
+# CONFIG_RTC_DRV_DS1347 is not set
+# CONFIG_RTC_DRV_DS1390 is not set
+# CONFIG_RTC_DRV_MAX6916 is not set
+# CONFIG_RTC_DRV_R9701 is not set
+# CONFIG_RTC_DRV_RX4581 is not set
+# CONFIG_RTC_DRV_RS5C348 is not set
+# CONFIG_RTC_DRV_MAX6902 is not set
+# CONFIG_RTC_DRV_PCF2123 is not set
+# CONFIG_RTC_DRV_MCP795 is not set
+CONFIG_RTC_I2C_AND_SPI=y
+
+#
+# SPI and I2C RTC drivers
+#
+CONFIG_RTC_DRV_DS3232=m
+CONFIG_RTC_DRV_DS3232_HWMON=y
+CONFIG_RTC_DRV_PCF2127=m
+# CONFIG_RTC_DRV_RV3029C2 is not set
+# CONFIG_RTC_DRV_RX6110 is not set
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1685_FAMILY is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_DS2404 is not set
+# CONFIG_RTC_DRV_EFI is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_MSM6242 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_RP5C01 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+# CONFIG_RTC_DRV_ZYNQMP is not set
+
+#
+# on-CPU RTC drivers
+#
+# CONFIG_RTC_DRV_PL030 is not set
+# CONFIG_RTC_DRV_PL031 is not set
+# CONFIG_RTC_DRV_CADENCE is not set
+# CONFIG_RTC_DRV_FTRTC010 is not set
+# CONFIG_RTC_DRV_R7301 is not set
+
+#
+# HID Sensor RTC drivers
+#
+# CONFIG_RTC_DRV_GOLDFISH is not set
+CONFIG_DMADEVICES=y
+# CONFIG_DMADEVICES_DEBUG is not set
+
+#
+# DMA Devices
+#
+CONFIG_DMA_ENGINE=y
+CONFIG_DMA_VIRTUAL_CHANNELS=y
+CONFIG_DMA_OF=y
+# CONFIG_ALTERA_MSGDMA is not set
+# CONFIG_AMBA_PL08X is not set
+# CONFIG_BCM_SBA_RAID is not set
+CONFIG_DMA_BCM2835=y
+CONFIG_DW_AXI_DMAC=y
+# CONFIG_FSL_EDMA is not set
+# CONFIG_FSL_QDMA is not set
+# CONFIG_INTEL_IDMA64 is not set
+# CONFIG_MV_XOR_V2 is not set
+# CONFIG_PL330_DMA is not set
+# CONFIG_PLX_DMA is not set
+# CONFIG_DMA_BCM2708 is not set
+# CONFIG_XILINX_DMA is not set
+# CONFIG_XILINX_ZYNQMP_DMA is not set
+# CONFIG_XILINX_ZYNQMP_DPDMA is not set
+# CONFIG_QCOM_HIDMA_MGMT is not set
+# CONFIG_QCOM_HIDMA is not set
+# CONFIG_DW_DMAC is not set
+# CONFIG_DW_DMAC_PCI is not set
+# CONFIG_DW_EDMA is not set
+# CONFIG_DW_EDMA_PCIE is not set
+# CONFIG_SF_PDMA is not set
+
+#
+# DMA Clients
+#
+# CONFIG_ASYNC_TX_DMA is not set
+# CONFIG_DMATEST is not set
+
+#
+# DMABUF options
+#
+CONFIG_SYNC_FILE=y
+# CONFIG_SW_SYNC is not set
+# CONFIG_UDMABUF is not set
+# CONFIG_DMABUF_MOVE_NOTIFY is not set
+# CONFIG_DMABUF_DEBUG is not set
+# CONFIG_DMABUF_SELFTESTS is not set
+CONFIG_DMABUF_HEAPS=y
+# CONFIG_DMABUF_SYSFS_STATS is not set
+CONFIG_DMABUF_HEAPS_SYSTEM=y
+CONFIG_DMABUF_HEAPS_CMA=y
+# end of DMABUF options
+
+# CONFIG_AUXDISPLAY is not set
+# CONFIG_UIO is not set
+# CONFIG_VFIO is not set
+# CONFIG_VIRT_DRIVERS is not set
+# CONFIG_VIRTIO_MENU is not set
+# CONFIG_VDPA is not set
+CONFIG_VHOST_MENU=y
+# CONFIG_VHOST_NET is not set
+# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set
+
+#
+# Microsoft Hyper-V guest support
+#
+# end of Microsoft Hyper-V guest support
+
+# CONFIG_GREYBUS is not set
+# CONFIG_COMEDI is not set
+CONFIG_STAGING=y
+# CONFIG_PRISM2_USB is not set
+# CONFIG_RTL8192U is not set
+# CONFIG_RTLLIB is not set
+CONFIG_RTL8723BS=m
+CONFIG_R8712U=m
+CONFIG_R8188EU=m
+# CONFIG_RTS5208 is not set
+# CONFIG_VT6655 is not set
+CONFIG_VT6656=m
+# CONFIG_FB_SM750 is not set
+CONFIG_STAGING_MEDIA=y
+# CONFIG_VIDEO_MAX96712 is not set
+CONFIG_VIDEO_RPIVID=m
+# CONFIG_STAGING_MEDIA_DEPRECATED is not set
+# CONFIG_STAGING_BOARD is not set
+# CONFIG_LTE_GDM724X is not set
+# CONFIG_FB_TFT is not set
+# CONFIG_KS7010 is not set
+# CONFIG_BCM_VIDEOCORE is not set
+# CONFIG_PI433 is not set
+# CONFIG_XIL_AXIS_FIFO is not set
+# CONFIG_FIELDBUS_DEV is not set
+# CONFIG_QLGE is not set
+# CONFIG_VME_BUS is not set
+# CONFIG_GOLDFISH is not set
+# CONFIG_CHROME_PLATFORMS is not set
+# CONFIG_MELLANOX_PLATFORM is not set
+CONFIG_SURFACE_PLATFORMS=y
+CONFIG_HAVE_CLK=y
+CONFIG_HAVE_CLK_PREPARE=y
+CONFIG_COMMON_CLK=y
+
+#
+# Clock driver for ARM Reference designs
+#
+# CONFIG_CLK_ICST is not set
+# CONFIG_CLK_SP810 is not set
+# end of Clock driver for ARM Reference designs
+
+# CONFIG_LMK04832 is not set
+# CONFIG_COMMON_CLK_MAX9485 is not set
+CONFIG_COMMON_CLK_RP1=y
+CONFIG_COMMON_CLK_RP1_SDIO=y
+CONFIG_COMMON_CLK_HIFIBERRY_DACPLUSHD=m
+CONFIG_COMMON_CLK_HIFIBERRY_DACPRO=m
+# CONFIG_COMMON_CLK_SI5341 is not set
+# CONFIG_COMMON_CLK_SI5351 is not set
+# CONFIG_COMMON_CLK_SI514 is not set
+# CONFIG_COMMON_CLK_SI544 is not set
+# CONFIG_COMMON_CLK_SI570 is not set
+# CONFIG_COMMON_CLK_CDCE706 is not set
+# CONFIG_COMMON_CLK_CDCE925 is not set
+# CONFIG_COMMON_CLK_CS2000_CP is not set
+# CONFIG_COMMON_CLK_AXI_CLKGEN is not set
+# CONFIG_COMMON_CLK_XGENE is not set
+# CONFIG_COMMON_CLK_PWM is not set
+# CONFIG_COMMON_CLK_RS9_PCIE is not set
+# CONFIG_COMMON_CLK_VC5 is not set
+# CONFIG_COMMON_CLK_VC7 is not set
+# CONFIG_COMMON_CLK_FIXED_MMIO is not set
+CONFIG_CLK_BCM2711_DVP=y
+CONFIG_CLK_BCM2835=y
+CONFIG_CLK_RASPBERRYPI=y
+# CONFIG_XILINX_VCU is not set
+# CONFIG_COMMON_CLK_XLNX_CLKWZRD is not set
+# CONFIG_HWSPINLOCK is not set
+
+#
+# Clock Source drivers
+#
+CONFIG_TIMER_OF=y
+CONFIG_TIMER_PROBE=y
+CONFIG_CLKSRC_MMIO=y
+CONFIG_ARM_ARCH_TIMER=y
+CONFIG_ARM_ARCH_TIMER_EVTSTREAM=y
+CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND=y
+CONFIG_FSL_ERRATUM_A008585=y
+CONFIG_HISILICON_ERRATUM_161010101=y
+CONFIG_ARM64_ERRATUM_858921=y
+CONFIG_ARM_TIMER_SP804=y
+# CONFIG_MICROCHIP_PIT64B is not set
+# end of Clock Source drivers
+
+CONFIG_MAILBOX=y
+# CONFIG_ARM_MHU is not set
+# CONFIG_ARM_MHU_V2 is not set
+# CONFIG_PLATFORM_MHU is not set
+# CONFIG_PL320_MBOX is not set
+# CONFIG_ALTERA_MBOX is not set
+CONFIG_BCM2835_MBOX=y
+# CONFIG_MAILBOX_TEST is not set
+CONFIG_IOMMU_IOVA=y
+CONFIG_IOMMU_API=y
+CONFIG_IOMMU_SUPPORT=y
+
+#
+# Generic IOMMU Pagetable Support
+#
+# CONFIG_IOMMU_IO_PGTABLE_LPAE is not set
+# CONFIG_IOMMU_IO_PGTABLE_ARMV7S is not set
+# CONFIG_IOMMU_IO_PGTABLE_DART is not set
+# end of Generic IOMMU Pagetable Support
+
+# CONFIG_IOMMU_DEBUGFS is not set
+CONFIG_IOMMU_DEFAULT_DMA_STRICT=y
+# CONFIG_IOMMU_DEFAULT_DMA_LAZY is not set
+# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set
+CONFIG_OF_IOMMU=y
+CONFIG_IOMMU_DMA=y
+# CONFIG_ARM_SMMU is not set
+# CONFIG_ARM_SMMU_V3 is not set
+CONFIG_BCM2712_IOMMU=y
+
+#
+# Remoteproc drivers
+#
+# CONFIG_REMOTEPROC is not set
+# end of Remoteproc drivers
+
+#
+# Rpmsg drivers
+#
+# CONFIG_RPMSG_QCOM_GLINK_RPM is not set
+# CONFIG_RPMSG_VIRTIO is not set
+# end of Rpmsg drivers
+
+# CONFIG_SOUNDWIRE is not set
+
+#
+# SOC (System On Chip) specific Drivers
+#
+
+#
+# Amlogic SoC drivers
+#
+# end of Amlogic SoC drivers
+
+#
+# Broadcom SoC drivers
+#
+CONFIG_BCM2835_POWER=y
+CONFIG_RASPBERRYPI_POWER=y
+CONFIG_SOC_BRCMSTB=y
+CONFIG_BRCMSTB_PM=y
+# end of Broadcom SoC drivers
+
+#
+# NXP/Freescale QorIQ SoC drivers
+#
+# CONFIG_QUICC_ENGINE is not set
+# end of NXP/Freescale QorIQ SoC drivers
+
+#
+# fujitsu SoC drivers
+#
+# end of fujitsu SoC drivers
+
+#
+# i.MX SoC drivers
+#
+# end of i.MX SoC drivers
+
+#
+# Enable LiteX SoC Builder specific drivers
+#
+# CONFIG_LITEX_SOC_CONTROLLER is not set
+# end of Enable LiteX SoC Builder specific drivers
+
+#
+# Qualcomm SoC drivers
+#
+# end of Qualcomm SoC drivers
+
+# CONFIG_SOC_TI is not set
+
+#
+# Xilinx SoC drivers
+#
+# end of Xilinx SoC drivers
+# end of SOC (System On Chip) specific Drivers
+
+# CONFIG_PM_DEVFREQ is not set
+CONFIG_EXTCON=y
+
+#
+# Extcon Device Drivers
+#
+# CONFIG_EXTCON_FSA9480 is not set
+# CONFIG_EXTCON_GPIO is not set
+# CONFIG_EXTCON_MAX3355 is not set
+# CONFIG_EXTCON_PTN5150 is not set
+# CONFIG_EXTCON_RT8973A is not set
+# CONFIG_EXTCON_SM5502 is not set
+# CONFIG_EXTCON_USB_GPIO is not set
+# CONFIG_MEMORY is not set
+# CONFIG_IIO is not set
+# CONFIG_NTB is not set
+CONFIG_PWM=y
+CONFIG_PWM_SYSFS=y
+# CONFIG_PWM_DEBUG is not set
+# CONFIG_PWM_ATMEL_TCB is not set
+CONFIG_PWM_BCM2835=m
+CONFIG_PWM_BRCMSTB=y
+# CONFIG_PWM_CLK is not set
+# CONFIG_PWM_DWC is not set
+# CONFIG_PWM_FSL_FTM is not set
+# CONFIG_PWM_PCA9685 is not set
+CONFIG_PWM_RASPBERRYPI_POE=m
+CONFIG_PWM_RP1=y
+# CONFIG_PWM_XILINX is not set
+
+#
+# IRQ chip support
+#
+CONFIG_IRQCHIP=y
+CONFIG_ARM_GIC=y
+CONFIG_ARM_GIC_MAX_NR=1
+CONFIG_ARM_GIC_V2M=y
+CONFIG_ARM_GIC_V3=y
+CONFIG_ARM_GIC_V3_ITS=y
+CONFIG_ARM_GIC_V3_ITS_PCI=y
+# CONFIG_AL_FIC is not set
+CONFIG_BCM2712_MIP=y
+CONFIG_BCM7038_L1_IRQ=y
+CONFIG_BCM7120_L2_IRQ=y
+CONFIG_BRCMSTB_L2_IRQ=y
+# CONFIG_XILINX_INTC is not set
+CONFIG_PARTITION_PERCPU=y
+# end of IRQ chip support
+
+# CONFIG_IPACK_BUS is not set
+CONFIG_ARCH_HAS_RESET_CONTROLLER=y
+CONFIG_RESET_CONTROLLER=y
+CONFIG_RESET_BRCMSTB=y
+CONFIG_RESET_BRCMSTB_RESCAL=y
+CONFIG_RESET_RASPBERRYPI=y
+CONFIG_RESET_SIMPLE=y
+# CONFIG_RESET_TI_SYSCON is not set
+# CONFIG_RESET_TI_TPS380X is not set
+
+#
+# PHY Subsystem
+#
+CONFIG_GENERIC_PHY=y
+CONFIG_GENERIC_PHY_MIPI_DPHY=y
+# CONFIG_PHY_XGENE is not set
+# CONFIG_PHY_CAN_TRANSCEIVER is not set
+
+#
+# PHY drivers for Broadcom platforms
+#
+# CONFIG_BCM_KONA_USB2_PHY is not set
+# CONFIG_PHY_BRCM_SATA is not set
+CONFIG_PHY_BRCM_USB=y
+# end of PHY drivers for Broadcom platforms
+
+# CONFIG_PHY_CADENCE_TORRENT is not set
+# CONFIG_PHY_CADENCE_DPHY is not set
+# CONFIG_PHY_CADENCE_DPHY_RX is not set
+# CONFIG_PHY_CADENCE_SIERRA is not set
+# CONFIG_PHY_CADENCE_SALVO is not set
+# CONFIG_PHY_PXA_28NM_HSIC is not set
+# CONFIG_PHY_PXA_28NM_USB2 is not set
+# CONFIG_PHY_LAN966X_SERDES is not set
+# CONFIG_PHY_MAPPHONE_MDM6600 is not set
+# CONFIG_PHY_OCELOT_SERDES is not set
+# CONFIG_PHY_SAMSUNG_USB2 is not set
+# end of PHY Subsystem
+
+# CONFIG_POWERCAP is not set
+# CONFIG_MCB is not set
+
+#
+# Performance monitor support
+#
+# CONFIG_ARM_CCI_PMU is not set
+# CONFIG_ARM_CCN is not set
+# CONFIG_ARM_CMN is not set
+CONFIG_ARM_PMU=y
+# CONFIG_ARM_DSU_PMU is not set
+# CONFIG_ARM_SPE_PMU is not set
+CONFIG_RPI_AXIPERF=m
+# CONFIG_HISI_PCIE_PMU is not set
+# CONFIG_HNS3_PMU is not set
+# end of Performance monitor support
+
+CONFIG_RAS=y
+# CONFIG_USB4 is not set
+
+#
+# Android
+#
+# CONFIG_ANDROID_BINDER_IPC is not set
+# end of Android
+
+# CONFIG_LIBNVDIMM is not set
+# CONFIG_DAX is not set
+CONFIG_NVMEM=y
+CONFIG_NVMEM_SYSFS=y
+CONFIG_NVMEM_RMEM=m
+
+#
+# HW tracing support
+#
+# CONFIG_STM is not set
+# CONFIG_INTEL_TH is not set
+# CONFIG_HISI_PTT is not set
+# end of HW tracing support
+
+# CONFIG_FPGA is not set
+# CONFIG_FSI is not set
+# CONFIG_TEE is not set
+CONFIG_PM_OPP=y
+# CONFIG_SIOX is not set
+# CONFIG_SLIMBUS is not set
+# CONFIG_INTERCONNECT is not set
+# CONFIG_COUNTER is not set
+# CONFIG_MOST is not set
+# CONFIG_PECI is not set
+# CONFIG_HTE is not set
+# end of Device Drivers
+
+#
+# File systems
+#
+CONFIG_DCACHE_WORD_ACCESS=y
+CONFIG_VALIDATE_FS_PARSER=y
+CONFIG_FS_IOMAP=y
+# CONFIG_EXT2_FS is not set
+# CONFIG_EXT3_FS is not set
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_USE_FOR_EXT2=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_EXT4_FS_SECURITY=y
+# CONFIG_EXT4_DEBUG is not set
+CONFIG_JBD2=y
+# CONFIG_JBD2_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+CONFIG_REISERFS_FS=m
+# CONFIG_REISERFS_CHECK is not set
+# CONFIG_REISERFS_PROC_INFO is not set
+# CONFIG_REISERFS_FS_XATTR is not set
+CONFIG_JFS_FS=m
+# CONFIG_JFS_POSIX_ACL is not set
+# CONFIG_JFS_SECURITY is not set
+# CONFIG_JFS_DEBUG is not set
+# CONFIG_JFS_STATISTICS is not set
+CONFIG_XFS_FS=m
+CONFIG_XFS_SUPPORT_V4=y
+# CONFIG_XFS_QUOTA is not set
+# CONFIG_XFS_POSIX_ACL is not set
+# CONFIG_XFS_RT is not set
+# CONFIG_XFS_ONLINE_SCRUB is not set
+# CONFIG_XFS_WARN is not set
+# CONFIG_XFS_DEBUG is not set
+# CONFIG_GFS2_FS is not set
+# CONFIG_OCFS2_FS is not set
+CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_POSIX_ACL=y
+# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set
+# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set
+# CONFIG_BTRFS_DEBUG is not set
+# CONFIG_BTRFS_ASSERT is not set
+# CONFIG_BTRFS_FS_REF_VERIFY is not set
+# CONFIG_NILFS2_FS is not set
+# CONFIG_F2FS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+CONFIG_EXPORTFS=y
+# CONFIG_EXPORTFS_BLOCK_OPS is not set
+CONFIG_FILE_LOCKING=y
+# CONFIG_FS_ENCRYPTION is not set
+# CONFIG_FS_VERITY is not set
+CONFIG_FSNOTIFY=y
+CONFIG_DNOTIFY=y
+CONFIG_INOTIFY_USER=y
+CONFIG_FANOTIFY=y
+# CONFIG_QUOTA is not set
+CONFIG_AUTOFS4_FS=y
+CONFIG_AUTOFS_FS=y
+CONFIG_FUSE_FS=m
+# CONFIG_CUSE is not set
+# CONFIG_VIRTIO_FS is not set
+CONFIG_OVERLAY_FS=m
+# CONFIG_OVERLAY_FS_REDIRECT_DIR is not set
+CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y
+# CONFIG_OVERLAY_FS_INDEX is not set
+# CONFIG_OVERLAY_FS_XINO_AUTO is not set
+# CONFIG_OVERLAY_FS_METACOPY is not set
+
+#
+# Caches
+#
+CONFIG_NETFS_SUPPORT=y
+# CONFIG_NETFS_STATS is not set
+CONFIG_FSCACHE=y
+# CONFIG_FSCACHE_STATS is not set
+# CONFIG_FSCACHE_DEBUG is not set
+# CONFIG_CACHEFILES is not set
+# end of Caches
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_UDF_FS=y
+# end of CD-ROM/DVD Filesystems
+
+#
+# DOS/FAT/EXFAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+# CONFIG_MSDOS_FS is not set
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_FAT_DEFAULT_UTF8 is not set
+CONFIG_EXFAT_FS=m
+CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8"
+# CONFIG_NTFS_FS is not set
+CONFIG_NTFS3_FS=m
+# CONFIG_NTFS3_64BIT_CLUSTER is not set
+# CONFIG_NTFS3_LZX_XPRESS is not set
+# CONFIG_NTFS3_FS_POSIX_ACL is not set
+# end of DOS/FAT/EXFAT/NT Filesystems
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+# CONFIG_PROC_KCORE is not set
+CONFIG_PROC_SYSCTL=y
+CONFIG_PROC_PAGE_MONITOR=y
+# CONFIG_PROC_CHILDREN is not set
+CONFIG_KERNFS=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_TMPFS_XATTR=y
+# CONFIG_TMPFS_INODE64 is not set
+CONFIG_ARCH_SUPPORTS_HUGETLBFS=y
+# CONFIG_HUGETLBFS is not set
+CONFIG_MEMFD_CREATE=y
+CONFIG_ARCH_HAS_GIGANTIC_PAGE=y
+CONFIG_CONFIGFS_FS=y
+CONFIG_EFIVAR_FS=m
+# end of Pseudo filesystems
+
+CONFIG_MISC_FILESYSTEMS=y
+# CONFIG_ORANGEFS_FS is not set
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_ECRYPT_FS is not set
+CONFIG_HFS_FS=y
+CONFIG_HFSPLUS_FS=y
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_CRAMFS is not set
+CONFIG_SQUASHFS=y
+# CONFIG_SQUASHFS_FILE_CACHE is not set
+CONFIG_SQUASHFS_FILE_DIRECT=y
+# CONFIG_SQUASHFS_DECOMP_SINGLE is not set
+# CONFIG_SQUASHFS_DECOMP_MULTI is not set
+CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y
+# CONFIG_SQUASHFS_XATTR is not set
+CONFIG_SQUASHFS_ZLIB=y
+CONFIG_SQUASHFS_LZ4=y
+CONFIG_SQUASHFS_LZO=y
+CONFIG_SQUASHFS_XZ=y
+CONFIG_SQUASHFS_ZSTD=y
+# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set
+# CONFIG_SQUASHFS_EMBEDDED is not set
+CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3
+# CONFIG_VXFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_OMFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_QNX6FS_FS is not set
+# CONFIG_ROMFS_FS is not set
+CONFIG_PSTORE=y
+CONFIG_PSTORE_DEFAULT_KMSG_BYTES=10240
+CONFIG_PSTORE_DEFLATE_COMPRESS=y
+# CONFIG_PSTORE_LZO_COMPRESS is not set
+# CONFIG_PSTORE_LZ4_COMPRESS is not set
+# CONFIG_PSTORE_LZ4HC_COMPRESS is not set
+# CONFIG_PSTORE_842_COMPRESS is not set
+# CONFIG_PSTORE_ZSTD_COMPRESS is not set
+CONFIG_PSTORE_COMPRESS=y
+CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT=y
+CONFIG_PSTORE_COMPRESS_DEFAULT="deflate"
+CONFIG_PSTORE_CONSOLE=y
+# CONFIG_PSTORE_PMSG is not set
+# CONFIG_PSTORE_FTRACE is not set
+CONFIG_PSTORE_RAM=y
+# CONFIG_PSTORE_BLK is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+# CONFIG_EROFS_FS is not set
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V2=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+CONFIG_NFS_V4=y
+CONFIG_NFS_SWAP=y
+CONFIG_NFS_V4_1=y
+CONFIG_NFS_V4_2=y
+CONFIG_PNFS_FILE_LAYOUT=y
+CONFIG_PNFS_BLOCK=m
+CONFIG_PNFS_FLEXFILE_LAYOUT=y
+CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org"
+CONFIG_NFS_V4_1_MIGRATION=y
+CONFIG_ROOT_NFS=y
+CONFIG_NFS_FSCACHE=y
+# CONFIG_NFS_USE_LEGACY_DNS is not set
+CONFIG_NFS_USE_KERNEL_DNS=y
+CONFIG_NFS_DISABLE_UDP_SUPPORT=y
+# CONFIG_NFS_V4_2_READ_PLUS is not set
+# CONFIG_NFSD is not set
+CONFIG_GRACE_PERIOD=y
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_COMMON=y
+CONFIG_NFS_V4_2_SSC_HELPER=y
+CONFIG_SUNRPC=y
+CONFIG_SUNRPC_GSS=y
+CONFIG_SUNRPC_BACKCHANNEL=y
+CONFIG_SUNRPC_SWAP=y
+CONFIG_RPCSEC_GSS_KRB5=m
+# CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES is not set
+# CONFIG_SUNRPC_DEBUG is not set
+# CONFIG_CEPH_FS is not set
+CONFIG_CIFS=y
+CONFIG_CIFS_STATS2=y
+CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y
+# CONFIG_CIFS_UPCALL is not set
+# CONFIG_CIFS_XATTR is not set
+CONFIG_CIFS_DEBUG=y
+# CONFIG_CIFS_DEBUG2 is not set
+# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set
+# CONFIG_CIFS_DFS_UPCALL is not set
+# CONFIG_CIFS_SWN_UPCALL is not set
+CONFIG_CIFS_FSCACHE=y
+# CONFIG_CIFS_ROOT is not set
+# CONFIG_SMB_SERVER is not set
+CONFIG_SMBFS=y
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+# CONFIG_NLS_CODEPAGE_850 is not set
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+# CONFIG_NLS_CODEPAGE_932 is not set
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+# CONFIG_NLS_MAC_ROMAN is not set
+# CONFIG_NLS_MAC_CELTIC is not set
+# CONFIG_NLS_MAC_CENTEURO is not set
+# CONFIG_NLS_MAC_CROATIAN is not set
+# CONFIG_NLS_MAC_CYRILLIC is not set
+# CONFIG_NLS_MAC_GAELIC is not set
+# CONFIG_NLS_MAC_GREEK is not set
+# CONFIG_NLS_MAC_ICELAND is not set
+# CONFIG_NLS_MAC_INUIT is not set
+# CONFIG_NLS_MAC_ROMANIAN is not set
+# CONFIG_NLS_MAC_TURKISH is not set
+CONFIG_NLS_UTF8=y
+# CONFIG_DLM is not set
+# CONFIG_UNICODE is not set
+CONFIG_IO_WQ=y
+# end of File systems
+
+#
+# Security options
+#
+CONFIG_KEYS=y
+# CONFIG_KEYS_REQUEST_CACHE is not set
+# CONFIG_PERSISTENT_KEYRINGS is not set
+# CONFIG_TRUSTED_KEYS is not set
+# CONFIG_ENCRYPTED_KEYS is not set
+CONFIG_KEY_DH_OPERATIONS=y
+# CONFIG_SECURITY_DMESG_RESTRICT is not set
+# CONFIG_SECURITY is not set
+# CONFIG_SECURITYFS is not set
+CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y
+# CONFIG_HARDENED_USERCOPY is not set
+# CONFIG_FORTIFY_SOURCE is not set
+# CONFIG_STATIC_USERMODEHELPER is not set
+# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set
+CONFIG_DEFAULT_SECURITY_DAC=y
+CONFIG_LSM="yama,loadpin,safesetid,integrity"
+
+#
+# Kernel hardening options
+#
+
+#
+# Memory initialization
+#
+CONFIG_CC_HAS_AUTO_VAR_INIT_PATTERN=y
+CONFIG_CC_HAS_AUTO_VAR_INIT_ZERO_BARE=y
+CONFIG_CC_HAS_AUTO_VAR_INIT_ZERO=y
+CONFIG_INIT_STACK_NONE=y
+# CONFIG_INIT_STACK_ALL_PATTERN is not set
+# CONFIG_INIT_STACK_ALL_ZERO is not set
+# CONFIG_GCC_PLUGIN_STACKLEAK is not set
+# CONFIG_INIT_ON_ALLOC_DEFAULT_ON is not set
+# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set
+CONFIG_CC_HAS_ZERO_CALL_USED_REGS=y
+# CONFIG_ZERO_CALL_USED_REGS is not set
+# end of Memory initialization
+
+CONFIG_RANDSTRUCT_NONE=y
+# CONFIG_RANDSTRUCT_FULL is not set
+# CONFIG_RANDSTRUCT_PERFORMANCE is not set
+# end of Kernel hardening options
+# end of Security options
+
+CONFIG_XOR_BLOCKS=m
+CONFIG_CRYPTO=y
+
+#
+# Crypto core or helper
+#
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
+CONFIG_CRYPTO_AEAD=y
+CONFIG_CRYPTO_AEAD2=y
+CONFIG_CRYPTO_SKCIPHER=y
+CONFIG_CRYPTO_SKCIPHER2=y
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG=y
+CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_RNG_DEFAULT=y
+CONFIG_CRYPTO_AKCIPHER2=y
+CONFIG_CRYPTO_AKCIPHER=y
+CONFIG_CRYPTO_KPP2=y
+CONFIG_CRYPTO_KPP=y
+CONFIG_CRYPTO_ACOMP2=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
+# CONFIG_CRYPTO_USER is not set
+CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y
+CONFIG_CRYPTO_GF128MUL=y
+CONFIG_CRYPTO_NULL=y
+CONFIG_CRYPTO_NULL2=y
+# CONFIG_CRYPTO_PCRYPT is not set
+CONFIG_CRYPTO_CRYPTD=y
+CONFIG_CRYPTO_AUTHENC=y
+# CONFIG_CRYPTO_TEST is not set
+# end of Crypto core or helper
+
+#
+# Public-key cryptography
+#
+CONFIG_CRYPTO_RSA=y
+CONFIG_CRYPTO_DH=y
+# CONFIG_CRYPTO_DH_RFC7919_GROUPS is not set
+CONFIG_CRYPTO_ECC=m
+CONFIG_CRYPTO_ECDH=m
+# CONFIG_CRYPTO_ECDSA is not set
+# CONFIG_CRYPTO_ECRDSA is not set
+# CONFIG_CRYPTO_SM2 is not set
+# CONFIG_CRYPTO_CURVE25519 is not set
+# end of Public-key cryptography
+
+#
+# Block ciphers
+#
+CONFIG_CRYPTO_AES=y
+# CONFIG_CRYPTO_AES_TI is not set
+# CONFIG_CRYPTO_ANUBIS is not set
+# CONFIG_CRYPTO_ARIA is not set
+# CONFIG_CRYPTO_BLOWFISH is not set
+# CONFIG_CRYPTO_CAMELLIA is not set
+# CONFIG_CRYPTO_CAST5 is not set
+# CONFIG_CRYPTO_CAST6 is not set
+CONFIG_CRYPTO_DES=y
+# CONFIG_CRYPTO_FCRYPT is not set
+# CONFIG_CRYPTO_KHAZAD is not set
+# CONFIG_CRYPTO_SEED is not set
+# CONFIG_CRYPTO_SERPENT is not set
+CONFIG_CRYPTO_SM4=m
+# CONFIG_CRYPTO_SM4_GENERIC is not set
+# CONFIG_CRYPTO_TEA is not set
+# CONFIG_CRYPTO_TWOFISH is not set
+# end of Block ciphers
+
+#
+# Length-preserving ciphers and modes
+#
+# CONFIG_CRYPTO_ADIANTUM is not set
+# CONFIG_CRYPTO_ARC4 is not set
+# CONFIG_CRYPTO_CHACHA20 is not set
+CONFIG_CRYPTO_CBC=y
+CONFIG_CRYPTO_CFB=m
+CONFIG_CRYPTO_CTR=y
+CONFIG_CRYPTO_CTS=m
+CONFIG_CRYPTO_ECB=y
+# CONFIG_CRYPTO_HCTR2 is not set
+# CONFIG_CRYPTO_KEYWRAP is not set
+# CONFIG_CRYPTO_LRW is not set
+CONFIG_CRYPTO_OFB=m
+# CONFIG_CRYPTO_PCBC is not set
+# CONFIG_CRYPTO_XTS is not set
+# end of Length-preserving ciphers and modes
+
+#
+# AEAD (authenticated encryption with associated data) ciphers
+#
+# CONFIG_CRYPTO_AEGIS128 is not set
+# CONFIG_CRYPTO_CHACHA20POLY1305 is not set
+CONFIG_CRYPTO_CCM=y
+CONFIG_CRYPTO_GCM=y
+CONFIG_CRYPTO_SEQIV=y
+CONFIG_CRYPTO_ECHAINIV=y
+# CONFIG_CRYPTO_ESSIV is not set
+# end of AEAD (authenticated encryption with associated data) ciphers
+
+#
+# Hashes, digests, and MACs
+#
+CONFIG_CRYPTO_BLAKE2B=m
+CONFIG_CRYPTO_CMAC=y
+CONFIG_CRYPTO_GHASH=y
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_MD4=y
+CONFIG_CRYPTO_MD5=y
+# CONFIG_CRYPTO_MICHAEL_MIC is not set
+# CONFIG_CRYPTO_POLY1305 is not set
+# CONFIG_CRYPTO_RMD160 is not set
+CONFIG_CRYPTO_SHA1=y
+CONFIG_CRYPTO_SHA256=y
+CONFIG_CRYPTO_SHA512=y
+CONFIG_CRYPTO_SHA3=m
+CONFIG_CRYPTO_SM3=m
+# CONFIG_CRYPTO_SM3_GENERIC is not set
+# CONFIG_CRYPTO_STREEBOG is not set
+# CONFIG_CRYPTO_VMAC is not set
+# CONFIG_CRYPTO_WP512 is not set
+# CONFIG_CRYPTO_XCBC is not set
+CONFIG_CRYPTO_XXHASH=m
+# end of Hashes, digests, and MACs
+
+#
+# CRCs (cyclic redundancy checks)
+#
+CONFIG_CRYPTO_CRC32C=y
+CONFIG_CRYPTO_CRC32=y
+# CONFIG_CRYPTO_CRCT10DIF is not set
+# end of CRCs (cyclic redundancy checks)
+
+#
+# Compression
+#
+CONFIG_CRYPTO_DEFLATE=y
+CONFIG_CRYPTO_LZO=m
+# CONFIG_CRYPTO_842 is not set
+# CONFIG_CRYPTO_LZ4 is not set
+# CONFIG_CRYPTO_LZ4HC is not set
+# CONFIG_CRYPTO_ZSTD is not set
+# end of Compression
+
+#
+# Random number generation
+#
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_CRYPTO_DRBG_MENU=y
+CONFIG_CRYPTO_DRBG_HMAC=y
+# CONFIG_CRYPTO_DRBG_HASH is not set
+# CONFIG_CRYPTO_DRBG_CTR is not set
+CONFIG_CRYPTO_DRBG=y
+CONFIG_CRYPTO_JITTERENTROPY=y
+CONFIG_CRYPTO_KDF800108_CTR=y
+# end of Random number generation
+
+#
+# Userspace interface
+#
+CONFIG_CRYPTO_USER_API=y
+CONFIG_CRYPTO_USER_API_HASH=y
+CONFIG_CRYPTO_USER_API_SKCIPHER=y
+# CONFIG_CRYPTO_USER_API_RNG is not set
+# CONFIG_CRYPTO_USER_API_AEAD is not set
+CONFIG_CRYPTO_USER_API_ENABLE_OBSOLETE=y
+# end of Userspace interface
+
+CONFIG_CRYPTO_HASH_INFO=y
+# CONFIG_CRYPTO_NHPOLY1305_NEON is not set
+CONFIG_CRYPTO_CHACHA20_NEON=m
+
+#
+# Accelerated Cryptographic Algorithms for CPU (arm64)
+#
+CONFIG_CRYPTO_GHASH_ARM64_CE=m
+CONFIG_CRYPTO_POLY1305_NEON=m
+CONFIG_CRYPTO_SHA1_ARM64_CE=m
+CONFIG_CRYPTO_SHA256_ARM64=m
+CONFIG_CRYPTO_SHA2_ARM64_CE=m
+CONFIG_CRYPTO_SHA512_ARM64=m
+CONFIG_CRYPTO_SHA512_ARM64_CE=m
+CONFIG_CRYPTO_SHA3_ARM64=m
+# CONFIG_CRYPTO_SM3_NEON is not set
+CONFIG_CRYPTO_SM3_ARM64_CE=m
+# CONFIG_CRYPTO_POLYVAL_ARM64_CE is not set
+CONFIG_CRYPTO_AES_ARM64=y
+CONFIG_CRYPTO_AES_ARM64_CE=m
+CONFIG_CRYPTO_AES_ARM64_CE_BLK=m
+CONFIG_CRYPTO_AES_ARM64_NEON_BLK=m
+CONFIG_CRYPTO_AES_ARM64_BS=m
+CONFIG_CRYPTO_SM4_ARM64_CE=m
+# CONFIG_CRYPTO_SM4_ARM64_CE_BLK is not set
+# CONFIG_CRYPTO_SM4_ARM64_NEON_BLK is not set
+CONFIG_CRYPTO_AES_ARM64_CE_CCM=m
+# end of Accelerated Cryptographic Algorithms for CPU (arm64)
+
+# CONFIG_CRYPTO_HW is not set
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS8_PRIVATE_KEY_PARSER=m
+CONFIG_PKCS7_MESSAGE_PARSER=y
+# CONFIG_PKCS7_TEST_KEY is not set
+# CONFIG_SIGNED_PE_FILE_VERIFICATION is not set
+# CONFIG_FIPS_SIGNATURE_SELFTEST is not set
+
+#
+# Certificates for signature checking
+#
+CONFIG_SYSTEM_TRUSTED_KEYRING=y
+CONFIG_SYSTEM_TRUSTED_KEYS=""
+# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set
+# CONFIG_SECONDARY_TRUSTED_KEYRING is not set
+# CONFIG_SYSTEM_BLACKLIST_KEYRING is not set
+# end of Certificates for signature checking
+
+CONFIG_BINARY_PRINTF=y
+
+#
+# Library routines
+#
+CONFIG_RAID6_PQ=m
+CONFIG_RAID6_PQ_BENCHMARK=y
+CONFIG_LINEAR_RANGES=y
+# CONFIG_PACKING is not set
+CONFIG_BITREVERSE=y
+CONFIG_HAVE_ARCH_BITREVERSE=y
+CONFIG_GENERIC_STRNCPY_FROM_USER=y
+CONFIG_GENERIC_STRNLEN_USER=y
+CONFIG_GENERIC_NET_UTILS=y
+CONFIG_CORDIC=m
+# CONFIG_PRIME_NUMBERS is not set
+CONFIG_RATIONAL=y
+CONFIG_GENERIC_PCI_IOMAP=y
+CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y
+CONFIG_ARCH_HAS_FAST_MULTIPLIER=y
+CONFIG_ARCH_USE_SYM_ANNOTATIONS=y
+# CONFIG_INDIRECT_PIO is not set
+# CONFIG_TRACE_MMIO_ACCESS is not set
+
+#
+# Crypto library routines
+#
+CONFIG_CRYPTO_LIB_UTILS=y
+CONFIG_CRYPTO_LIB_AES=y
+CONFIG_CRYPTO_LIB_ARC4=m
+CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=y
+CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m
+CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m
+CONFIG_CRYPTO_LIB_CHACHA=m
+CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m
+CONFIG_CRYPTO_LIB_CURVE25519=m
+CONFIG_CRYPTO_LIB_DES=y
+CONFIG_CRYPTO_LIB_POLY1305_RSIZE=9
+CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m
+CONFIG_CRYPTO_LIB_POLY1305=m
+CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
+CONFIG_CRYPTO_LIB_SHA1=y
+CONFIG_CRYPTO_LIB_SHA256=y
+# end of Crypto library routines
+
+CONFIG_CRC_CCITT=m
+CONFIG_CRC16=y
+# CONFIG_CRC_T10DIF is not set
+# CONFIG_CRC64_ROCKSOFT is not set
+CONFIG_CRC_ITU_T=y
+CONFIG_CRC32=y
+# CONFIG_CRC32_SELFTEST is not set
+CONFIG_CRC32_SLICEBY8=y
+# CONFIG_CRC32_SLICEBY4 is not set
+# CONFIG_CRC32_SARWATE is not set
+# CONFIG_CRC32_BIT is not set
+# CONFIG_CRC64 is not set
+# CONFIG_CRC4 is not set
+# CONFIG_CRC7 is not set
+CONFIG_LIBCRC32C=m
+# CONFIG_CRC8 is not set
+CONFIG_XXHASH=y
+CONFIG_AUDIT_ARCH_COMPAT_GENERIC=y
+# CONFIG_RANDOM32_SELFTEST is not set
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=y
+CONFIG_LZO_COMPRESS=m
+CONFIG_LZO_DECOMPRESS=y
+CONFIG_LZ4_DECOMPRESS=y
+CONFIG_ZSTD_COMMON=y
+CONFIG_ZSTD_COMPRESS=m
+CONFIG_ZSTD_DECOMPRESS=y
+CONFIG_XZ_DEC=y
+# CONFIG_XZ_DEC_X86 is not set
+# CONFIG_XZ_DEC_POWERPC is not set
+# CONFIG_XZ_DEC_IA64 is not set
+# CONFIG_XZ_DEC_ARM is not set
+# CONFIG_XZ_DEC_ARMTHUMB is not set
+# CONFIG_XZ_DEC_SPARC is not set
+# CONFIG_XZ_DEC_MICROLZMA is not set
+# CONFIG_XZ_DEC_TEST is not set
+CONFIG_GENERIC_ALLOCATOR=y
+CONFIG_REED_SOLOMON=y
+CONFIG_REED_SOLOMON_ENC8=y
+CONFIG_REED_SOLOMON_DEC8=y
+CONFIG_XARRAY_MULTI=y
+CONFIG_ASSOCIATIVE_ARRAY=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT_MAP=y
+CONFIG_HAS_DMA=y
+CONFIG_DMA_OPS=y
+CONFIG_NEED_SG_DMA_LENGTH=y
+CONFIG_NEED_DMA_MAP_STATE=y
+CONFIG_ARCH_DMA_ADDR_T_64BIT=y
+CONFIG_DMA_DECLARE_COHERENT=y
+CONFIG_ARCH_HAS_SETUP_DMA_OPS=y
+CONFIG_ARCH_HAS_TEARDOWN_DMA_OPS=y
+CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE=y
+CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU=y
+CONFIG_ARCH_HAS_DMA_PREP_COHERENT=y
+CONFIG_SWIOTLB=y
+# CONFIG_DMA_RESTRICTED_POOL is not set
+CONFIG_DMA_NONCOHERENT_MMAP=y
+CONFIG_DMA_COHERENT_POOL=y
+CONFIG_DMA_DIRECT_REMAP=y
+CONFIG_DMA_CMA=y
+# CONFIG_DMA_PERNUMA_CMA is not set
+
+#
+# Default contiguous memory area size:
+#
+CONFIG_CMA_SIZE_MBYTES=5
+CONFIG_CMA_SIZE_SEL_MBYTES=y
+# CONFIG_CMA_SIZE_SEL_PERCENTAGE is not set
+# CONFIG_CMA_SIZE_SEL_MIN is not set
+# CONFIG_CMA_SIZE_SEL_MAX is not set
+CONFIG_CMA_ALIGNMENT=8
+# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_DMA_MAP_BENCHMARK is not set
+CONFIG_SGL_ALLOC=y
+# CONFIG_FORCE_NR_CPUS is not set
+CONFIG_CPU_RMAP=y
+CONFIG_DQL=y
+CONFIG_GLOB=y
+# CONFIG_GLOB_SELFTEST is not set
+CONFIG_NLATTR=y
+CONFIG_CLZ_TAB=y
+# CONFIG_IRQ_POLL is not set
+CONFIG_MPILIB=y
+CONFIG_DIMLIB=y
+CONFIG_LIBFDT=y
+CONFIG_OID_REGISTRY=y
+CONFIG_UCS2_STRING=y
+CONFIG_HAVE_GENERIC_VDSO=y
+CONFIG_GENERIC_GETTIMEOFDAY=y
+CONFIG_GENERIC_VDSO_TIME_NS=y
+CONFIG_FONT_SUPPORT=y
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+CONFIG_SG_POOL=y
+CONFIG_ARCH_STACKWALK=y
+CONFIG_STACKDEPOT=y
+CONFIG_SBITMAP=y
+# end of Library routines
+
+CONFIG_GENERIC_IOREMAP=y
+CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED=y
+
+#
+# Kernel hacking
+#
+
+#
+# printk and dmesg options
+#
+CONFIG_PRINTK_TIME=y
+# CONFIG_PRINTK_CALLER is not set
+# CONFIG_STACKTRACE_BUILD_ID is not set
+CONFIG_CONSOLE_LOGLEVEL_DEFAULT=7
+CONFIG_CONSOLE_LOGLEVEL_QUIET=4
+CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4
+# CONFIG_BOOT_PRINTK_DELAY is not set
+CONFIG_DYNAMIC_DEBUG=y
+CONFIG_DYNAMIC_DEBUG_CORE=y
+CONFIG_SYMBOLIC_ERRNAME=y
+# CONFIG_DEBUG_BUGVERBOSE is not set
+# end of printk and dmesg options
+
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_MISC=y
+
+#
+# Compile-time checks and compiler options
+#
+CONFIG_AS_HAS_NON_CONST_LEB128=y
+CONFIG_DEBUG_INFO_NONE=y
+# CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT is not set
+# CONFIG_DEBUG_INFO_DWARF4 is not set
+# CONFIG_DEBUG_INFO_DWARF5 is not set
+CONFIG_FRAME_WARN=2048
+CONFIG_STRIP_ASM_SYMS=y
+# CONFIG_READABLE_ASM is not set
+# CONFIG_HEADERS_INSTALL is not set
+# CONFIG_DEBUG_SECTION_MISMATCH is not set
+CONFIG_SECTION_MISMATCH_WARN_ONLY=y
+# CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B is not set
+CONFIG_ARCH_WANT_FRAME_POINTERS=y
+CONFIG_FRAME_POINTER=y
+# CONFIG_VMLINUX_MAP is not set
+# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set
+# end of Compile-time checks and compiler options
+
+#
+# Generic Kernel Debugging Instruments
+#
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1
+CONFIG_MAGIC_SYSRQ_SERIAL=y
+CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE=""
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_FS_ALLOW_ALL=y
+# CONFIG_DEBUG_FS_DISALLOW_MOUNT is not set
+# CONFIG_DEBUG_FS_ALLOW_NONE is not set
+CONFIG_HAVE_ARCH_KGDB=y
+# CONFIG_KGDB is not set
+CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y
+# CONFIG_UBSAN is not set
+CONFIG_HAVE_ARCH_KCSAN=y
+CONFIG_HAVE_KCSAN_COMPILER=y
+# CONFIG_KCSAN is not set
+# end of Generic Kernel Debugging Instruments
+
+#
+# Networking Debugging
+#
+# CONFIG_NET_DEV_REFCNT_TRACKER is not set
+# CONFIG_NET_NS_REFCNT_TRACKER is not set
+# CONFIG_DEBUG_NET is not set
+# end of Networking Debugging
+
+#
+# Memory Debugging
+#
+# CONFIG_PAGE_EXTENSION is not set
+# CONFIG_DEBUG_PAGEALLOC is not set
+CONFIG_SLUB_DEBUG=y
+# CONFIG_SLUB_DEBUG_ON is not set
+# CONFIG_PAGE_OWNER is not set
+# CONFIG_PAGE_POISONING is not set
+# CONFIG_DEBUG_PAGE_REF is not set
+# CONFIG_DEBUG_RODATA_TEST is not set
+CONFIG_ARCH_HAS_DEBUG_WX=y
+# CONFIG_DEBUG_WX is not set
+CONFIG_GENERIC_PTDUMP=y
+# CONFIG_PTDUMP_DEBUGFS is not set
+# CONFIG_DEBUG_OBJECTS is not set
+# CONFIG_SHRINKER_DEBUG is not set
+CONFIG_HAVE_DEBUG_KMEMLEAK=y
+# CONFIG_DEBUG_KMEMLEAK is not set
+# CONFIG_DEBUG_STACK_USAGE is not set
+# CONFIG_SCHED_STACK_END_CHECK is not set
+CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y
+# CONFIG_DEBUG_VM is not set
+# CONFIG_DEBUG_VM_PGTABLE is not set
+CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y
+# CONFIG_DEBUG_VIRTUAL is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_DEBUG_PER_CPU_MAPS is not set
+CONFIG_HAVE_ARCH_KASAN=y
+CONFIG_HAVE_ARCH_KASAN_SW_TAGS=y
+CONFIG_HAVE_ARCH_KASAN_HW_TAGS=y
+CONFIG_HAVE_ARCH_KASAN_VMALLOC=y
+CONFIG_CC_HAS_KASAN_GENERIC=y
+CONFIG_CC_HAS_KASAN_SW_TAGS=y
+CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y
+# CONFIG_KASAN is not set
+CONFIG_HAVE_ARCH_KFENCE=y
+# CONFIG_KFENCE is not set
+# end of Memory Debugging
+
+# CONFIG_DEBUG_SHIRQ is not set
+
+#
+# Debug Oops, Lockups and Hangs
+#
+# CONFIG_PANIC_ON_OOPS is not set
+CONFIG_PANIC_ON_OOPS_VALUE=0
+CONFIG_PANIC_TIMEOUT=0
+# CONFIG_SOFTLOCKUP_DETECTOR is not set
+# CONFIG_DETECT_HUNG_TASK is not set
+# CONFIG_WQ_WATCHDOG is not set
+# CONFIG_TEST_LOCKUP is not set
+# end of Debug Oops, Lockups and Hangs
+
+#
+# Scheduler Debugging
+#
+# CONFIG_SCHED_DEBUG is not set
+# CONFIG_SCHEDSTATS is not set
+# end of Scheduler Debugging
+
+# CONFIG_DEBUG_TIMEKEEPING is not set
+
+#
+# Lock Debugging (spinlocks, mutexes, etc...)
+#
+CONFIG_LOCK_DEBUGGING_SUPPORT=y
+# CONFIG_PROVE_LOCKING is not set
+# CONFIG_LOCK_STAT is not set
+# CONFIG_DEBUG_RT_MUTEXES is not set
+# CONFIG_DEBUG_SPINLOCK is not set
+# CONFIG_DEBUG_MUTEXES is not set
+# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set
+# CONFIG_DEBUG_RWSEMS is not set
+# CONFIG_DEBUG_LOCK_ALLOC is not set
+# CONFIG_DEBUG_ATOMIC_SLEEP is not set
+# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+# CONFIG_LOCK_TORTURE_TEST is not set
+# CONFIG_WW_MUTEX_SELFTEST is not set
+# CONFIG_SCF_TORTURE_TEST is not set
+# CONFIG_CSD_LOCK_WAIT_DEBUG is not set
+# end of Lock Debugging (spinlocks, mutexes, etc...)
+
+CONFIG_TRACE_IRQFLAGS=y
+CONFIG_TRACE_IRQFLAGS_NMI=y
+# CONFIG_DEBUG_IRQFLAGS is not set
+CONFIG_STACKTRACE=y
+# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set
+# CONFIG_DEBUG_KOBJECT is not set
+
+#
+# Debug kernel data structures
+#
+# CONFIG_DEBUG_LIST is not set
+# CONFIG_DEBUG_PLIST is not set
+# CONFIG_DEBUG_SG is not set
+# CONFIG_DEBUG_NOTIFIERS is not set
+# CONFIG_BUG_ON_DATA_CORRUPTION is not set
+# CONFIG_DEBUG_MAPLE_TREE is not set
+# end of Debug kernel data structures
+
+# CONFIG_DEBUG_CREDENTIALS is not set
+
+#
+# RCU Debugging
+#
+# CONFIG_RCU_SCALE_TEST is not set
+# CONFIG_RCU_TORTURE_TEST is not set
+# CONFIG_RCU_REF_SCALE_TEST is not set
+CONFIG_RCU_CPU_STALL_TIMEOUT=21
+CONFIG_RCU_EXP_CPU_STALL_TIMEOUT=0
+# CONFIG_RCU_TRACE is not set
+# CONFIG_RCU_EQS_DEBUG is not set
+# end of RCU Debugging
+
+# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set
+# CONFIG_LATENCYTOP is not set
+CONFIG_NOP_TRACER=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_HAVE_SYSCALL_TRACEPOINTS=y
+CONFIG_HAVE_C_RECORDMCOUNT=y
+CONFIG_TRACER_MAX_TRACE=y
+CONFIG_TRACE_CLOCK=y
+CONFIG_RING_BUFFER=y
+CONFIG_EVENT_TRACING=y
+CONFIG_CONTEXT_SWITCH_TRACER=y
+CONFIG_RING_BUFFER_ALLOW_SWAP=y
+CONFIG_PREEMPTIRQ_TRACEPOINTS=y
+CONFIG_TRACING=y
+CONFIG_GENERIC_TRACER=y
+CONFIG_TRACING_SUPPORT=y
+CONFIG_FTRACE=y
+CONFIG_BOOTTIME_TRACING=y
+CONFIG_FUNCTION_TRACER=y
+CONFIG_FUNCTION_GRAPH_TRACER=y
+CONFIG_DYNAMIC_FTRACE=y
+CONFIG_DYNAMIC_FTRACE_WITH_REGS=y
+# CONFIG_FUNCTION_PROFILER is not set
+CONFIG_STACK_TRACER=y
+CONFIG_IRQSOFF_TRACER=y
+CONFIG_SCHED_TRACER=y
+# CONFIG_HWLAT_TRACER is not set
+# CONFIG_OSNOISE_TRACER is not set
+# CONFIG_TIMERLAT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
+CONFIG_TRACER_SNAPSHOT=y
+CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y
+CONFIG_BRANCH_PROFILE_NONE=y
+# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set
+# CONFIG_PROFILE_ALL_BRANCHES is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_KPROBE_EVENTS=y
+# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set
+# CONFIG_UPROBE_EVENTS is not set
+CONFIG_BPF_EVENTS=y
+CONFIG_DYNAMIC_EVENTS=y
+CONFIG_PROBE_EVENTS=y
+# CONFIG_BPF_KPROBE_OVERRIDE is not set
+CONFIG_FTRACE_MCOUNT_RECORD=y
+CONFIG_FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY=y
+# CONFIG_SYNTH_EVENTS is not set
+# CONFIG_HIST_TRIGGERS is not set
+# CONFIG_TRACE_EVENT_INJECT is not set
+# CONFIG_TRACEPOINT_BENCHMARK is not set
+# CONFIG_RING_BUFFER_BENCHMARK is not set
+# CONFIG_TRACE_EVAL_MAP_FILE is not set
+# CONFIG_FTRACE_RECORD_RECURSION is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_RING_BUFFER_STARTUP_TEST is not set
+# CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS is not set
+# CONFIG_PREEMPTIRQ_DELAY_TEST is not set
+# CONFIG_KPROBE_EVENT_GEN_TEST is not set
+# CONFIG_RV is not set
+# CONFIG_SAMPLES is not set
+# CONFIG_STRICT_DEVMEM is not set
+
+#
+# arm64 Debugging
+#
+# CONFIG_PID_IN_CONTEXTIDR is not set
+# CONFIG_ARM64_RELOC_TEST is not set
+# CONFIG_CORESIGHT is not set
+# end of arm64 Debugging
+
+#
+# Kernel Testing and Coverage
+#
+# CONFIG_KUNIT is not set
+# CONFIG_NOTIFIER_ERROR_INJECTION is not set
+CONFIG_FUNCTION_ERROR_INJECTION=y
+# CONFIG_FAULT_INJECTION is not set
+CONFIG_ARCH_HAS_KCOV=y
+CONFIG_CC_HAS_SANCOV_TRACE_PC=y
+# CONFIG_KCOV is not set
+# CONFIG_RUNTIME_TESTING_MENU is not set
+CONFIG_ARCH_USE_MEMTEST=y
+# CONFIG_MEMTEST is not set
+# end of Kernel Testing and Coverage
+
+#
+# Rust hacking
+#
+# end of Rust hacking
+# end of Kernel hacking
diff --git a/projects/RPi/devices/RPi5/options b/projects/RPi/devices/RPi5/options
new file mode 100644
index 0000000000..1377a7e468
--- /dev/null
+++ b/projects/RPi/devices/RPi5/options
@@ -0,0 +1,43 @@
+################################################################################
+# Device defaults for the Raspberry Pi 5 (RPi5)
+################################################################################
+
+  # NOOBS supported hex versions (legacy) are not relevant for RPi5, so clear them
+    unset NOOBS_HEX
+
+  # NOOBS supported model versions
+    NOOBS_SUPPORTED_MODELS='"Pi 5"'
+
+  # additional Firmware to use (dvb-firmware, misc-firmware, wlan-firmware)
+    FIRMWARE="${FIRMWARE} rpi-eeprom flashrom"
+
+  # set the addon project
+    ADDON_PROJECT="ARMv8"
+
+  # The TARGET_CPU variable controls which processor should be targeted for
+  # generated code; it is chosen per TARGET_ARCH below.
+    case $TARGET_ARCH in
+      aarch64)
+        TARGET_CPU="cortex-a76"
+        TARGET_CPU_FLAGS="+crc+crypto"
+        ;;
+      arm)
+        TARGET_KERNEL_ARCH="arm64"
+        TARGET_KERNEL_PATCH_ARCH="aarch64"
+        TARGET_FLOAT="hard"
+        # cortex-a72 caused issues in the past, so use a53
+        TARGET_CPU="cortex-a53"
+        TARGET_CPU_FLAGS="+crc+crypto"
+        TARGET_FPU="neon-fp-armv8"
+        ;;
+    esac
+
+  # Kernel target
+    KERNEL_TARGET="Image"
+
+  # debug tty path
+    DEBUG_TTY="/dev/ttyAMA10"
+
+  # extra kernel command line: serial console on ttyAMA10 at 115200 baud, plus tty0
+    EXTRA_CMDLINE="console=ttyAMA10,115200 console=tty0"
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0001-broadcom-cle-clif-common-simulator-add-7.1-version-o.patch b/projects/RPi/devices/RPi5/patches/mesa/0001-broadcom-cle-clif-common-simulator-add-7.1-version-o.patch
new file mode 100644
index 0000000000..ee9e032293
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0001-broadcom-cle-clif-common-simulator-add-7.1-version-o.patch
@@ -0,0 +1,332 @@
+From f62aa2640f92796ff5216da0a5d3c8f46a2855b4 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Mon, 26 Apr 2021 00:02:21 +0200
+Subject: [PATCH 001/142] broadcom(cle,clif,common,simulator): add 7.1 version
+ on the list of versions to build
+
+This adds 7.1 to the list of available V3D_VERSION, and first changes
+on the simulator needed to get it working.
+
+Note that we needed to touch all 4 of those codebases because that is
+required to use V3D_DEBUG=clif with the simulator, which is the
+easiest way to see which packets a vulkan program is using.
+
+About the simulator, this commit only handles the renaming of some
+registers. Any additional changes needed to get proper support for
+v71 will be handled in following commits.
+---
+ src/broadcom/cle/meson.build            |  3 +-
+ src/broadcom/cle/v3dx_pack.h            |  2 +
+ src/broadcom/clif/clif_private.h        |  2 +
+ src/broadcom/common/v3d_device_info.c   |  1 +
+ src/broadcom/common/v3d_macros.h        |  3 +
+ src/broadcom/meson.build                |  2 +-
+ src/broadcom/simulator/v3d_simulator.c  | 81 +++++++++++++++++++------
+ src/broadcom/simulator/v3d_simulator.h  |  5 ++
+ src/broadcom/simulator/v3dx_simulator.c | 31 ++++++++--
+ 9 files changed, 106 insertions(+), 24 deletions(-)
+
+diff --git a/src/broadcom/cle/meson.build b/src/broadcom/cle/meson.build
+index 31a0d5bfa94..8ac32b313e4 100644
+--- a/src/broadcom/cle/meson.build
++++ b/src/broadcom/cle/meson.build
+@@ -23,7 +23,8 @@ v3d_versions = [
+   [21, 21],
+   [33, 33],
+   [41, 33],
+-  [42, 33]
++  [42, 33],
++  [71, 33]
+ ]
+ 
+ v3d_xml_files = []
+diff --git a/src/broadcom/cle/v3dx_pack.h b/src/broadcom/cle/v3dx_pack.h
+index 5762e5aaa70..e5a1eb26698 100644
+--- a/src/broadcom/cle/v3dx_pack.h
++++ b/src/broadcom/cle/v3dx_pack.h
+@@ -37,6 +37,8 @@
+ #  include "cle/v3d_packet_v41_pack.h"
+ #elif (V3D_VERSION == 42)
+ #  include "cle/v3d_packet_v42_pack.h"
++#elif (V3D_VERSION == 71)
++#  include "cle/v3d_packet_v71_pack.h"
+ #else
+ #  error "Need to add a pack header include for this v3d version"
+ #endif
+diff --git a/src/broadcom/clif/clif_private.h b/src/broadcom/clif/clif_private.h
+index 6ace62b0310..cda407a00bf 100644
+--- a/src/broadcom/clif/clif_private.h
++++ b/src/broadcom/clif/clif_private.h
+@@ -101,6 +101,8 @@ bool v3d41_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
+                             const uint8_t *cl, uint32_t *size, bool reloc_mode);
+ bool v3d42_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
+                             const uint8_t *cl, uint32_t *size, bool reloc_mode);
++bool v3d71_clif_dump_packet(struct clif_dump *clif, uint32_t offset,
++                            const uint8_t *cl, uint32_t *size, bool reloc_mode);
+ 
+ static inline void
+ out(struct clif_dump *clif, const char *fmt, ...)
+diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
+index 272190eb2e5..7e0862f1f02 100644
+--- a/src/broadcom/common/v3d_device_info.c
++++ b/src/broadcom/common/v3d_device_info.c
+@@ -66,6 +66,7 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
+         case 33:
+         case 41:
+         case 42:
++        case 71:
+                 break;
+         default:
+                 fprintf(stderr,
+diff --git a/src/broadcom/common/v3d_macros.h b/src/broadcom/common/v3d_macros.h
+index fe89398208a..b4291fb5350 100644
+--- a/src/broadcom/common/v3d_macros.h
++++ b/src/broadcom/common/v3d_macros.h
+@@ -41,6 +41,9 @@
+ #elif (V3D_VERSION == 42)
+ #  define V3DX(x) V3D42_##x
+ #  define v3dX(x) v3d42_##x
++#elif (V3D_VERSION == 71)
++#  define V3DX(x) V3D71_##x
++#  define v3dX(x) v3d71_##x
+ #else
+ #  error "Need to add prefixing macros for this v3d version"
+ #endif
+diff --git a/src/broadcom/meson.build b/src/broadcom/meson.build
+index 2c10e46b188..73cb7aa0575 100644
+--- a/src/broadcom/meson.build
++++ b/src/broadcom/meson.build
+@@ -22,7 +22,7 @@ inc_broadcom = include_directories('.', 'cle')
+ 
+ subdir('cle')
+ 
+-v3d_versions = ['33', '41', '42']
++v3d_versions = ['33', '41', '42', '71']
+ v3d_libs = []
+ 
+ if with_gallium_v3d or with_broadcom_vk
+diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
+index eea5d3f050e..5cceb1a82cc 100644
+--- a/src/broadcom/simulator/v3d_simulator.c
++++ b/src/broadcom/simulator/v3d_simulator.c
+@@ -490,10 +490,20 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
+ 
+         v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
+ 
+-        if (sim_state.ver >= 41)
+-                v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+-        else
+-                v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
++        switch(sim_state.ver) {
++        case 33:
++           v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
++           break;
++        case 41:
++        case 42:
++           v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
++           break;
++        case 71:
++           v3d71_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
++           break;
++        default:
++           unreachable("Unsupported V3D version\n");
++        }
+ 
+         util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
+                               sim_bo) {
+@@ -635,10 +645,17 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
+ static int
+ v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
+ {
+-        if (sim_state.ver >= 41)
+-                return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
+-        else
++        switch(sim_state.ver) {
++        case 33:
+                 return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
++        case 41:
++        case 42:
++                return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
++        case 71:
++                return v3d71_simulator_get_param_ioctl(sim_state.v3d, args);
++        default:
++                unreachable("Unsupported V3D version\n");
++        }
+ }
+ 
+ static int
+@@ -652,10 +669,20 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
+         v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
+         v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
+ 
+-        if (sim_state.ver >= 41)
+-                ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+-        else
++        switch(sim_state.ver) {
++        case 33:
+                 ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
++                break;
++        case 41:
++        case 42:
++                ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
++                break;
++        case 71:
++                ret = v3d71_simulator_submit_tfu_ioctl(sim_state.v3d, args);
++                break;
++        default:
++                unreachable("Unsupported V3D version\n");
++        }
+ 
+         v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
+ 
+@@ -682,11 +709,19 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
+ 
+         v3d_simulator_perfmon_switch(fd, args->perfmon_id);
+ 
+-        if (sim_state.ver >= 41)
+-                ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
+-                                                       file->gmp->ofs);
+-        else
+-                ret = -1;
++        switch(sim_state.ver) {
++        case 41:
++        case 42:
++           ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
++                                                  file->gmp->ofs);
++           break;
++        case 71:
++           ret = v3d71_simulator_submit_csd_ioctl(sim_state.v3d, args,
++                                                  file->gmp->ofs);
++           break;
++        default:
++           ret = -1;
++        }
+ 
+         for (int i = 0; i < args->bo_handle_count; i++)
+                 v3d_simulator_copy_out_handle(file, bo_handles[i]);
+@@ -880,10 +915,20 @@ v3d_simulator_init_global()
+ 
+         util_dynarray_init(&sim_state.bin_oom, NULL);
+ 
+-        if (sim_state.ver >= 41)
+-                v3d41_simulator_init_regs(sim_state.v3d);
+-        else
++        switch(sim_state.ver) {
++        case 33:
+                 v3d33_simulator_init_regs(sim_state.v3d);
++                break;
++        case 41:
++        case 42:
++                v3d41_simulator_init_regs(sim_state.v3d);
++                break;
++        case 71:
++                v3d71_simulator_init_regs(sim_state.v3d);
++                break;
++        default:
++                unreachable("Not supported V3D version\n");
++        }
+ }
+ 
+ struct v3d_simulator_file *
+diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
+index ddb079c1455..1472c313a03 100644
+--- a/src/broadcom/simulator/v3d_simulator.h
++++ b/src/broadcom/simulator/v3d_simulator.h
+@@ -52,6 +52,11 @@ uint32_t v3d_simulator_get_mem_free(void);
+ #  define v3dX(x) v3d41_##x
+ #  include "v3dx_simulator.h"
+ #  undef v3dX
++
++#  define v3dX(x) v3d71_##x
++#  include "v3dx_simulator.h"
++#  undef v3dX
++
+ #endif
+ 
+ #endif
+diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
+index c9322f0397b..723796b16c9 100644
+--- a/src/broadcom/simulator/v3dx_simulator.c
++++ b/src/broadcom/simulator/v3dx_simulator.c
+@@ -46,11 +46,15 @@
+ 
+ #define HW_REGISTER_RO(x) (x)
+ #define HW_REGISTER_RW(x) (x)
+-#if V3D_VERSION >= 41
++#if V3D_VERSION == 71
++#include "libs/core/v3d/registers/7.1.5.1/v3d.h"
++#else
++#if V3D_VERSION == 41 || V3D_VERSION == 42
+ #include "libs/core/v3d/registers/4.1.35.0/v3d.h"
+ #else
+ #include "libs/core/v3d/registers/3.3.0.0/v3d.h"
+ #endif
++#endif
+ 
+ #define V3D_WRITE(reg, val) v3d_hw_write_reg(v3d, reg, val)
+ #define V3D_READ(reg) v3d_hw_read_reg(v3d, reg)
+@@ -310,16 +314,17 @@ v3d_isr_core(struct v3d_hw *v3d,
+                 return;
+         }
+ 
++#if V3D_VERSION <= 42
+         if (core_status & V3D_CTL_0_INT_STS_INT_GMPV_SET) {
+                 fprintf(stderr, "GMP violation at 0x%08x\n",
+                         V3D_READ(V3D_GMP_VIO_ADDR));
+-                abort();
+         } else {
+                 fprintf(stderr,
+                         "Unexpected ISR with core status 0x%08x\n",
+                         core_status);
+         }
+         abort();
++#endif
+ }
+ 
+ static void
+@@ -396,6 +401,18 @@ v3d_isr_hub(struct v3d_hw *v3d)
+         }
+ 
+         handle_mmu_interruptions(v3d, hub_status);
++
++#if V3D_VERSION == 71
++        if (hub_status & V3D_HUB_CTL_INT_STS_INT_GMPV_SET) {
++                fprintf(stderr, "GMP violation at 0x%08x\n",
++                        V3D_READ(V3D_GMP_VIO_ADDR));
++        } else {
++                fprintf(stderr,
++                        "Unexpected ISR with status 0x%08x\n",
++                        hub_status);
++        }
++        abort();
++#endif
+ }
+ 
+ static void
+@@ -436,8 +453,11 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
+          * for tracing. Perhaps we should evaluate to do the same here and add
+          * some debug options.
+          */
+-        uint32_t core_interrupts = (V3D_CTL_0_INT_STS_INT_GMPV_SET |
+-                                    V3D_CTL_0_INT_STS_INT_OUTOMEM_SET);
++        uint32_t core_interrupts = V3D_CTL_0_INT_STS_INT_OUTOMEM_SET;
++#if V3D_VERSION <= 42
++        core_interrupts |= V3D_CTL_0_INT_STS_INT_GMPV_SET;
++#endif
++
+         V3D_WRITE(V3D_CTL_0_INT_MSK_SET, ~core_interrupts);
+         V3D_WRITE(V3D_CTL_0_INT_MSK_CLR, core_interrupts);
+ 
+@@ -447,6 +467,9 @@ v3dX(simulator_init_regs)(struct v3d_hw *v3d)
+             V3D_HUB_CTL_INT_STS_INT_MMU_CAP_SET |  /* CAP exceeded */
+             V3D_HUB_CTL_INT_STS_INT_TFUC_SET); /* TFU conversion */
+ 
++#if V3D_VERSION == 71
++        hub_interrupts |= V3D_HUB_CTL_INT_STS_INT_GMPV_SET;
++#endif
+         V3D_WRITE(V3D_HUB_CTL_INT_MSK_SET, ~hub_interrupts);
+         V3D_WRITE(V3D_HUB_CTL_INT_MSK_CLR, hub_interrupts);
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0002-broadcom-simulator-reset-CFG7-for-compute-dispatch-i.patch b/projects/RPi/devices/RPi5/patches/mesa/0002-broadcom-simulator-reset-CFG7-for-compute-dispatch-i.patch
new file mode 100644
index 0000000000..5224359446
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0002-broadcom-simulator-reset-CFG7-for-compute-dispatch-i.patch
@@ -0,0 +1,30 @@
+From 9e85edd1b347b0e779b393f463f42044a720bcff Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 28 Sep 2021 13:16:49 +0200
+Subject: [PATCH 002/142] broadcom/simulator: reset CFG7 for compute dispatch
+ in v71
+
+This register is new in 7.x, it doesn't seem that we need to
+do anything specific for now, but let's make sure it is reset
+every time.
+---
+ src/broadcom/simulator/v3dx_simulator.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
+index 723796b16c9..f23b0538de3 100644
+--- a/src/broadcom/simulator/v3dx_simulator.c
++++ b/src/broadcom/simulator/v3dx_simulator.c
+@@ -227,6 +227,9 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
+         V3D_WRITE(V3D_CSD_0_QUEUED_CFG4, args->cfg[4]);
+         V3D_WRITE(V3D_CSD_0_QUEUED_CFG5, args->cfg[5]);
+         V3D_WRITE(V3D_CSD_0_QUEUED_CFG6, args->cfg[6]);
++#if V3D_VERSION >= 71
++        V3D_WRITE(V3D_CSD_0_QUEUED_CFG7, 0);
++#endif
+         /* CFG0 kicks off the job */
+         V3D_WRITE(V3D_CSD_0_QUEUED_CFG0, args->cfg[0]);
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0003-broadcom-cle-update-the-packet-definitions-for-new-g.patch b/projects/RPi/devices/RPi5/patches/mesa/0003-broadcom-cle-update-the-packet-definitions-for-new-g.patch
new file mode 100644
index 0000000000..80190c0aef
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0003-broadcom-cle-update-the-packet-definitions-for-new-g.patch
@@ -0,0 +1,712 @@
+From 6f744bc4bec98f9769486d427e8e2d4e314ae056 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 29 Jun 2021 12:03:24 +0200
+Subject: [PATCH 003/142] broadcom/cle: update the packet definitions for new
+ generation v71
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Using the spec for 7.1.5 as reference. This includes totally new
+packets, and redefines some that already existed on v42.
+
+Full list:
+ * Add Depth Bounds Test Limits
+ * Redefine Tile Binning Mode Cfg
+ * Redefine Cfg Bits. There are some changes on the fields:
+   * Line Rasterization is now 1 bit size
+   * Depth Bounds Enable (that takes one of the bits of Line Rasterization)
+   * Early-Z/Early-Z updates enable bits (16-17) figure now as reserved.
+   * New Z-Clipping mode field
+ * Redefine Tile Rendering Mode Cfg (Common). Changes with respect to v42:
+   * New log2 tile height/width fields starting at bit 52/55
+   * Due to those two new fields, the end pad is smaller
+   * sub-id has now a size of 3. Bit 4 is reserved.
+   * Number of render targets: this field max value is now 7 (not
+     reflected on the xml).
+   * Maximum BPP is removed on v71 (now bits 40-41 are reserved)
+   * Depth Buffer disable: on bit 44
+ * Update Store Tile Buffer General
+ * Adding Cfg Render Target Part1/2/3 packets: they replace v4X "Tile
+   Rendering Mode Cfg (Color)" (real name "Rendering Configuration
+   (Render Targets Config)"), "Tile Rendering Mode Cfg (Clear Colors
+   Part1)", "Tile Rendering Mode Cfg (Clear Colors Part2)", and "Tile
+   Rendering Mode Cfg (Clear Colors Part3)". On those old versions,
+   the first packet is used to configure 4 render targets. Now that 8
+   are supported, individual per-render-target packets are used.
+ * Update ZS clear values packet.
+ * Add new v71 output formats
+ * Define Clear Render Targets (Replaces Clear Tile Buffers from v42)
+ * Redefine GL Shader State Record. Changes compared with v42:
+   * Fields removed:
+     * "Coordinate shader has separate input and output VPM blocks"
+       (reserved bit now)
+     * "Vertex shader has separate input and output VPM blocks"
+       (reserved bit now)
+     * "Address of table of default attribute Values." (we needed to
+       change the start position for all the following fields)
+   * New field:
+     * "Never defer FEP depth writes to fragment shader auto Z writes
+        on scoreboard conflict"
+ * Redefine clipper xy scaling: Now it uses 1/64ths of pixels, instead
+   of 1/256ths
+ * Update texture shader state.
+   * Notice we don't use an address type for these fields in the XML
+     description. This is because the addresses are 64-bit aligned
+     (even though the PRM doesn't say it) which means the 6 LSB bits
+     are implicitly 0, but the fields are encoded before the 6th bit
+     of their starting byte, so we can't use the usual trick we do
+     with address types where the first 6 bits in the byte are
+     implicitly overwritten by other fields and we have to encode this
+     manually as a uint field. This would mean that if we had an
+     actual BO we would also need to add it manually to the job's
+     list, but since we don't have one, we don't have to do anything
+     about it.
+   * Add new RB_Swap field for texture shader state
+   * Document Cb/Cr addresses as uint fields in texture shader state
+ * Fixup Blend Config description: we now support 8 RTs.
+ * TMU config parameter 2 has new fields
+ * Add new clipper Z without guardband packet in v71
+ * Add enums for the Z clip modes accepted in v71
+ * Fix texture state array stride packing for V3D 7.1.5
+
+Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
+Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
+
+broadcom/cle: rb_swap
+---
+ src/broadcom/cle/v3d_packet_v33.xml | 386 ++++++++++++++++++++++++++--
+ 1 file changed, 368 insertions(+), 18 deletions(-)
+
+diff --git a/src/broadcom/cle/v3d_packet_v33.xml b/src/broadcom/cle/v3d_packet_v33.xml
+index a0242b5f1c2..624353ca2bf 100644
+--- a/src/broadcom/cle/v3d_packet_v33.xml
++++ b/src/broadcom/cle/v3d_packet_v33.xml
+@@ -1,4 +1,4 @@
+-<vcxml gen="3.3" min_ver="33" max_ver="42">
++<vcxml gen="3.3" min_ver="33" max_ver="71">
+ 
+   <enum name="Compare Function" prefix="V3D_COMPARE_FUNC">
+     <value name="NEVER" value="0"/>
+@@ -167,13 +167,36 @@
+     <value name="depth_16" value="2"/>
+   </enum>
+ 
+-  <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41">
++  <enum name="Render Target Clamp" prefix="V3D_RENDER_TARGET_CLAMP" min_ver="41" max_ver="42">
+     <value name="none" value="0"/> <!-- no clamping -->
+     <value name="norm" value="1"/> <!-- [0,1] for f16 -->
+     <value name="pos" value="2"/> <!-- [0, for f16 -->
+     <value name="int" value="3" min_ver="42"/> <!-- clamp to integer RT's range -->
+   </enum>
+ 
++  <enum name="Render Target Type Clamp" prefix="V3D_RENDER_TARGET_TYPE_CLAMP" min_ver="71">
++    <value name="8i"             value="0"/>  <!-- no clamping -->
++    <value name="16i"            value="1"/>  <!-- no clamping -->
++    <value name="32i"            value="2"/>  <!-- no clamping -->
++    <value name="8ui"            value="4"/>  <!-- no clamping -->
++    <value name="16ui"           value="5"/>  <!-- no clamping -->
++    <value name="32ui"           value="6"/>  <!-- no clamping -->
++    <value name="8"              value="8"/>  <!-- no clamping -->
++    <value name="16f"            value="9"/>  <!-- no clamping -->
++    <value name="32f"            value="10"/> <!-- no clamping -->
++    <value name="8i_clamped"     value="16"/> <!-- clamp to integer RT's range -->
++    <value name="16i_clamped"    value="17"/> <!-- clamp to integer RT's range -->
++    <value name="32i_clamped"    value="18"/> <!-- clamp to integer RT's range -->
++    <value name="8ui_clamped"    value="20"/> <!-- clamp to integer RT's range -->
++    <value name="16ui_clamped"   value="21"/> <!-- clamp to integer RT's range -->
++    <value name="32ui_clamped"   value="22"/> <!-- clamp to integer RT's range -->
++    <value name="16f_clamp_norm" value="24"/> <!-- [0,1] for f16 -->
++    <value name="16f_clamp_pos"  value="25"/> <!-- [0, for f16 -->
++    <value name="16f_clamp_pq"   value="26"/> <!-- PQ lin range, colour to [0, 125], alpha to [0, 1] for f16 -->
++    <value name="16f_clamp_hlg"  value="27"/> <!-- HLG lin range, colour to [0, 12], alpha to [0, 1] for f16 -->
++    <value name="invalid"        value="32"/>
++  </enum>
++
+   <!---
+     CL cache flush commands are not fully documented and subject to a
+     number of hardware issues that make them unreliable. Specifically:
+@@ -263,13 +286,27 @@
+     <value name="r8ui"     value="36"/>
+     <value name="srgbx8"   value="37" max_ver="33"/>
+     <value name="rgbx8"    value="38" max_ver="33"/>
+-    <value name="bstc"     value="39" min_ver="41"/>
++    <value name="bstc8"    value="39" min_ver="41"/>
+     <value name="d32f"     value="40" min_ver="41"/>
+     <value name="d24"      value="41" min_ver="41"/>
+     <value name="d16"      value="42" min_ver="41"/>
+     <value name="d24s8"    value="43" min_ver="41"/>
+     <value name="s8"       value="44" min_ver="41"/>
+     <value name="rgba5551" value="45" min_ver="41"/>
++    <value name="bstc8_srgb"          value="46" min_ver="71"/>
++    <value name="bstc10"              value="47" min_ver="71"/>
++    <value name="bstc10_srgb"         value="48" min_ver="71"/>
++    <value name="bstc10_pq"           value="49" min_ver="71"/>
++    <value name="rgba10x6"            value="50" min_ver="71"/>
++    <value name="bstc10_hlg"          value="55" min_ver="71"/>
++    <value name="rgba10x6_hlg"        value="56" min_ver="71"/>
++    <value name="rgb10_a2_hlg"        value="57" min_ver="71"/>
++    <value name="bstc10_pq_bt1886"    value="58" min_ver="71"/>
++    <value name="rgba10x6_pq_bt1886"  value="59" min_ver="71"/>
++    <value name="rgb10_a2_pq_bt1886"  value="60" min_ver="71"/>
++    <value name="bstc10_hlg_bt1886"   value="61" min_ver="71"/>
++    <value name="rgba10x6_hlg_bt1886" value="62" min_ver="71"/>
++    <value name="rgb10_a2_hlg_bt1886" value="63" min_ver="71"/>
+   </enum>
+ 
+   <enum name="Z/S Output Image Format" prefix="V3D_OUTPUT_IMAGE_FORMAT_ZS" max_ver="33">
+@@ -314,6 +351,12 @@
+     <value name="perp end caps" value="1"/>
+   </enum>
+ 
++  <enum name="Z Clip Mode" prefix="V3D_Z_CLIP_MODE">
++    <value name="NONE" value="0"/>
++    <value name="MIN_ONE_TO_ONE" value="1"/>
++    <value name="ZERO_TO_ONE" value="2"/>
++  </enum>
++
+   <packet code="0" name="Halt"/>
+   <packet code="1" name="NOP"/>
+   <packet code="4" name="Flush"/>
+@@ -381,11 +424,13 @@
+     <field name="Last Tile of Frame" size="1" start="0" type="bool"/>
+   </packet>
+ 
+-  <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41">
++  <packet code="25" shortname="clear" name="Clear Tile Buffers" cl="R" min_ver="41" max_ver="42">
+     <field name="Clear Z/Stencil Buffer" size="1" start="1" type="bool"/>
+     <field name="Clear all Render Targets" size="1" start="0" type="bool"/>
+   </packet>
+ 
++  <packet code="25" shortname="clear_rt" name="Clear Render Targets" cl="R" min_ver="71"/>
++
+   <packet code="26" shortname="load" name="Reload Tile Color Buffer" cl="R" max_ver="33">
+     <field name="Disable Color Buffer load" size="8" start="8" type="uint"/>
+     <field name="Enable Z load" size="1" start="7" type="bool"/>
+@@ -443,6 +488,10 @@
+       <value name="Render target 1" value="1"/>
+       <value name="Render target 2" value="2"/>
+       <value name="Render target 3" value="3"/>
++      <value name="Render target 4" value="4" min_ver="71"/>
++      <value name="Render target 5" value="5" min_ver="71"/>
++      <value name="Render target 6" value="6" min_ver="71"/>
++      <value name="Render target 7" value="7" min_ver="71"/>
+       <value name="None" value="8"/>
+       <value name="Z" value="9"/>
+       <value name="Stencil" value="10"/>
+@@ -789,7 +838,7 @@
+     <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
+   </packet>
+ 
+-  <packet code="84" name="Blend Cfg" min_ver="41">
++  <packet code="84" name="Blend Cfg" min_ver="41" max_ver="42">
+     <field name="Render Target Mask" size="4" start="24" type="uint"/>
+     <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
+     <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
+@@ -799,6 +848,16 @@
+     <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
+   </packet>
+ 
++  <packet code="84" name="Blend Cfg" min_ver="71">
++    <field name="Render Target Mask" size="8" start="24" type="uint"/>
++    <field name="Color blend dst factor" size="4" start="20" type="Blend Factor"/>
++    <field name="Color blend src factor" size="4" start="16" type="Blend Factor"/>
++    <field name="Color blend mode" size="4" start="12" type="Blend Mode"/>
++    <field name="Alpha blend dst factor" size="4" start="8" type="Blend Factor"/>
++    <field name="Alpha blend src factor" size="4" start="4" type="Blend Factor"/>
++    <field name="Alpha blend mode" size="4" start="0" type="Blend Mode"/>
++  </packet>
++
+   <packet code="86" shortname="blend_ccolor" name="Blend Constant Color">
+     <field name="Alpha (F16)" size="16" start="48" type="uint"/>
+     <field name="Blue (F16)" size="16" start="32" type="uint"/>
+@@ -828,7 +887,12 @@
+     <field name="address" size="32" start="0" type="address"/>
+   </packet>
+ 
+-  <packet code="96" name="Cfg Bits">
++  <packet code="93" name="Depth Bounds Test Limits" min_ver="71">
++    <field name="Lower Test Limit" size="32" start="0" type="float"/>
++    <field name="Upper Test Limit" size="32" start="32" type="float"/>
++  </packet>
++
++  <packet code="96" name="Cfg Bits" max_ver="42">
+     <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
+     <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
+     <field name="Blend enable" size="1" start="19" type="bool"/>
+@@ -846,6 +910,25 @@
+     <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
+   </packet>
+ 
++  <packet code="96" name="Cfg Bits" min_ver="71">
++    <field name="Z Clipping mode" size="2" start="22" type="Z Clip Mode"/>
++    <field name="Direct3D Provoking Vertex" size="1" start="21" type="bool"/>
++    <field name="Direct3D 'Point-fill' mode" size="1" start="20" type="bool"/>
++    <field name="Blend enable" size="1" start="19" type="bool"/>
++    <field name="Stencil enable" size="1" start="18" type="bool"/>
++    <field name="Z updates enable" size="1" start="15" type="bool"/>
++    <field name="Depth-Test Function" size="3" start="12" type="Compare Function"/>
++    <field name="Direct3D Wireframe triangles mode" size="1" start="11" type="bool"/>
++    <field name="Z Clamp Mode" size="1" start="10" type="bool"/>
++    <field name="Rasterizer Oversample Mode" size="2" start="6" type="uint"/>
++    <field name="Depth Bounds Test Enable" size="1" start="5" type="bool"/>
++    <field name="Line Rasterization" size="1" start="4" type="uint"/>
++    <field name="Enable Depth Offset" size="1" start="3" type="bool"/>
++    <field name="Clockwise Primitives" size="1" start="2" type="bool"/>
++    <field name="Enable Reverse Facing Primitive" size="1" start="1" type="bool"/>
++    <field name="Enable Forward Facing Primitive" size="1" start="0" type="bool"/>
++  </packet>
++
+   <packet code="97" shortname="zero_all_flatshade_flags" name="Zero All Flat Shade Flags"/>
+ 
+   <packet code="98" shortname="flatshade_flags" name="Flat Shade Flags">
+@@ -907,16 +990,26 @@
+     <field name="Minimum Zw" size="32" start="0" type="float"/>
+   </packet>
+ 
+-  <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B">
++  <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" max_ver="42">
+     <field name="Viewport Half-Height in 1/256th of pixel" size="32" start="32" type="float"/>
+     <field name="Viewport Half-Width in 1/256th of pixel" size="32" start="0" type="float"/>
+   </packet>
+ 
++  <packet shortname="clipper_xy" name="Clipper XY Scaling" code="110" cl="B" min_ver="71">
++    <field name="Viewport Half-Height in 1/64th of pixel" size="32" start="32" type="float"/>
++    <field name="Viewport Half-Width in 1/64th of pixel" size="32" start="0" type="float"/>
++  </packet>
++
+   <packet shortname="clipper_z" name="Clipper Z Scale and Offset" code="111" cl="B">
+     <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
+     <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
+   </packet>
+ 
++  <packet shortname="clipper_z_no_guardband" name="Clipper Z Scale and Offset no guardband" code="112" cl="B" min_ver="71">
++    <field name="Viewport Z Offset (Zc to Zs)" size="32" start="32" type="float"/>
++    <field name="Viewport Z Scale (Zc to Zs)" size="32" start="0" type="float"/>
++  </packet>
++
+   <packet name="Number of Layers" code="119" min_ver="41">
+     <field name="Number of Layers" size="8" start="0" type="uint" minus_one="true"/>
+   </packet>
+@@ -947,7 +1040,7 @@
+     <field name="sub-id" size="1" start="0" type="uint" default="0"/>
+   </packet>
+ 
+-  <packet code="120" name="Tile Binning Mode Cfg" min_ver="41">
++  <packet code="120" name="Tile Binning Mode Cfg" min_ver="41" max_ver="42">
+ 
+     <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
+     <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
+@@ -971,6 +1064,35 @@
+     </field>
+   </packet>
+ 
++  <packet code="120" name="Tile Binning Mode Cfg" min_ver="71">
++    <field name="Height (in pixels)" size="16" start="48" type="uint" minus_one="true"/>
++    <field name="Width (in pixels)" size="16" start="32" type="uint" minus_one="true"/>
++
++    <field name="Log2 Tile Height" size="3" start="11" type="uint">
++      <value name="tile height 8 pixels" value="0"/>
++      <value name="tile height 16 pixels" value="1"/>
++      <value name="tile height 32 pixels" value="2"/>
++      <value name="tile height 64 pixels" value="3"/>
++    </field>
++    <field name="Log2 Tile Width"  size="3" start="8" type="uint">
++      <value name="tile width 8 pixels" value="0"/>
++      <value name="tile width 16 pixels" value="1"/>
++      <value name="tile width 32 pixels" value="2"/>
++      <value name="tile width 64 pixels" value="3"/>
++    </field>
++
++    <field name="tile allocation block size" size="2" start="4" type="uint">
++      <value name="tile allocation block size 64b" value="0"/>
++      <value name="tile allocation block size 128b" value="1"/>
++      <value name="tile allocation block size 256b" value="2"/>
++    </field>
++    <field name="tile allocation initial block size" size="2" start="2" type="uint">
++      <value name="tile allocation initial block size 64b" value="0"/>
++      <value name="tile allocation initial block size 128b" value="1"/>
++      <value name="tile allocation initial block size 256b" value="2"/>
++    </field>
++  </packet>
++
+   <packet code="120" name="Tile Binning Mode Cfg (Part2)" cl="B" max_ver="33">
+     <field name="Tile Allocation Memory Address" size="32" start="32" type="address"/>
+     <field name="Tile Allocation Memory Size" size="32" start="0" type="uint"/>
+@@ -1002,7 +1124,7 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="0"/>
+   </packet>
+ 
+-  <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41">
++  <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="41" max_ver="42">
+     <field name="Pad" size="12" start="52" type="uint"/>
+ 
+     <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
+@@ -1018,7 +1140,11 @@
+     <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
+     <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
+ 
+-    <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP"/>
++    <field name="Maximum BPP of all render targets" size="2" start="40" type="Internal BPP">
++      <value name="Render target maximum 32bpp" value="0"/>
++      <value name="Render target maximum 64bpp" value="1"/>
++      <value name="Render target maximum 128bpp" value="2"/>
++    </field>
+ 
+     <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
+     <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
+@@ -1027,6 +1153,43 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="0"/>
+   </packet>
+ 
++  <packet code="121" name="Tile Rendering Mode Cfg (Common)" cl="R" min_ver="71">
++    <field name="Pad" size="6" start="58" type="uint"/>
++
++    <field name="Log2 Tile Height" size="3" start="55" type="uint">
++      <value name="tile height 8 pixels" value="0"/>
++      <value name="tile height 16 pixels" value="1"/>
++      <value name="tile height 32 pixels" value="2"/>
++      <value name="tile height 64 pixels" value="3"/>
++    </field>
++    <field name="Log2 Tile Width"  size="3" start="52" type="uint">
++      <value name="tile width 8 pixels" value="0"/>
++      <value name="tile width 16 pixels" value="1"/>
++      <value name="tile width 32 pixels" value="2"/>
++      <value name="tile width 64 pixels" value="3"/>
++    </field>
++
++    <field name="Early Depth/Stencil Clear" size="1" start="51" type="bool"/>
++    <field name="Internal Depth Type" size="4" start="47" type="Internal Depth Type"/>
++
++    <field name="Early-Z disable" size="1" start="46" type="bool"/>
++
++    <field name="Early-Z Test and Update Direction" size="1" start="45" type="uint">
++      <value name="Early-Z direction LT/LE" value="0"/>
++      <value name="Early-Z direction GT/GE" value="1"/>
++    </field>
++
++    <field name="Depth-buffer disable" size="1" start="44" type="bool"/>
++    <field name="Double-buffer in non-ms mode" size="1" start="43" type="bool"/>
++    <field name="Multisample Mode (4x)" size="1" start="42" type="bool"/>
++
++    <field name="Image Height (pixels)" size="16" start="24" type="uint"/>
++    <field name="Image Width (pixels)" size="16" start="8" type="uint"/>
++    <field name="Number of Render Targets" size="4" start="4" type="uint" minus_one="true"/>
++
++    <field name="sub-id" size="3" start="0" type="uint" default="0"/>
++  </packet>
++
+   <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" max_ver="33">
+     <field name="Address" size="32" start="32" type="address"/>
+ 
+@@ -1048,7 +1211,8 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="2"/>
+   </packet>
+ 
+-  <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41">
++  <!-- On 4.1 the real name would be "Tile Rendering Mode Cfg (Render Target Configs)" -->
++  <packet code="121" name="Tile Rendering Mode Cfg (Color)" cl="R" min_ver="41" max_ver="42">
+ 
+     <field name="Pad" size="28" start="36" type="uint"/>
+ 
+@@ -1099,7 +1263,7 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="3"/>
+   </packet>
+ 
+-  <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41">
++  <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="41" max_ver="42">
+     <field name="unused" size="16" start="48" type="uint"/>
+ 
+     <field name="Z Clear Value" size="32" start="16" type="float"/>
+@@ -1108,6 +1272,15 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="2"/>
+   </packet>
+ 
++  <packet code="121" name="Tile Rendering Mode Cfg (ZS Clear Values)" cl="R" min_ver="71">
++    <field name="unused" size="16" start="48" type="uint"/>
++
++    <field name="Z Clear Value" size="32" start="16" type="float"/>
++
++    <field name="Stencil Clear Value" size="8" start="8" type="uint"/>
++    <field name="sub-id" size="4" start="0" type="uint" default="1"/>
++  </packet>
++
+   <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" max_ver="33">
+     <!-- Express this as a 56-bit field? -->
+     <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
+@@ -1117,7 +1290,7 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="4"/>
+   </packet>
+ 
+-  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41">
++  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part1)" cl="R" min_ver="41" max_ver="42">
+     <!-- Express this as a 56-bit field? -->
+     <field name="Clear Color next 24 bits" size="24" start="40" type="uint"/>
+     <field name="Clear Color low 32 bits" size="32" start="8" type="uint"/>
+@@ -1126,6 +1299,19 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="3"/>
+   </packet>
+ 
++  <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part1)" cl="R" min_ver="71">
++
++    <field name="Clear Color low bits" size="32" start="32" type="uint"/>
++    <field name="Internal Type and Clamping" size="5" start="27" type="Render Target Type Clamp"/>
++    <field name="Internal BPP" size="2" start="25" type="Internal BPP"/>
++
++    <field name="Stride" size="7" start="18" type="uint" minus_one="true"/>
++    <!-- In multiples of 512 bits -->
++    <field name="Base Address" size="11" start="7" type="uint"/>
++    <field name="Render Target number" size="3" start="3" type="uint"/>
++    <field name="sub-id" size="3" start="0" type="uint" default="2"/>
++  </packet>
++
+   <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" max_ver="33">
+     <!-- Express this as a 56-bit field? -->
+     <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
+@@ -1135,7 +1321,7 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="5"/>
+   </packet>
+ 
+-  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41">
++  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part2)" cl="R" min_ver="41" max_ver="42">
+     <!-- Express this as a 56-bit field? -->
+     <field name="Clear Color mid-high 24 bits" size="24" start="40" type="uint"/>
+     <field name="Clear Color mid-low 32 bits" size="32" start="8" type="uint"/>
+@@ -1144,6 +1330,13 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="4"/>
+   </packet>
+ 
++  <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part2)" cl="R" min_ver="71">
++    <field name="Clear Color mid bits" size="40" start="24" type="uint"/>
++
++    <field name="Render Target number" size="3" start="3" type="uint"/>
++    <field name="sub-id" size="3" start="0" type="uint" default="3"/>
++  </packet>
++
+   <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" max_ver="33">
+     <field name="pad" size="11" start="53" type="uint"/>
+     <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
+@@ -1155,7 +1348,7 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="6"/>
+   </packet>
+ 
+-  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41">
++  <packet code="121" name="Tile Rendering Mode Cfg (Clear Colors Part3)" cl="R" min_ver="41" max_ver="42">
+     <field name="pad" size="11" start="53" type="uint"/>
+     <field name="UIF padded height in UIF blocks" size="13" start="40" type="uint"/>
+     <!-- image height is for Y flipping -->
+@@ -1166,6 +1359,13 @@
+     <field name="sub-id" size="4" start="0" type="uint" default="5"/>
+   </packet>
+ 
++  <packet code="121" name="Tile Rendering Mode Cfg (Render Target Part3)" cl="R" min_ver="71">
++    <field name="Clear Color top bits" size="56" start="8" type="uint"/>
++
++    <field name="Render Target number" size="3" start="3" type="uint"/>
++    <field name="sub-id" size="3" start="0" type="uint" default="4"/>
++  </packet>
++
+   <packet code="124" shortname="tile_coords" name="Tile Coordinates">
+     <field name="tile row number" size="12" start="12" type="uint"/>
+     <field name="tile column number" size="12" start="0" type="uint"/>
+@@ -1240,7 +1440,7 @@
+     <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
+   </struct>
+ 
+-  <struct name="GL Shader State Record" min_ver="41">
++  <struct name="GL Shader State Record" min_ver="41" max_ver="42">
+     <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
+     <field name="Enable clipping" size="1" start="1" type="bool"/>
+ 
+@@ -1299,6 +1499,63 @@
+     <field name="Coordinate Shader Uniforms Address" size="32" start="32b" type="address"/>
+   </struct>
+ 
++  <struct name="GL Shader State Record" min_ver="71">
++    <field name="Point size in shaded vertex data" size="1" start="0" type="bool"/>
++    <field name="Enable clipping" size="1" start="1" type="bool"/>
++
++    <field name="Vertex ID read by coordinate shader" size="1" start="2" type="bool"/>
++    <field name="Instance ID read by coordinate shader" size="1" start="3" type="bool"/>
++    <field name="Base Instance ID read by coordinate shader" size="1" start="4" type="bool"/>
++    <field name="Vertex ID read by vertex shader" size="1" start="5" type="bool"/>
++    <field name="Instance ID read by vertex shader" size="1" start="6" type="bool"/>
++    <field name="Base Instance ID read by vertex shader" size="1" start="7" type="bool"/>
++
++    <field name="Fragment shader does Z writes" size="1" start="8" type="bool"/>
++    <field name="Turn off early-z test" size="1" start="9" type="bool"/>
++
++    <field name="Fragment shader uses real pixel centre W in addition to centroid W2" size="1" start="12" type="bool"/>
++    <field name="Enable Sample Rate Shading" size="1" start="13" type="bool"/>
++    <field name="Any shader reads hardware-written Primitive ID" size="1" start="14" type="bool"/>
++    <field name="Insert Primitive ID as first varying to fragment shader" size="1" start="15" type="bool"/>
++    <field name="Turn off scoreboard" size="1" start="16" type="bool"/>
++    <field name="Do scoreboard wait on first thread switch" size="1" start="17" type="bool"/>
++    <field name="Disable implicit point/line varyings" size="1" start="18" type="bool"/>
++    <field name="No prim pack" size="1" start="19" type="bool"/>
++    <field name="Never defer FEP depth writes" size="1" start="20" type="bool"/>
++
++    <field name="Number of varyings in Fragment Shader" size="8" start="3b" type="uint"/>
++
++    <field name="Coordinate Shader output VPM segment size" size="4" start="4b" type="uint"/>
++    <field name="Min Coord Shader output segments required in play in addition to VCM cache size" size="4" start="36" type="uint"/>
++
++    <field name="Coordinate Shader input VPM segment size" size="4" start="5b" type="uint"/>
++    <field name="Min Coord Shader input segments required in play" size="4" start="44" type="uint" minus_one="true"/>
++
++    <field name="Vertex Shader output VPM segment size" size="4" start="6b" type="uint"/>
++    <field name="Min Vertex Shader output segments required in play in addition to VCM cache size" size="4" start="52" type="uint"/>
++
++    <field name="Vertex Shader input VPM segment size" size="4" start="7b" type="uint"/>
++    <field name="Min Vertex Shader input segments required in play" size="4" start="60" type="uint" minus_one="true"/>
++
++    <field name="Fragment Shader Code Address" size="29" start="67" type="address"/>
++    <field name="Fragment Shader 4-way threadable" size="1" start="64" type="bool"/>
++    <field name="Fragment Shader start in final thread section" size="1" start="65" type="bool"/>
++    <field name="Fragment Shader Propagate NaNs" size="1" start="66" type="bool"/>
++    <field name="Fragment Shader Uniforms Address" size="32" start="12b" type="address"/>
++
++    <field name="Vertex Shader Code Address" size="29" start="131" type="address"/>
++    <field name="Vertex Shader 4-way threadable" size="1" start="128" type="bool"/>
++    <field name="Vertex Shader start in final thread section" size="1" start="129" type="bool"/>
++    <field name="Vertex Shader Propagate NaNs" size="1" start="130" type="bool"/>
++    <field name="Vertex Shader Uniforms Address" size="32" start="20b" type="address"/>
++
++    <field name="Coordinate Shader Code Address" size="29" start="195" type="address"/>
++    <field name="Coordinate Shader 4-way threadable" size="1" start="192" type="bool"/>
++    <field name="Coordinate Shader start in final thread section" size="1" start="193" type="bool"/>
++    <field name="Coordinate Shader Propagate NaNs" size="1" start="194" type="bool"/>
++    <field name="Coordinate Shader Uniforms Address" size="32" start="28b" type="address"/>
++  </struct>
++
+   <struct name="Geometry Shader State Record" min_ver="41">
+     <field name="Geometry Bin Mode Shader Code Address" size="29" start="3" type="address"/>
+     <field name="Geometry Bin Mode Shader 4-way threadable" size="1" start="0" type="bool"/>
+@@ -1543,7 +1800,7 @@
+     <field name="Offset Format 8" size="1" start="0" type="bool"/>
+   </struct>
+ 
+-  <struct name="TMU Config Parameter 2" min_ver="42">
++  <struct name="TMU Config Parameter 2" min_ver="42" max_ver="42">
+     <field name="Pad" size="7" start="25" type="uint"/>
+     <field name="LOD Query" size="1" start="24" type="bool"/>
+     <field name="Op" size="4" start="20" type="TMU Op"/>
+@@ -1558,6 +1815,23 @@
+     <field name="Offset Format 8" size="1" start="0" type="bool"/>
+   </struct>
+ 
++  <struct name="TMU Config Parameter 2" min_ver="71">
++    <field name="Pad" size="5" start="27" type="uint"/>
++    <field name="Write conversion" size="1" start="26" type="bool"/>
++    <field name="DIM query" size="1" start="25" type="bool"/>
++    <field name="LOD Query" size="1" start="24" type="bool"/>
++    <field name="Op" size="4" start="20" type="TMU Op"/>
++    <field name="Offset R" size="4" start="16" type="int"/>
++    <field name="Offset T" size="4" start="12" type="int"/>
++    <field name="Offset S" size="4" start="8" type="int"/>
++    <field name="Gather Mode" size="1" start="7" type="bool"/>
++    <field name="Gather Component" size="2" start="5" type="uint"/>
++    <field name="Coefficient Mode" size="1" start="4" type="bool"/>
++    <field name="Sample Number" size="2" start="2" type="uint"/>
++    <field name="Disable AutoLOD" size="1" start="1" type="bool"/>
++    <field name="Offset Format 8" size="1" start="0" type="bool"/>
++  </struct>
++
+   <struct name="Texture Shader State" max_ver="33">
+     <field name="UIF XOR disable" size="1" start="255" type="bool"/>
+     <field name="Level 0 is strictly UIF" size="1" start="254" type="bool"/>
+@@ -1611,7 +1885,7 @@
+     <field name="Filter" size="4" start="0" type="TMU Filter"/>
+   </struct>
+ 
+-  <struct name="Texture Shader State" min_ver="41">
++  <struct name="Texture Shader State" min_ver="41" max_ver="42">
+     <field name="Pad" size="56" start="136" type="uint"/>
+     <field name="UIF XOR disable" size="1" start="135" type="bool"/>
+     <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
+@@ -1652,6 +1926,82 @@
+     <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
+   </struct>
+ 
++  <struct name="Texture Shader State" min_ver="71">
++    <field name="Pad" size="2" start="190" type="uint"/>
++    <!-- When we use an address type, there is an implicit requirement
++         that the address is a 32-bit that is encoded starting at a 32-bit
++         aligned bit offset into the packet. If the address field has less than
++         32 bits, it is assumed that the address is aligned. For example, a
++         26-bit address field is expected to be 64-byte aligned (6 lsb bits
++         are 0) and that this will be encoded into a packet starting at bit
++         offset 6 into a 32-bit dword (since bits 0..5 of the address are
++         implicitly 0 and don't need to be explicitly encoded).
++
++         Unfortunately, the CB address below doesn't match this requirement:
++         it starts at bit 138, which is 10 bits into a 32-bit dword, but it
++         represents a 64-byte aligned address (6 lsb bits are 0), so we cannot
++         encode it as an address type. To fix this we encode these addresses
++         as uint types which has two implications:
++         1. the driver is responsible for manually adding the buffer objects
++            for these addresses to the job BO list.
++         2. the driver needs to pass an actual 26-bit address value by manually
++            shifting the 6 lsb bits (that are implicitly 0).
++    -->
++    <field name="texture_base pointer_Cr" size="26" start="164" type="uint"/>
++    <field name="texture base pointer Cb" size="26" start="138" type="uint"/>
++    <field name="Chroma offset y" size="1" start="137" type="uint"/>
++    <field name="Chroma offset x" size="1" start="136" type="uint"/>
++
++    <field name="UIF XOR disable" size="1" start="135" type="bool"/>
++    <field name="Level 0 is strictly UIF" size="1" start="134" type="bool"/>
++    <field name="Level 0 XOR enable" size="1" start="132" type="bool"/>
++    <field name="Level 0 UB_PAD" size="4" start="128" type="uint"/>
++
++    <field name="Base Level" size="4" start="124" type="uint"/>
++    <field name="Max Level" size="4" start="120" type="uint"/>
++
++    <field name="Swizzle A" size="3" start="117" type="uint">
++      <value name="Swizzle Zero" value="0"/>
++      <value name="Swizzle One" value="1"/>
++      <value name="Swizzle Red" value="2"/>
++      <value name="Swizzle Green" value="3"/>
++      <value name="Swizzle Blue" value="4"/>
++      <value name="Swizzle Alpha" value="5"/>
++    </field>
++
++    <field name="Swizzle B" size="3" start="114" type="uint"/>
++    <field name="Swizzle G" size="3" start="111" type="uint"/>
++    <field name="Swizzle R" size="3" start="108" type="uint"/>
++    <field name="Extended" size="1" start="107" type="bool"/>
++
++    <field name="Texture type" size="7" start="100" type="uint"/>
++    <field name="Image Depth" size="14" start="86" type="uint"/>
++    <field name="Image Height" size="14" start="72" type="uint"/>
++    <field name="Image Width" size="14" start="58" type="uint"/>
++
++    <!-- V3D 7.1.2 doesn't have the RB swap bit and has Array Stride starting
++         at bit 32. However, 7.1.5 included the RB swap bit at bit 32 and has
++         Array Stride starting at 33, which is backwards incompatible,
++         We use the definition from 7.1.5.
++    -->
++    <field name="Array Stride (64-byte aligned)" size="24" start="33" type="uint"/>
++    <field name="R/B swap" size="1" start="32" type="bool"/>
++
++    <field name="Texture base pointer" size="32" start="0" type="address"/>
++
++    <field name="Reverse" size="1" start="5" type="bool"/>
++    <field name="Transfer func" size="3" start="2" type="uint">
++      <value name="Transfer Func None" value="0"/>
++      <value name="Transfer Func sRGB" value="1"/>
++      <value name="Transfer Func PQ" value="2"/>
++      <value name="Transfer Func HLG" value="3"/>
++      <value name="Transfer Func PQ BT1886" value="4"/>
++      <value name="Transfer Func HLG BT1886" value="5"/>
++    </field>
++    <field name="Flip texture Y Axis" size="1" start="1" type="bool"/>
++    <field name="Flip texture X Axis" size="1" start="0" type="bool"/>
++  </struct>
++
+   <struct name="Sampler State" min_ver="41">
+     <field name="Border color word 3" size="32" start="160" type="uint"/>
+     <field name="Border color word 2" size="32" start="128" type="uint"/>
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0004-broadcom-common-retrieve-V3D-revision-number.patch b/projects/RPi/devices/RPi5/patches/mesa/0004-broadcom-common-retrieve-V3D-revision-number.patch
new file mode 100644
index 0000000000..6f2fe867f4
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0004-broadcom-common-retrieve-V3D-revision-number.patch
@@ -0,0 +1,65 @@
+From 569cbe4229df737ce5915c4be2cad534707fb4f7 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 9 Nov 2021 08:50:51 +0100
+Subject: [PATCH 004/142] broadcom/common: retrieve V3D revision number
+
+The subrev field from the hub ident3 register is bumped with every
+hardware revision doing backwards incompatible changes so we want to
+keep track of this.
+
+Instead of modifying the 'ver' field info to accommodate subrev info,
+which would require a lot of changes, simply add a new 'rev' field in
+devinfo that we can use when we need to make changes based on the
+revision number of a hardware release.
+---
+ src/broadcom/common/v3d_device_info.c | 14 +++++++++++++-
+ src/broadcom/common/v3d_device_info.h |  3 +++
+ 2 files changed, 16 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
+index 7e0862f1f02..7512fe3a06b 100644
+--- a/src/broadcom/common/v3d_device_info.c
++++ b/src/broadcom/common/v3d_device_info.c
+@@ -36,6 +36,9 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
+     struct drm_v3d_get_param ident1 = {
+             .param = DRM_V3D_PARAM_V3D_CORE0_IDENT1,
+     };
++    struct drm_v3d_get_param hub_ident3 = {
++            .param = DRM_V3D_PARAM_V3D_HUB_IDENT3,
++    };
+     int ret;
+ 
+     ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &ident0);
+@@ -76,5 +79,14 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
+                 return false;
+     }
+ 
+-    return true;
++    ret = drm_ioctl(fd, DRM_IOCTL_V3D_GET_PARAM, &hub_ident3);
++    if (ret != 0) {
++            fprintf(stderr, "Couldn't get V3D core HUB IDENT3: %s\n",
++                    strerror(errno));
++            return false;
++    }
++
++   devinfo->rev = (hub_ident3.value >> 8) & 0xff;
++
++   return true;
+ }
+diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
+index 97abd9b8d9f..32cb65cf81f 100644
+--- a/src/broadcom/common/v3d_device_info.h
++++ b/src/broadcom/common/v3d_device_info.h
+@@ -34,6 +34,9 @@ struct v3d_device_info {
+         /** Simple V3D version: major * 10 + minor */
+         uint8_t ver;
+ 
++        /** V3D revision number */
++        uint8_t rev;
++
+         /** Size of the VPM, in bytes. */
+         int vpm_size;
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0005-broadcom-common-add-some-common-v71-helpers.patch b/projects/RPi/devices/RPi5/patches/mesa/0005-broadcom-common-add-some-common-v71-helpers.patch
new file mode 100644
index 0000000000..2f07c250d8
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0005-broadcom-common-add-some-common-v71-helpers.patch
@@ -0,0 +1,91 @@
+From c260843c882d25bd31e308566b45d4517fda0fa2 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 17 Nov 2021 14:40:47 +0100
+Subject: [PATCH 005/142] broadcom/common: add some common v71 helpers
+
+---
+ src/broadcom/common/v3d_util.c | 27 +++++++++++++++++++++++++++
+ src/broadcom/common/v3d_util.h | 27 +++++++++++++++++++++++++++
+ 2 files changed, 54 insertions(+)
+
+diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
+index 57872a923d3..26f5c6b336f 100644
+--- a/src/broadcom/common/v3d_util.c
++++ b/src/broadcom/common/v3d_util.c
+@@ -170,3 +170,30 @@ v3d_hw_prim_type(enum mesa_prim prim_type)
+       unreachable("Unsupported primitive type");
+    }
+ }
++
++uint32_t
++v3d_internal_bpp_words(uint32_t internal_bpp)
++{
++        switch (internal_bpp) {
++        case 0 /* V3D_INTERNAL_BPP_32 */:
++                return 1;
++        case 1 /* V3D_INTERNAL_BPP_64 */:
++                return 2;
++        case 2 /* V3D_INTERNAL_BPP_128 */:
++                return 4;
++        default:
++                unreachable("Unsupported internal BPP");
++        }
++}
++
++uint32_t
++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
++                                       uint32_t bpp)
++{
++        /* stride in multiples of 128 bits, and covers 2 rows. This is the
++         * reason we divide by 2 instead of 4, as we divide number of 32-bit
++         * words per row by 2.
++         */
++
++        return (tile_width * bpp) / 2;
++}
+diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
+index eb802b77f67..864fc949ffa 100644
+--- a/src/broadcom/common/v3d_util.h
++++ b/src/broadcom/common/v3d_util.h
+@@ -24,6 +24,7 @@
+ #ifndef V3D_UTIL_H
+ #define V3D_UTIL_H
+ 
++#include "util/macros.h"
+ #include "common/v3d_device_info.h"
+ #include "pipe/p_defines.h"
+ 
+@@ -46,4 +47,30 @@ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
+ uint32_t
+ v3d_hw_prim_type(enum mesa_prim prim_type);
+ 
++uint32_t
++v3d_internal_bpp_words(uint32_t internal_bpp);
++
++/* Some configuration packets want the size on log2, but starting at 0 for
++ * size 8.
++ */
++static inline uint8_t
++log2_tile_size(uint32_t size)
++{
++        switch(size) {
++        case 8:
++                return 0;
++        case 16:
++                return 1;
++        case 32:
++                return 2;
++        case 64:
++                return 3;
++        default:
++                unreachable("Unsupported tile width/height");
++        }
++}
++
++uint32_t
++v3d_compute_rt_row_row_stride_128_bits(uint32_t tile_width,
++                                       uint32_t bpp);
+ #endif
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0006-broadcom-qpu-add-comments-on-waddr-not-used-on-V3D-7.patch b/projects/RPi/devices/RPi5/patches/mesa/0006-broadcom-qpu-add-comments-on-waddr-not-used-on-V3D-7.patch
new file mode 100644
index 0000000000..0250d31af5
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0006-broadcom-qpu-add-comments-on-waddr-not-used-on-V3D-7.patch
@@ -0,0 +1,53 @@
+From a5211a4d71acc53183d2a90eb1694d8cce6eb44f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 5 Aug 2021 01:03:11 +0200
+Subject: [PATCH 006/142] broadcom/qpu: add comments on waddr not used on V3D
+ 7.x
+
+---
+ src/broadcom/qpu/qpu_instr.h | 22 +++++++++++-----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 2e133472698..45a0cad9760 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -88,11 +88,11 @@ enum v3d_qpu_uf {
+ };
+ 
+ enum v3d_qpu_waddr {
+-        V3D_QPU_WADDR_R0 = 0,
+-        V3D_QPU_WADDR_R1 = 1,
+-        V3D_QPU_WADDR_R2 = 2,
+-        V3D_QPU_WADDR_R3 = 3,
+-        V3D_QPU_WADDR_R4 = 4,
++        V3D_QPU_WADDR_R0 = 0,    /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_R1 = 1,    /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_R2 = 2,    /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_R3 = 3,    /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_R4 = 4,    /* Reserved on V3D 7.x */
+         V3D_QPU_WADDR_R5 = 5,
+         V3D_QPU_WADDR_NOP = 6,
+         V3D_QPU_WADDR_TLB = 7,
+@@ -108,12 +108,12 @@ enum v3d_qpu_waddr {
+         V3D_QPU_WADDR_SYNC = 16,
+         V3D_QPU_WADDR_SYNCU = 17,
+         V3D_QPU_WADDR_SYNCB = 18,
+-        V3D_QPU_WADDR_RECIP = 19,
+-        V3D_QPU_WADDR_RSQRT = 20,
+-        V3D_QPU_WADDR_EXP = 21,
+-        V3D_QPU_WADDR_LOG = 22,
+-        V3D_QPU_WADDR_SIN = 23,
+-        V3D_QPU_WADDR_RSQRT2 = 24,
++        V3D_QPU_WADDR_RECIP = 19,  /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_RSQRT = 20,  /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_EXP = 21,    /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_LOG = 22,    /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_SIN = 23,    /* Reserved on V3D 7.x */
++        V3D_QPU_WADDR_RSQRT2 = 24, /* Reserved on V3D 7.x */
+         V3D_QPU_WADDR_TMUC = 32,
+         V3D_QPU_WADDR_TMUS = 33,
+         V3D_QPU_WADDR_TMUT = 34,
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0007-broadcom-qpu-set-V3D-7.x-names-for-some-waddr-aliasi.patch b/projects/RPi/devices/RPi5/patches/mesa/0007-broadcom-qpu-set-V3D-7.x-names-for-some-waddr-aliasi.patch
new file mode 100644
index 0000000000..2a1a7ae248
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0007-broadcom-qpu-set-V3D-7.x-names-for-some-waddr-aliasi.patch
@@ -0,0 +1,60 @@
+From 0ccf3043e4a584e5592bb7fad737d5d98ed23db0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 5 Aug 2021 01:00:47 +0200
+Subject: [PATCH 007/142] broadcom/qpu: set V3D 7.x names for some waddr
+ aliasing
+
+V3D 7.x got rid of the accumulator, but still uses the values for
+WADDR_R5 and WADDR_R5REP, so let's return a proper name and add some
+aliases.
+---
+ src/broadcom/qpu/qpu_instr.c | 8 ++++++++
+ src/broadcom/qpu/qpu_instr.h | 6 ++++--
+ 2 files changed, 12 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index 60dabf74e8e..7759fb0efdf 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -35,6 +35,14 @@ v3d_qpu_magic_waddr_name(const struct v3d_device_info *devinfo,
+         if (devinfo->ver < 40 && waddr == V3D_QPU_WADDR_TMU)
+                 return "tmu";
+ 
++        /* V3D 7.x QUAD and REP aliases R5 and R5REP in the table below
++         */
++        if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_QUAD)
++                return "quad";
++
++        if (devinfo->ver >= 71 && waddr == V3D_QPU_WADDR_REP)
++                return "rep";
++
+         static const char *waddr_magic[] = {
+                 [V3D_QPU_WADDR_R0] = "r0",
+                 [V3D_QPU_WADDR_R1] = "r1",
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 45a0cad9760..19bf721dbe1 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -93,7 +93,8 @@ enum v3d_qpu_waddr {
+         V3D_QPU_WADDR_R2 = 2,    /* Reserved on V3D 7.x */
+         V3D_QPU_WADDR_R3 = 3,    /* Reserved on V3D 7.x */
+         V3D_QPU_WADDR_R4 = 4,    /* Reserved on V3D 7.x */
+-        V3D_QPU_WADDR_R5 = 5,
++        V3D_QPU_WADDR_R5 = 5,    /* V3D 4.x */
++        V3D_QPU_WADDR_QUAD = 5,  /* V3D 7.x */
+         V3D_QPU_WADDR_NOP = 6,
+         V3D_QPU_WADDR_TLB = 7,
+         V3D_QPU_WADDR_TLBU = 8,
+@@ -129,7 +130,8 @@ enum v3d_qpu_waddr {
+         V3D_QPU_WADDR_TMUHSCM = 44,
+         V3D_QPU_WADDR_TMUHSF = 45,
+         V3D_QPU_WADDR_TMUHSLOD = 46,
+-        V3D_QPU_WADDR_R5REP = 55,
++        V3D_QPU_WADDR_R5REP = 55, /* V3D 4.x */
++        V3D_QPU_WADDR_REP = 55,   /* V3D 7.x */
+ };
+ 
+ struct v3d_qpu_flags {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0008-broadcom-compiler-rename-small_imm-to-small_imm_b.patch b/projects/RPi/devices/RPi5/patches/mesa/0008-broadcom-compiler-rename-small_imm-to-small_imm_b.patch
new file mode 100644
index 0000000000..96d81a2c1a
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0008-broadcom-compiler-rename-small_imm-to-small_imm_b.patch
@@ -0,0 +1,241 @@
+From 18de3cc85cf8bbe294e044f7a12abe14e554de0a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Sun, 19 Sep 2021 03:20:18 +0200
+Subject: [PATCH 008/142] broadcom/compiler: rename small_imm to small_imm_b
+
+Current small_imm is associated with the "B" read address.
+
+We do this change in advance for v71 support, where we will have 4
+different small_imm (a/b/c/d), so we start with a renaming.
+---
+ src/broadcom/compiler/qpu_schedule.c          | 22 +++++++++----------
+ .../compiler/vir_opt_small_immediates.c       |  4 ++--
+ src/broadcom/compiler/vir_to_qpu.c            |  2 +-
+ src/broadcom/qpu/qpu_disasm.c                 |  2 +-
+ src/broadcom/qpu/qpu_instr.h                  |  2 +-
+ src/broadcom/qpu/qpu_pack.c                   | 22 +++++++++----------
+ 6 files changed, 27 insertions(+), 27 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 3b32b48f86f..a10fa03ed10 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -160,7 +160,7 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
+                 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
+                 break;
+         case V3D_QPU_MUX_B:
+-                if (!n->inst->qpu.sig.small_imm) {
++                if (!n->inst->qpu.sig.small_imm_b) {
+                         add_read_dep(state,
+                                      state->last_rf[n->inst->qpu.raddr_b], n);
+                 }
+@@ -615,7 +615,7 @@ qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
+               return true;
+ 
+         if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+-            !inst->sig.small_imm && (inst->raddr_b == waddr))
++            !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+               return true;
+ 
+         return false;
+@@ -790,11 +790,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
+         uint64_t raddrs_used = 0;
+         if (v3d_qpu_uses_mux(a, V3D_QPU_MUX_A))
+                 raddrs_used |= (1ll << a->raddr_a);
+-        if (!a->sig.small_imm && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
++        if (!a->sig.small_imm_b && v3d_qpu_uses_mux(a, V3D_QPU_MUX_B))
+                 raddrs_used |= (1ll << a->raddr_b);
+         if (v3d_qpu_uses_mux(b, V3D_QPU_MUX_A))
+                 raddrs_used |= (1ll << b->raddr_a);
+-        if (!b->sig.small_imm && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
++        if (!b->sig.small_imm_b && v3d_qpu_uses_mux(b, V3D_QPU_MUX_B))
+                 raddrs_used |= (1ll << b->raddr_b);
+ 
+         return raddrs_used;
+@@ -816,16 +816,16 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
+         if (naddrs > 2)
+                 return false;
+ 
+-        if ((add_instr->sig.small_imm || mul_instr->sig.small_imm)) {
++        if ((add_instr->sig.small_imm_b || mul_instr->sig.small_imm_b)) {
+                 if (naddrs > 1)
+                         return false;
+ 
+-                if (add_instr->sig.small_imm && mul_instr->sig.small_imm)
++                if (add_instr->sig.small_imm_b && mul_instr->sig.small_imm_b)
+                         if (add_instr->raddr_b != mul_instr->raddr_b)
+                                 return false;
+ 
+-                result->sig.small_imm = true;
+-                result->raddr_b = add_instr->sig.small_imm ?
++                result->sig.small_imm_b = true;
++                result->raddr_b = add_instr->sig.small_imm_b ?
+                         add_instr->raddr_b : mul_instr->raddr_b;
+         }
+ 
+@@ -836,7 +836,7 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
+         raddrs_used &= ~(1ll << raddr_a);
+         result->raddr_a = raddr_a;
+ 
+-        if (!result->sig.small_imm) {
++        if (!result->sig.small_imm_b) {
+                 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
+                     raddr_a == add_instr->raddr_b) {
+                         if (add_instr->alu.add.a == V3D_QPU_MUX_B)
+@@ -1025,7 +1025,7 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
+         merge.sig.ldtmu |= b->sig.ldtmu;
+         merge.sig.ldvary |= b->sig.ldvary;
+         merge.sig.ldvpm |= b->sig.ldvpm;
+-        merge.sig.small_imm |= b->sig.small_imm;
++        merge.sig.small_imm_b |= b->sig.small_imm_b;
+         merge.sig.ldtlb |= b->sig.ldtlb;
+         merge.sig.ldtlbu |= b->sig.ldtlbu;
+         merge.sig.ucb |= b->sig.ucb;
+@@ -1614,7 +1614,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+                         return false;
+ 
+                 if (inst->raddr_b < 3 &&
+-                    !inst->sig.small_imm &&
++                    !inst->sig.small_imm_b &&
+                     v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+                         return false;
+                 }
+diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
+index 47d7722968d..df0d6c36c9b 100644
+--- a/src/broadcom/compiler/vir_opt_small_immediates.c
++++ b/src/broadcom/compiler/vir_opt_small_immediates.c
+@@ -80,7 +80,7 @@ vir_opt_small_immediates(struct v3d_compile *c)
+                          */
+                         struct v3d_qpu_sig new_sig = inst->qpu.sig;
+                         uint32_t sig_packed;
+-                        new_sig.small_imm = true;
++                        new_sig.small_imm_b = true;
+                         if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
+                                 continue;
+ 
+@@ -89,7 +89,7 @@ vir_opt_small_immediates(struct v3d_compile *c)
+                                 vir_dump_inst(c, inst);
+                                 fprintf(stderr, "\n");
+                         }
+-                        inst->qpu.sig.small_imm = true;
++                        inst->qpu.sig.small_imm_b = true;
+                         inst->qpu.raddr_b = packed;
+ 
+                         inst->src[i].file = QFILE_SMALL_IMM;
+diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
+index 45e6bfa1470..15c2e3674c2 100644
+--- a/src/broadcom/compiler/vir_to_qpu.c
++++ b/src/broadcom/compiler/vir_to_qpu.c
+@@ -94,7 +94,7 @@ static void
+ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+ {
+         if (src.smimm) {
+-                assert(instr->sig.small_imm);
++                assert(instr->sig.small_imm_b);
+                 *mux = V3D_QPU_MUX_B;
+                 return;
+         }
+diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
+index 28fb2357b97..6aca3c28e78 100644
+--- a/src/broadcom/qpu/qpu_disasm.c
++++ b/src/broadcom/qpu/qpu_disasm.c
+@@ -62,7 +62,7 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+         if (mux == V3D_QPU_MUX_A) {
+                 append(disasm, "rf%d", instr->raddr_a);
+         } else if (mux == V3D_QPU_MUX_B) {
+-                if (instr->sig.small_imm) {
++                if (instr->sig.small_imm_b) {
+                         uint32_t val;
+                         ASSERTED bool ok =
+                                 v3d_qpu_small_imm_unpack(disasm->devinfo,
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 19bf721dbe1..9cd831863b4 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -50,7 +50,7 @@ struct v3d_qpu_sig {
+         bool ldvpm:1;
+         bool ldtlb:1;
+         bool ldtlbu:1;
+-        bool small_imm:1;
++        bool small_imm_b:1;
+         bool ucb:1;
+         bool rotate:1;
+         bool wrtmuc:1;
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index a875683c6f8..beac591d3c1 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -112,7 +112,7 @@
+ #define LDTMU .ldtmu = true
+ #define LDVARY .ldvary = true
+ #define LDVPM .ldvpm = true
+-#define SMIMM .small_imm = true
++#define SMIMM_B .small_imm_b = true
+ #define LDTLB .ldtlb = true
+ #define LDTLBU .ldtlbu = true
+ #define UCB .ucb = true
+@@ -135,8 +135,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
+         [11] = { THRSW, LDVARY,         LDUNIF },
+         [12] = {        LDVARY, LDTMU,         },
+         [13] = { THRSW, LDVARY, LDTMU,         },
+-        [14] = { SMIMM, LDVARY,                },
+-        [15] = { SMIMM,                        },
++        [14] = { SMIMM_B, LDVARY,              },
++        [15] = { SMIMM_B,                      },
+         [16] = {        LDTLB,                 },
+         [17] = {        LDTLBU,                },
+         /* 18-21 reserved */
+@@ -148,8 +148,8 @@ static const struct v3d_qpu_sig v33_sig_map[] = {
+         [27] = { THRSW, LDVPM,          LDUNIF },
+         [28] = {        LDVPM, LDTMU,          },
+         [29] = { THRSW, LDVPM, LDTMU,          },
+-        [30] = { SMIMM, LDVPM,                 },
+-        [31] = { SMIMM,                        },
++        [30] = { SMIMM_B, LDVPM,               },
++        [31] = { SMIMM_B,                      },
+ };
+ 
+ static const struct v3d_qpu_sig v40_sig_map[] = {
+@@ -167,8 +167,8 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
+         [10] = {        LDVARY,         LDUNIF },
+         [11] = { THRSW, LDVARY,         LDUNIF },
+         /* 12-13 reserved */
+-        [14] = { SMIMM, LDVARY,                },
+-        [15] = { SMIMM,                        },
++        [14] = { SMIMM_B, LDVARY,              },
++        [15] = { SMIMM_B,                      },
+         [16] = {        LDTLB,                 },
+         [17] = {        LDTLBU,                },
+         [18] = {                        WRTMUC },
+@@ -178,7 +178,7 @@ static const struct v3d_qpu_sig v40_sig_map[] = {
+         [22] = { UCB,                          },
+         [23] = { ROT,                          },
+         /* 24-30 reserved */
+-        [31] = { SMIMM,         LDTMU,         },
++        [31] = { SMIMM_B,       LDTMU,         },
+ };
+ 
+ static const struct v3d_qpu_sig v41_sig_map[] = {
+@@ -197,8 +197,8 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
+         [11] = { THRSW,    LDVARY, LDUNIF },
+         [12] = { LDUNIFRF                 },
+         [13] = { THRSW,    LDUNIFRF       },
+-        [14] = { SMIMM,    LDVARY,        },
+-        [15] = { SMIMM,                   },
++        [14] = { SMIMM_B,    LDVARY       },
++        [15] = { SMIMM_B,                 },
+         [16] = {           LDTLB,         },
+         [17] = {           LDTLBU,        },
+         [18] = {                          WRTMUC },
+@@ -210,7 +210,7 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
+         [24] = {                   LDUNIFA},
+         [25] = { LDUNIFARF                },
+         /* 26-30 reserved */
+-        [31] = { SMIMM,            LDTMU, },
++        [31] = { SMIMM_B,          LDTMU, },
+ };
+ 
+ bool
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0009-broadcom-compiler-add-small_imm-a-c-d-on-v3d_qpu_sig.patch b/projects/RPi/devices/RPi5/patches/mesa/0009-broadcom-compiler-add-small_imm-a-c-d-on-v3d_qpu_sig.patch
new file mode 100644
index 0000000000..02e8c47d7e
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0009-broadcom-compiler-add-small_imm-a-c-d-on-v3d_qpu_sig.patch
@@ -0,0 +1,53 @@
+From 0e87405fe73694c173b7ce14c3d60611f241922c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 5 Aug 2021 00:50:12 +0200
+Subject: [PATCH 009/142] broadcom/compiler: add small_imm a/c/d on v3d_qpu_sig
+
+small_imm_a, small_imm_c and small_imm_d added on top of the already
+existing small_imm_b, as V3D 7.1 defines 4 small immediates, tied to
+the 4 raddr. Note that this is only the definition, plus an instruction
+validation rule to check that they are not used before v71. Any real use
+is still pending.
+---
+ src/broadcom/compiler/qpu_validate.c | 5 +++++
+ src/broadcom/qpu/qpu_instr.h         | 5 ++++-
+ 2 files changed, 9 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
+index 2cc7a0eb0ae..12788692432 100644
+--- a/src/broadcom/compiler/qpu_validate.c
++++ b/src/broadcom/compiler/qpu_validate.c
+@@ -115,6 +115,11 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
+         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+                 return;
+ 
++        if (devinfo->ver < 71) {
++           if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d)
++              fail_instr(state, "small imm a/c/d added after V3D 7.1");
++        }
++
+         /* LDVARY writes r5 two instructions later and LDUNIF writes
+          * r5 one instruction later, which is illegal to have
+          * together.
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 9cd831863b4..13b3f37d43f 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -50,10 +50,13 @@ struct v3d_qpu_sig {
+         bool ldvpm:1;
+         bool ldtlb:1;
+         bool ldtlbu:1;
+-        bool small_imm_b:1;
+         bool ucb:1;
+         bool rotate:1;
+         bool wrtmuc:1;
++        bool small_imm_a:1; /* raddr_a (add a), since V3D 7.x */
++        bool small_imm_b:1; /* raddr_b (add b) */
++        bool small_imm_c:1; /* raddr_c (mul a), since V3D 7.x */
++        bool small_imm_d:1; /* raddr_d (mul b), since V3D 7.x */
+ };
+ 
+ enum v3d_qpu_cond {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0010-broadcom-qpu-add-v71-signal-map.patch b/projects/RPi/devices/RPi5/patches/mesa/0010-broadcom-qpu-add-v71-signal-map.patch
new file mode 100644
index 0000000000..a2d2598b9f
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0010-broadcom-qpu-add-v71-signal-map.patch
@@ -0,0 +1,106 @@
+From eca19c911d9af3b0ab3b563ea65dc455e3d27987 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 4 Aug 2021 01:11:16 +0200
+Subject: [PATCH 010/142] broadcom/qpu: add v71 signal map
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Compared with v41, the differences are:
+   * 14, 15, 30 and 31 are now about immediate a, b, c, d respectively
+   * 23 is now reserved. On v42 this was for rotate signals, that are
+     gone on v71.
+
+Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
+Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
+---
+ src/broadcom/qpu/qpu_pack.c | 47 ++++++++++++++++++++++++++++++++++---
+ 1 file changed, 44 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index beac591d3c1..2820d9d4c56 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -112,12 +112,15 @@
+ #define LDTMU .ldtmu = true
+ #define LDVARY .ldvary = true
+ #define LDVPM .ldvpm = true
+-#define SMIMM_B .small_imm_b = true
+ #define LDTLB .ldtlb = true
+ #define LDTLBU .ldtlbu = true
+ #define UCB .ucb = true
+ #define ROT .rotate = true
+ #define WRTMUC .wrtmuc = true
++#define SMIMM_A .small_imm_a = true
++#define SMIMM_B .small_imm_b = true
++#define SMIMM_C .small_imm_c = true
++#define SMIMM_D .small_imm_d = true
+ 
+ static const struct v3d_qpu_sig v33_sig_map[] = {
+         /*      MISC   R3       R4      R5 */
+@@ -213,6 +216,40 @@ static const struct v3d_qpu_sig v41_sig_map[] = {
+         [31] = { SMIMM_B,          LDTMU, },
+ };
+ 
++
++static const struct v3d_qpu_sig v71_sig_map[] = {
++        /*      MISC       phys    RF0 */
++        [0]  = {                          },
++        [1]  = { THRSW,                   },
++        [2]  = {                   LDUNIF },
++        [3]  = { THRSW,            LDUNIF },
++        [4]  = {           LDTMU,         },
++        [5]  = { THRSW,    LDTMU,         },
++        [6]  = {           LDTMU,  LDUNIF },
++        [7]  = { THRSW,    LDTMU,  LDUNIF },
++        [8]  = {           LDVARY,        },
++        [9]  = { THRSW,    LDVARY,        },
++        [10] = {           LDVARY, LDUNIF },
++        [11] = { THRSW,    LDVARY, LDUNIF },
++        [12] = { LDUNIFRF                 },
++        [13] = { THRSW,    LDUNIFRF       },
++        [14] = { SMIMM_A,                 },
++        [15] = { SMIMM_B,                 },
++        [16] = {           LDTLB,         },
++        [17] = {           LDTLBU,        },
++        [18] = {                          WRTMUC },
++        [19] = { THRSW,                   WRTMUC },
++        [20] = {           LDVARY,        WRTMUC },
++        [21] = { THRSW,    LDVARY,        WRTMUC },
++        [22] = { UCB,                     },
++        /* 23 reserved */
++        [24] = {                   LDUNIFA},
++        [25] = { LDUNIFARF                },
++        /* 26-29 reserved */
++        [30] = { SMIMM_C,                 },
++        [31] = { SMIMM_D,                 },
++};
++
+ bool
+ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
+                    uint32_t packed_sig,
+@@ -221,7 +258,9 @@ v3d_qpu_sig_unpack(const struct v3d_device_info *devinfo,
+         if (packed_sig >= ARRAY_SIZE(v33_sig_map))
+                 return false;
+ 
+-        if (devinfo->ver >= 41)
++        if (devinfo->ver >= 71)
++                *sig = v71_sig_map[packed_sig];
++        else if (devinfo->ver >= 41)
+                 *sig = v41_sig_map[packed_sig];
+         else if (devinfo->ver == 40)
+                 *sig = v40_sig_map[packed_sig];
+@@ -240,7 +279,9 @@ v3d_qpu_sig_pack(const struct v3d_device_info *devinfo,
+ {
+         static const struct v3d_qpu_sig *map;
+ 
+-        if (devinfo->ver >= 41)
++        if (devinfo->ver >= 71)
++                map = v71_sig_map;
++        else if (devinfo->ver >= 41)
+                 map = v41_sig_map;
+         else if (devinfo->ver == 40)
+                 map = v40_sig_map;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0011-broadcom-qpu-define-v3d_qpu_input-use-on-v3d_qpu_alu.patch b/projects/RPi/devices/RPi5/patches/mesa/0011-broadcom-qpu-define-v3d_qpu_input-use-on-v3d_qpu_alu.patch
new file mode 100644
index 0000000000..d5813b8c05
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0011-broadcom-qpu-define-v3d_qpu_input-use-on-v3d_qpu_alu.patch
@@ -0,0 +1,778 @@
+From d10e67a396d713ec81fb133f3516e09fe1e067b6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 6 Aug 2021 01:22:31 +0200
+Subject: [PATCH 011/142] broadcom/qpu: define v3d_qpu_input, use on
+ v3d_qpu_alu_instr
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+At this point it just tidies up the alu_instr structure a little.
+
+But it also serves to prepare the structure for new changes, as 7.x uses
+raddr instead of mux, and it is just easier to add the raddr to the
+new input structure.
+
+Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
+Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
+---
+ src/broadcom/compiler/qpu_schedule.c          | 65 +++++++--------
+ src/broadcom/compiler/vir.c                   | 16 ++--
+ src/broadcom/compiler/vir_dump.c              |  8 +-
+ .../compiler/vir_opt_copy_propagate.c         | 12 +--
+ .../compiler/vir_opt_redundant_flags.c        |  8 +-
+ src/broadcom/compiler/vir_to_qpu.c            | 30 +++----
+ src/broadcom/qpu/qpu_disasm.c                 | 16 ++--
+ src/broadcom/qpu/qpu_instr.c                  |  8 +-
+ src/broadcom/qpu/qpu_instr.h                  | 13 +--
+ src/broadcom/qpu/qpu_pack.c                   | 82 +++++++++----------
+ src/broadcom/qpu/tests/qpu_disasm.c           |  8 +-
+ 11 files changed, 134 insertions(+), 132 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index a10fa03ed10..455fa3867be 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -306,14 +306,14 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
+         /* XXX: LOAD_IMM */
+ 
+         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
+-                process_mux_deps(state, n, inst->alu.add.a);
++                process_mux_deps(state, n, inst->alu.add.a.mux);
+         if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
+-                process_mux_deps(state, n, inst->alu.add.b);
++                process_mux_deps(state, n, inst->alu.add.b.mux);
+ 
+         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
+-                process_mux_deps(state, n, inst->alu.mul.a);
++                process_mux_deps(state, n, inst->alu.mul.a.mux);
+         if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
+-                process_mux_deps(state, n, inst->alu.mul.b);
++                process_mux_deps(state, n, inst->alu.mul.b.mux);
+ 
+         switch (inst->alu.add.op) {
+         case V3D_QPU_A_VPMSETUP:
+@@ -537,22 +537,22 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+ 
+         if (inst->alu.add.op != V3D_QPU_A_NOP) {
+                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a)) {
++                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) {
+                         return true;
+                 }
+                 if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b)) {
++                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) {
+                         return true;
+                 }
+         }
+ 
+         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
+                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a)) {
++                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) {
+                         return true;
+                 }
+                 if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b)) {
++                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) {
+                         return true;
+                 }
+         }
+@@ -839,20 +839,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
+         if (!result->sig.small_imm_b) {
+                 if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_B) &&
+                     raddr_a == add_instr->raddr_b) {
+-                        if (add_instr->alu.add.a == V3D_QPU_MUX_B)
+-                                result->alu.add.a = V3D_QPU_MUX_A;
+-                        if (add_instr->alu.add.b == V3D_QPU_MUX_B &&
++                        if (add_instr->alu.add.a.mux == V3D_QPU_MUX_B)
++                                result->alu.add.a.mux = V3D_QPU_MUX_A;
++                        if (add_instr->alu.add.b.mux == V3D_QPU_MUX_B &&
+                             v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
+-                                result->alu.add.b = V3D_QPU_MUX_A;
++                                result->alu.add.b.mux = V3D_QPU_MUX_A;
+                         }
+                 }
+                 if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_B) &&
+                     raddr_a == mul_instr->raddr_b) {
+-                        if (mul_instr->alu.mul.a == V3D_QPU_MUX_B)
+-                                result->alu.mul.a = V3D_QPU_MUX_A;
+-                        if (mul_instr->alu.mul.b == V3D_QPU_MUX_B &&
++                        if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_B)
++                                result->alu.mul.a.mux = V3D_QPU_MUX_A;
++                        if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_B &&
+                             v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
+-                                result->alu.mul.b = V3D_QPU_MUX_A;
++                                result->alu.mul.b.mux = V3D_QPU_MUX_A;
+                         }
+                 }
+         }
+@@ -863,20 +863,20 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
+         result->raddr_b = raddr_b;
+         if (v3d_qpu_uses_mux(add_instr, V3D_QPU_MUX_A) &&
+             raddr_b == add_instr->raddr_a) {
+-                if (add_instr->alu.add.a == V3D_QPU_MUX_A)
+-                        result->alu.add.a = V3D_QPU_MUX_B;
+-                if (add_instr->alu.add.b == V3D_QPU_MUX_A &&
++                if (add_instr->alu.add.a.mux == V3D_QPU_MUX_A)
++                        result->alu.add.a.mux = V3D_QPU_MUX_B;
++                if (add_instr->alu.add.b.mux == V3D_QPU_MUX_A &&
+                     v3d_qpu_add_op_num_src(add_instr->alu.add.op) > 1) {
+-                        result->alu.add.b = V3D_QPU_MUX_B;
++                        result->alu.add.b.mux = V3D_QPU_MUX_B;
+                 }
+         }
+         if (v3d_qpu_uses_mux(mul_instr, V3D_QPU_MUX_A) &&
+             raddr_b == mul_instr->raddr_a) {
+-                if (mul_instr->alu.mul.a == V3D_QPU_MUX_A)
+-                        result->alu.mul.a = V3D_QPU_MUX_B;
+-                if (mul_instr->alu.mul.b == V3D_QPU_MUX_A &&
++                if (mul_instr->alu.mul.a.mux == V3D_QPU_MUX_A)
++                        result->alu.mul.a.mux = V3D_QPU_MUX_B;
++                if (mul_instr->alu.mul.b.mux == V3D_QPU_MUX_A &&
+                     v3d_qpu_mul_op_num_src(mul_instr->alu.mul.op) > 1) {
+-                        result->alu.mul.b = V3D_QPU_MUX_B;
++                        result->alu.mul.b.mux = V3D_QPU_MUX_B;
+                 }
+         }
+ 
+@@ -927,11 +927,12 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+         inst->flags.auf = V3D_QPU_UF_NONE;
+ 
+         inst->alu.mul.output_pack = inst->alu.add.output_pack;
+-        inst->alu.mul.a_unpack = inst->alu.add.a_unpack;
+-        inst->alu.mul.b_unpack = inst->alu.add.b_unpack;
++
++        inst->alu.mul.a.unpack = inst->alu.add.a.unpack;
++        inst->alu.mul.b.unpack = inst->alu.add.b.unpack;
+         inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
+-        inst->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
+-        inst->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
++        inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
++        inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ }
+ 
+ static bool
+@@ -2064,12 +2065,12 @@ alu_reads_register(struct v3d_qpu_instr *inst,
+ 
+         if (add) {
+                 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
+-                mux_a = inst->alu.add.a;
+-                mux_b = inst->alu.add.b;
++                mux_a = inst->alu.add.a.mux;
++                mux_b = inst->alu.add.b.mux;
+         } else {
+                 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+-                mux_a = inst->alu.mul.a;
+-                mux_b = inst->alu.mul.b;
++                mux_a = inst->alu.mul.a.mux;
++                mux_b = inst->alu.mul.b.mux;
+         }
+ 
+         for (int i = 0; i < num_src; i++) {
+diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
+index 660b11b0577..007cb0a941b 100644
+--- a/src/broadcom/compiler/vir.c
++++ b/src/broadcom/compiler/vir.c
+@@ -113,10 +113,10 @@ vir_is_raw_mov(struct qinst *inst)
+                 return false;
+         }
+ 
+-        if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
+-            inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
+-            inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
+-            inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
++        if (inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
++            inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE ||
++            inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
++            inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE) {
+                 return false;
+         }
+ 
+@@ -209,15 +209,15 @@ vir_set_unpack(struct qinst *inst, int src,
+ 
+         if (vir_is_add(inst)) {
+                 if (src == 0)
+-                        inst->qpu.alu.add.a_unpack = unpack;
++                        inst->qpu.alu.add.a.unpack = unpack;
+                 else
+-                        inst->qpu.alu.add.b_unpack = unpack;
++                        inst->qpu.alu.add.b.unpack = unpack;
+         } else {
+                 assert(vir_is_mul(inst));
+                 if (src == 0)
+-                        inst->qpu.alu.mul.a_unpack = unpack;
++                        inst->qpu.alu.mul.a.unpack = unpack;
+                 else
+-                        inst->qpu.alu.mul.b_unpack = unpack;
++                        inst->qpu.alu.mul.b.unpack = unpack;
+         }
+ }
+ 
+diff --git a/src/broadcom/compiler/vir_dump.c b/src/broadcom/compiler/vir_dump.c
+index 5c47bbdc1b0..ab5d4043039 100644
+--- a/src/broadcom/compiler/vir_dump.c
++++ b/src/broadcom/compiler/vir_dump.c
+@@ -270,8 +270,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
+                 vir_print_reg(c, inst, inst->dst);
+                 fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.add.output_pack));
+ 
+-                unpack[0] = instr->alu.add.a_unpack;
+-                unpack[1] = instr->alu.add.b_unpack;
++                unpack[0] = instr->alu.add.a.unpack;
++                unpack[1] = instr->alu.add.b.unpack;
+         } else {
+                 fprintf(stderr, "%s", v3d_qpu_mul_op_name(instr->alu.mul.op));
+                 fprintf(stderr, "%s", v3d_qpu_cond_name(instr->flags.mc));
+@@ -282,8 +282,8 @@ vir_dump_alu(struct v3d_compile *c, struct qinst *inst)
+                 vir_print_reg(c, inst, inst->dst);
+                 fprintf(stderr, "%s", v3d_qpu_pack_name(instr->alu.mul.output_pack));
+ 
+-                unpack[0] = instr->alu.mul.a_unpack;
+-                unpack[1] = instr->alu.mul.b_unpack;
++                unpack[0] = instr->alu.mul.a.unpack;
++                unpack[1] = instr->alu.mul.b.unpack;
+         }
+ 
+         for (int i = 0; i < nsrc; i++) {
+diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
+index da121c2a5bd..c4aa7255a17 100644
+--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
++++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
+@@ -104,14 +104,14 @@ vir_has_unpack(struct qinst *inst, int chan)
+ 
+         if (vir_is_add(inst)) {
+                 if (chan == 0)
+-                        return inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE;
++                        return inst->qpu.alu.add.a.unpack != V3D_QPU_UNPACK_NONE;
+                 else
+-                        return inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE;
++                        return inst->qpu.alu.add.b.unpack != V3D_QPU_UNPACK_NONE;
+         } else {
+                 if (chan == 0)
+-                        return inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE;
++                        return inst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE;
+                 else
+-                        return inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE;
++                        return inst->qpu.alu.mul.b.unpack != V3D_QPU_UNPACK_NONE;
+         }
+ }
+ 
+@@ -161,7 +161,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
+                                 continue;
+ 
+                         /* these ops can't represent abs. */
+-                        if (mov->qpu.alu.mul.a_unpack == V3D_QPU_UNPACK_ABS) {
++                        if (mov->qpu.alu.mul.a.unpack == V3D_QPU_UNPACK_ABS) {
+                                 switch (inst->qpu.alu.add.op) {
+                                 case V3D_QPU_A_VFPACK:
+                                 case V3D_QPU_A_FROUND:
+@@ -189,7 +189,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
+ 
+                 inst->src[i] = mov->src[0];
+                 if (vir_has_unpack(mov, 0)) {
+-                        enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a_unpack;
++                        enum v3d_qpu_input_unpack unpack = mov->qpu.alu.mul.a.unpack;
+ 
+                         vir_set_unpack(inst, i, unpack);
+                 }
+diff --git a/src/broadcom/compiler/vir_opt_redundant_flags.c b/src/broadcom/compiler/vir_opt_redundant_flags.c
+index c7896d57f2b..6b61ed6a39a 100644
+--- a/src/broadcom/compiler/vir_opt_redundant_flags.c
++++ b/src/broadcom/compiler/vir_opt_redundant_flags.c
+@@ -81,11 +81,11 @@ vir_instr_flags_op_equal(struct qinst *a, struct qinst *b)
+             a->qpu.flags.mpf != b->qpu.flags.mpf ||
+             a->qpu.alu.add.op != b->qpu.alu.add.op ||
+             a->qpu.alu.mul.op != b->qpu.alu.mul.op ||
+-            a->qpu.alu.add.a_unpack != b->qpu.alu.add.a_unpack ||
+-            a->qpu.alu.add.b_unpack != b->qpu.alu.add.b_unpack ||
++            a->qpu.alu.add.a.unpack != b->qpu.alu.add.a.unpack ||
++            a->qpu.alu.add.b.unpack != b->qpu.alu.add.b.unpack ||
+             a->qpu.alu.add.output_pack != b->qpu.alu.add.output_pack ||
+-            a->qpu.alu.mul.a_unpack != b->qpu.alu.mul.a_unpack ||
+-            a->qpu.alu.mul.b_unpack != b->qpu.alu.mul.b_unpack ||
++            a->qpu.alu.mul.a.unpack != b->qpu.alu.mul.a.unpack ||
++            a->qpu.alu.mul.b.unpack != b->qpu.alu.mul.b.unpack ||
+             a->qpu.alu.mul.output_pack != b->qpu.alu.mul.output_pack) {
+                 return false;
+         }
+diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
+index 15c2e3674c2..c8b6e0a91a0 100644
+--- a/src/broadcom/compiler/vir_to_qpu.c
++++ b/src/broadcom/compiler/vir_to_qpu.c
+@@ -106,20 +106,20 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+                 return;
+         }
+ 
+-        if (instr->alu.add.a != V3D_QPU_MUX_A &&
+-            instr->alu.add.b != V3D_QPU_MUX_A &&
+-            instr->alu.mul.a != V3D_QPU_MUX_A &&
+-            instr->alu.mul.b != V3D_QPU_MUX_A) {
++        if (instr->alu.add.a.mux != V3D_QPU_MUX_A &&
++            instr->alu.add.b.mux != V3D_QPU_MUX_A &&
++            instr->alu.mul.a.mux != V3D_QPU_MUX_A &&
++            instr->alu.mul.b.mux != V3D_QPU_MUX_A) {
+                 instr->raddr_a = src.index;
+                 *mux = V3D_QPU_MUX_A;
+         } else {
+                 if (instr->raddr_a == src.index) {
+                         *mux = V3D_QPU_MUX_A;
+                 } else {
+-                        assert(!(instr->alu.add.a == V3D_QPU_MUX_B &&
+-                                 instr->alu.add.b == V3D_QPU_MUX_B &&
+-                                 instr->alu.mul.a == V3D_QPU_MUX_B &&
+-                                 instr->alu.mul.b == V3D_QPU_MUX_B) ||
++                        assert(!(instr->alu.add.a.mux == V3D_QPU_MUX_B &&
++                                 instr->alu.add.b.mux == V3D_QPU_MUX_B &&
++                                 instr->alu.mul.a.mux == V3D_QPU_MUX_B &&
++                                 instr->alu.mul.b.mux == V3D_QPU_MUX_B) ||
+                                src.index == instr->raddr_b);
+ 
+                         instr->raddr_b = src.index;
+@@ -147,14 +147,14 @@ is_no_op_mov(struct qinst *qinst)
+                 if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
+                         return false;
+ 
+-                if (qinst->qpu.alu.mul.a !=
++                if (qinst->qpu.alu.mul.a.mux !=
+                     V3D_QPU_MUX_R0 + (waddr - V3D_QPU_WADDR_R0)) {
+                         return false;
+                 }
+         } else {
+                 int raddr;
+ 
+-                switch (qinst->qpu.alu.mul.a) {
++                switch (qinst->qpu.alu.mul.a.mux) {
+                 case V3D_QPU_MUX_A:
+                         raddr = qinst->qpu.raddr_a;
+                         break;
+@@ -171,7 +171,7 @@ is_no_op_mov(struct qinst *qinst)
+         /* No packing or flags updates, or we need to execute the
+          * instruction.
+          */
+-        if (qinst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
++        if (qinst->qpu.alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
+             qinst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE ||
+             qinst->qpu.flags.mc != V3D_QPU_COND_NONE ||
+             qinst->qpu.flags.mpf != V3D_QPU_PF_NONE ||
+@@ -302,11 +302,11 @@ v3d_generate_code_block(struct v3d_compile *c,
+                                 assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+                                 if (nsrc >= 1) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.add.a, src[0]);
++                                                &qinst->qpu.alu.add.a.mux, src[0]);
+                                 }
+                                 if (nsrc >= 2) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.add.b, src[1]);
++                                                &qinst->qpu.alu.add.b.mux, src[1]);
+                                 }
+ 
+                                 qinst->qpu.alu.add.waddr = dst.index;
+@@ -314,11 +314,11 @@ v3d_generate_code_block(struct v3d_compile *c,
+                         } else {
+                                 if (nsrc >= 1) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.mul.a, src[0]);
++                                                &qinst->qpu.alu.mul.a.mux, src[0]);
+                                 }
+                                 if (nsrc >= 2) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.mul.b, src[1]);
++                                                &qinst->qpu.alu.mul.b.mux, src[1]);
+                                 }
+ 
+                                 qinst->qpu.alu.mul.waddr = dst.index;
+diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
+index 6aca3c28e78..588a665f770 100644
+--- a/src/broadcom/qpu/qpu_disasm.c
++++ b/src/broadcom/qpu/qpu_disasm.c
+@@ -121,16 +121,16 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
+         if (num_src >= 1) {
+                 if (has_dst)
+                         append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a);
++                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux);
+                 append(disasm, "%s",
+-                       v3d_qpu_unpack_name(instr->alu.add.a_unpack));
++                       v3d_qpu_unpack_name(instr->alu.add.a.unpack));
+         }
+ 
+         if (num_src >= 2) {
+                 append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b);
++                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux);
+                 append(disasm, "%s",
+-                       v3d_qpu_unpack_name(instr->alu.add.b_unpack));
++                       v3d_qpu_unpack_name(instr->alu.add.b.unpack));
+         }
+ }
+ 
+@@ -164,16 +164,16 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
+         if (num_src >= 1) {
+                 if (has_dst)
+                         append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a);
++                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux);
+                 append(disasm, "%s",
+-                       v3d_qpu_unpack_name(instr->alu.mul.a_unpack));
++                       v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
+         }
+ 
+         if (num_src >= 2) {
+                 append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b);
++                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux);
+                 append(disasm, "%s",
+-                       v3d_qpu_unpack_name(instr->alu.mul.b_unpack));
++                       v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
+         }
+ }
+ 
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index 7759fb0efdf..7ece8b5e570 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -926,10 +926,10 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
+         int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
+         int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+ 
+-        return ((add_nsrc > 0 && inst->alu.add.a == mux) ||
+-                (add_nsrc > 1 && inst->alu.add.b == mux) ||
+-                (mul_nsrc > 0 && inst->alu.mul.a == mux) ||
+-                (mul_nsrc > 1 && inst->alu.mul.b == mux));
++        return ((add_nsrc > 0 && inst->alu.add.a.mux == mux) ||
++                (add_nsrc > 1 && inst->alu.add.b.mux == mux) ||
++                (mul_nsrc > 0 && inst->alu.mul.a.mux == mux) ||
++                (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
+ }
+ 
+ bool
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 13b3f37d43f..53a51bfb3e1 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -294,25 +294,26 @@ enum v3d_qpu_mux {
+         V3D_QPU_MUX_B,
+ };
+ 
++struct v3d_qpu_input {
++        enum v3d_qpu_mux mux;
++        enum v3d_qpu_input_unpack unpack;
++};
++
+ struct v3d_qpu_alu_instr {
+         struct {
+                 enum v3d_qpu_add_op op;
+-                enum v3d_qpu_mux a, b;
++                struct v3d_qpu_input a, b;
+                 uint8_t waddr;
+                 bool magic_write;
+                 enum v3d_qpu_output_pack output_pack;
+-                enum v3d_qpu_input_unpack a_unpack;
+-                enum v3d_qpu_input_unpack b_unpack;
+         } add;
+ 
+         struct {
+                 enum v3d_qpu_mul_op op;
+-                enum v3d_qpu_mux a, b;
++                struct v3d_qpu_input a, b;
+                 uint8_t waddr;
+                 bool magic_write;
+                 enum v3d_qpu_output_pack output_pack;
+-                enum v3d_qpu_input_unpack a_unpack;
+-                enum v3d_qpu_input_unpack b_unpack;
+         } mul;
+ };
+ 
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 2820d9d4c56..6e975793fc0 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -853,12 +853,12 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                         instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ 
+                 if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+-                                                   &instr->alu.add.a_unpack)) {
++                                                   &instr->alu.add.a.unpack)) {
+                         return false;
+                 }
+ 
+                 if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+-                                                   &instr->alu.add.b_unpack)) {
++                                                   &instr->alu.add.b.unpack)) {
+                         return false;
+                 }
+                 break;
+@@ -872,7 +872,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                 instr->alu.add.output_pack = mux_b & 0x3;
+ 
+                 if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+-                                                   &instr->alu.add.a_unpack)) {
++                                                   &instr->alu.add.a.unpack)) {
+                         return false;
+                 }
+                 break;
+@@ -884,7 +884,7 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                 instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+ 
+                 if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+-                                                   &instr->alu.add.a_unpack)) {
++                                                   &instr->alu.add.a.unpack)) {
+                         return false;
+                 }
+                 break;
+@@ -892,23 +892,23 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+         case V3D_QPU_A_VFMIN:
+         case V3D_QPU_A_VFMAX:
+                 if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
+-                                                   &instr->alu.add.a_unpack)) {
++                                                   &instr->alu.add.a.unpack)) {
+                         return false;
+                 }
+ 
+                 instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+-                instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+                 break;
+ 
+         default:
+                 instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+-                instr->alu.add.a_unpack = V3D_QPU_UNPACK_NONE;
+-                instr->alu.add.b_unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+                 break;
+         }
+ 
+-        instr->alu.add.a = mux_a;
+-        instr->alu.add.b = mux_b;
++        instr->alu.add.a.mux = mux_a;
++        instr->alu.add.b.mux = mux_b;
+         instr->alu.add.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
+ 
+         instr->alu.add.magic_write = false;
+@@ -956,12 +956,12 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                 instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
+ 
+                 if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
+-                                                   &instr->alu.mul.a_unpack)) {
++                                                   &instr->alu.mul.a.unpack)) {
+                         return false;
+                 }
+ 
+                 if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
+-                                                   &instr->alu.mul.b_unpack)) {
++                                                   &instr->alu.mul.b.unpack)) {
+                         return false;
+                 }
+ 
+@@ -972,7 +972,7 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                                               ((mux_b >> 2) & 1));
+ 
+                 if (!v3d_qpu_float32_unpack_unpack(mux_b & 0x3,
+-                                                   &instr->alu.mul.a_unpack)) {
++                                                   &instr->alu.mul.a.unpack)) {
+                         return false;
+                 }
+ 
+@@ -982,23 +982,23 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                 instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+ 
+                 if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
+-                                                   &instr->alu.mul.a_unpack)) {
++                                                   &instr->alu.mul.a.unpack)) {
+                         return false;
+                 }
+ 
+-                instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+ 
+                 break;
+ 
+         default:
+                 instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+-                instr->alu.mul.a_unpack = V3D_QPU_UNPACK_NONE;
+-                instr->alu.mul.b_unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
+                 break;
+         }
+ 
+-        instr->alu.mul.a = mux_a;
+-        instr->alu.mul.b = mux_b;
++        instr->alu.mul.a.mux = mux_a;
++        instr->alu.mul.b.mux = mux_b;
+         instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
+         instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
+ 
+@@ -1030,8 +1030,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+                  const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+ {
+         uint32_t waddr = instr->alu.add.waddr;
+-        uint32_t mux_a = instr->alu.add.a;
+-        uint32_t mux_b = instr->alu.add.b;
++        uint32_t mux_a = instr->alu.add.a.mux;
++        uint32_t mux_b = instr->alu.add.b.mux;
+         int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
+         const struct opcode_desc *desc =
+                 lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
+@@ -1102,12 +1102,12 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 }
+                 opcode |= output_pack << 4;
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+                                                  &a_unpack)) {
+                         return false;
+                 }
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+                                                  &b_unpack)) {
+                         return false;
+                 }
+@@ -1141,17 +1141,17 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 uint32_t a_unpack;
+                 uint32_t b_unpack;
+ 
+-                if (instr->alu.add.a_unpack == V3D_QPU_UNPACK_ABS ||
+-                    instr->alu.add.b_unpack == V3D_QPU_UNPACK_ABS) {
++                if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
++                    instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
+                         return false;
+                 }
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+                                                  &a_unpack)) {
+                         return false;
+                 }
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
+                                                  &b_unpack)) {
+                         return false;
+                 }
+@@ -1176,7 +1176,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 }
+                 mux_b |= packed;
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+@@ -1194,7 +1194,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+                         return false;
+ 
+                 uint32_t packed;
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+@@ -1207,11 +1207,11 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+         case V3D_QPU_A_VFMIN:
+         case V3D_QPU_A_VFMAX:
+                 if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+-                    instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE) {
++                    instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
+                         return false;
+                 }
+ 
+-                if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a_unpack,
++                if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+@@ -1221,8 +1221,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+         default:
+                 if (instr->alu.add.op != V3D_QPU_A_NOP &&
+                     (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+-                     instr->alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
+-                     instr->alu.add.b_unpack != V3D_QPU_UNPACK_NONE)) {
++                     instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
++                     instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
+                         return false;
+                 }
+                 break;
+@@ -1242,8 +1242,8 @@ static bool
+ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+                  const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+ {
+-        uint32_t mux_a = instr->alu.mul.a;
+-        uint32_t mux_b = instr->alu.mul.b;
++        uint32_t mux_a = instr->alu.mul.a.mux;
++        uint32_t mux_b = instr->alu.mul.b.mux;
+         int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
+ 
+         const struct opcode_desc *desc =
+@@ -1277,13 +1277,13 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+                  */
+                 opcode += packed << 4;
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+                 opcode |= packed << 2;
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+@@ -1301,7 +1301,7 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+                 opcode |= (packed >> 1) & 1;
+                 mux_b = (packed & 1) << 2;
+ 
+-                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a_unpack,
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+@@ -1315,16 +1315,16 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+                 if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
+                         return false;
+ 
+-                if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a_unpack,
++                if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+-                if (instr->alu.mul.a_unpack == V3D_QPU_UNPACK_SWAP_16)
++                if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
+                         opcode = 8;
+                 else
+                         opcode |= (packed + 4) & 7;
+ 
+-                if (instr->alu.mul.b_unpack != V3D_QPU_UNPACK_NONE)
++                if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
+                         return false;
+ 
+                 break;
+diff --git a/src/broadcom/qpu/tests/qpu_disasm.c b/src/broadcom/qpu/tests/qpu_disasm.c
+index 2f8e19c73fe..be7b78d5ef0 100644
+--- a/src/broadcom/qpu/tests/qpu_disasm.c
++++ b/src/broadcom/qpu/tests/qpu_disasm.c
+@@ -160,10 +160,10 @@ main(int argc, char **argv)
+                                 /* Swap the operands to be sure that we test
+                                  * how the QPUs distinguish between these ops.
+                                  */
+-                                swap_mux(&instr.alu.add.a,
+-                                         &instr.alu.add.b);
+-                                swap_pack(&instr.alu.add.a_unpack,
+-                                          &instr.alu.add.b_unpack);
++                                swap_mux(&instr.alu.add.a.mux,
++                                         &instr.alu.add.b.mux);
++                                swap_pack(&instr.alu.add.a.unpack,
++                                          &instr.alu.add.b.unpack);
+                                 break;
+                         default:
+                                 break;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0012-broadcom-qpu-add-raddr-on-v3d_qpu_input.patch b/projects/RPi/devices/RPi5/patches/mesa/0012-broadcom-qpu-add-raddr-on-v3d_qpu_input.patch
new file mode 100644
index 0000000000..9c2303f4e4
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0012-broadcom-qpu-add-raddr-on-v3d_qpu_input.patch
@@ -0,0 +1,45 @@
+From 52ea09792ff8a438ccdecac47b8415657be90098 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 6 Aug 2021 01:33:32 +0200
+Subject: [PATCH 012/142] broadcom/qpu: add raddr on v3d_qpu_input
+
+On V3D 7.x muxes are not used; raddr_a/b/c/d are used instead.
+
+This is not perfect: for v71, the raddr_a/b fields defined in
+qpu_instr become superfluous. But the alternative would be to
+define two different structs, or even to define them based on version
+ifdefs, so this is a reasonable compromise.
+---
+ src/broadcom/qpu/qpu_instr.h | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 53a51bfb3e1..9e56e2d6a99 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -295,7 +295,10 @@ enum v3d_qpu_mux {
+ };
+ 
+ struct v3d_qpu_input {
+-        enum v3d_qpu_mux mux;
++        union {
++                enum v3d_qpu_mux mux; /* V3D 4.x */
++                uint8_t raddr; /* V3D 7.x */
++        };
+         enum v3d_qpu_input_unpack unpack;
+ };
+ 
+@@ -385,8 +388,8 @@ struct v3d_qpu_instr {
+         struct v3d_qpu_sig sig;
+         uint8_t sig_addr;
+         bool sig_magic; /* If the signal writes to a magic address */
+-        uint8_t raddr_a;
+-        uint8_t raddr_b;
++        uint8_t raddr_a; /* V3D 4.x */
++        uint8_t raddr_b; /* V3D 4.x*/
+         struct v3d_qpu_flags flags;
+ 
+         union {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0013-broadcom-qpu-defining-shift-mask-for-raddr_c-d.patch b/projects/RPi/devices/RPi5/patches/mesa/0013-broadcom-qpu-defining-shift-mask-for-raddr_c-d.patch
new file mode 100644
index 0000000000..162529e963
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0013-broadcom-qpu-defining-shift-mask-for-raddr_c-d.patch
@@ -0,0 +1,37 @@
+From 3e5ad0881c2789619cdf65f40a44d5481e28e800 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 12 Aug 2021 02:24:02 +0200
+Subject: [PATCH 013/142] broadcom/qpu: defining shift/mask for raddr_c/d
+
+On V3D 7.x it replaces mul_a/b and add_a/b
+---
+ src/broadcom/qpu/qpu_pack.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 6e975793fc0..4f106909729 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -84,6 +84,9 @@
+ #define V3D_QPU_MUL_A_SHIFT                 18
+ #define V3D_QPU_MUL_A_MASK                  QPU_MASK(20, 18)
+ 
++#define V3D_QPU_RADDR_C_SHIFT               18
++#define V3D_QPU_RADDR_C_MASK                QPU_MASK(23, 18)
++
+ #define V3D_QPU_ADD_B_SHIFT                 15
+ #define V3D_QPU_ADD_B_MASK                  QPU_MASK(17, 15)
+ 
+@@ -98,6 +101,9 @@
+ #define V3D_QPU_BRANCH_BDI_SHIFT            12
+ #define V3D_QPU_BRANCH_BDI_MASK             QPU_MASK(13, 12)
+ 
++#define V3D_QPU_RADDR_D_SHIFT               12
++#define V3D_QPU_RADDR_D_MASK                QPU_MASK(17, 12)
++
+ #define V3D_QPU_RADDR_A_SHIFT               6
+ #define V3D_QPU_RADDR_A_MASK                QPU_MASK(11, 6)
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0014-broadcom-commmon-add-has_accumulators-field-on-v3d_d.patch b/projects/RPi/devices/RPi5/patches/mesa/0014-broadcom-commmon-add-has_accumulators-field-on-v3d_d.patch
new file mode 100644
index 0000000000..1855816d95
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0014-broadcom-commmon-add-has_accumulators-field-on-v3d_d.patch
@@ -0,0 +1,46 @@
+From 81febf14fe05ad26e992275b911e8bc1e1416ebc Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 17 Sep 2021 01:04:31 +0200
+Subject: [PATCH 014/142] broadcom/commmon: add has_accumulators field on
+ v3d_device_info
+
+Even though we could just check the version in the code, checking
+this field makes several places more readable. For example, in the
+register allocation code we don't assign an accumulator because the
+hw has no accumulators, rather than because the hw version is a
+given one.
+---
+ src/broadcom/common/v3d_device_info.c | 2 ++
+ src/broadcom/common/v3d_device_info.h | 3 +++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/src/broadcom/common/v3d_device_info.c b/src/broadcom/common/v3d_device_info.c
+index 7512fe3a06b..7bc2b662cfc 100644
+--- a/src/broadcom/common/v3d_device_info.c
++++ b/src/broadcom/common/v3d_device_info.c
+@@ -65,6 +65,8 @@ v3d_get_device_info(int fd, struct v3d_device_info* devinfo, v3d_ioctl_fun drm_i
+     int qups = (ident1.value >> 8) & 0xf;
+     devinfo->qpu_count = nslc * qups;
+ 
++    devinfo->has_accumulators = devinfo->ver < 71;
++
+     switch (devinfo->ver) {
+         case 33:
+         case 41:
+diff --git a/src/broadcom/common/v3d_device_info.h b/src/broadcom/common/v3d_device_info.h
+index 32cb65cf81f..8dfc7858727 100644
+--- a/src/broadcom/common/v3d_device_info.h
++++ b/src/broadcom/common/v3d_device_info.h
+@@ -42,6 +42,9 @@ struct v3d_device_info {
+ 
+         /* NSLC * QUPS from the core's IDENT registers. */
+         int qpu_count;
++
++        /* If the hw has accumulator registers */
++        bool has_accumulators;
+ };
+ 
+ typedef int (*v3d_ioctl_fun)(int fd, unsigned long request, void *arg);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0015-broadcom-qpu-add-qpu_writes_rf0_implicitly-helper.patch b/projects/RPi/devices/RPi5/patches/mesa/0015-broadcom-qpu-add-qpu_writes_rf0_implicitly-helper.patch
new file mode 100644
index 0000000000..8bd646ac94
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0015-broadcom-qpu-add-qpu_writes_rf0_implicitly-helper.patch
@@ -0,0 +1,52 @@
+From 7d42eca87b6e144697810405308d99d200dca62a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 15 Sep 2021 10:56:43 +0200
+Subject: [PATCH 015/142] broadcom/qpu: add qpu_writes_rf0_implicitly helper
+
+On v71 rf0 replaces r5 as the register that gets updated implicitly
+with uniform loads, and gets the C coefficient with ldvary. This
+helper returns whether rf0 gets implicitly updated.
+---
+ src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++
+ src/broadcom/qpu/qpu_instr.h |  2 ++
+ 2 files changed, 14 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index 7ece8b5e570..8de99c611d5 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -920,6 +920,18 @@ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
+         return false;
+ }
+ 
++bool
++v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
++                              const struct v3d_qpu_instr *inst)
++{
++        if (devinfo->ver >= 71 &&
++            (inst->sig.ldvary || inst->sig.ldunif || inst->sig.ldunifa)) {
++                return true;
++        }
++
++        return false;
++}
++
+ bool
+ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
+ {
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 9e56e2d6a99..a25be8e0ee6 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -473,6 +473,8 @@ bool v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
+                        const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+ bool v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
+                        const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
++bool v3d_qpu_writes_rf0_implicitly(const struct v3d_device_info *devinfo,
++                                   const struct v3d_qpu_instr *instr) ATTRIBUTE_CONST;
+ bool v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
+                           const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ bool v3d_qpu_waits_on_tmu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0016-broadcom-qpu-add-pack-unpack-support-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0016-broadcom-qpu-add-pack-unpack-support-for-v71.patch
new file mode 100644
index 0000000000..8afa579075
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0016-broadcom-qpu-add-pack-unpack-support-for-v71.patch
@@ -0,0 +1,1258 @@
+From f0859613bd59e14fb21571e7978bb5c5d5e9c6d7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Sat, 7 Aug 2021 02:20:39 +0200
+Subject: [PATCH 016/142] broadcom/qpu: add pack/unpack support for v71
+
+Note that we provide new v71 alu pack/unpack methods. As a lot of the
+logic is equivalent, initially we tried to use the existing methods as
+a template and add version checks to them. At some early point that
+became really unreadable, so it became better to just provide new
+methods, even if the v42 and v71 methods have a really similar
+structure.
+
+Note that we have split the op tables and created two new ones
+(add/mul) for v71. As the description struct includes versioning info,
+we could have just used one table. But, especially with the add table,
+there are a lot of differences with v71, so it is slightly tidier this
+way. Also, taking into account that we do a linear search on the
+tables, this can even be justified by performance.
+---
+ src/broadcom/qpu/qpu_pack.c | 1049 ++++++++++++++++++++++++++++++-----
+ 1 file changed, 904 insertions(+), 145 deletions(-)
+
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 4f106909729..4045275cb9a 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -490,16 +490,26 @@ v3d_qpu_flags_pack(const struct v3d_device_info *devinfo,
+ 
+ /* Make a mapping of the table of opcodes in the spec.  The opcode is
+  * determined by a combination of the opcode field, and in the case of 0 or
+- * 1-arg opcodes, the mux_b field as well.
++ * 1-arg opcodes, the mux (version <= 42) or raddr (version >= 71) field as
++ * well.
+  */
+-#define MUX_MASK(bot, top) (((1 << (top + 1)) - 1) - ((1 << (bot)) - 1))
+-#define ANYMUX MUX_MASK(0, 7)
++#define OP_MASK(val) BITFIELD64_BIT(val)
++#define OP_RANGE(bot, top) BITFIELD64_RANGE(bot, top - bot + 1)
++#define ANYMUX OP_RANGE(0, 7)
++#define ANYOPMASK OP_RANGE(0, 63)
+ 
+ struct opcode_desc {
+         uint8_t opcode_first;
+         uint8_t opcode_last;
+-        uint8_t mux_b_mask;
+-        uint8_t mux_a_mask;
++
++        union {
++                struct {
++                        uint8_t b_mask;
++                        uint8_t a_mask;
++                } mux;
++                uint64_t raddr_mask;
++        };
++
+         uint8_t op;
+ 
+         /* first_ver == 0 if it's the same across all V3D versions.
+@@ -512,122 +522,288 @@ struct opcode_desc {
+         uint8_t last_ver;
+ };
+ 
+-static const struct opcode_desc add_ops[] = {
++static const struct opcode_desc add_ops_v33[] = {
+         /* FADD is FADDNF depending on the order of the mux_a/mux_b. */
+-        { 0,   47,  ANYMUX, ANYMUX, V3D_QPU_A_FADD },
+-        { 0,   47,  ANYMUX, ANYMUX, V3D_QPU_A_FADDNF },
+-        { 53,  55,  ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
+-        { 56,  56,  ANYMUX, ANYMUX, V3D_QPU_A_ADD },
+-        { 57,  59,  ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
+-        { 60,  60,  ANYMUX, ANYMUX, V3D_QPU_A_SUB },
+-        { 61,  63,  ANYMUX, ANYMUX, V3D_QPU_A_VFPACK },
+-        { 64,  111, ANYMUX, ANYMUX, V3D_QPU_A_FSUB },
+-        { 120, 120, ANYMUX, ANYMUX, V3D_QPU_A_MIN },
+-        { 121, 121, ANYMUX, ANYMUX, V3D_QPU_A_MAX },
+-        { 122, 122, ANYMUX, ANYMUX, V3D_QPU_A_UMIN },
+-        { 123, 123, ANYMUX, ANYMUX, V3D_QPU_A_UMAX },
+-        { 124, 124, ANYMUX, ANYMUX, V3D_QPU_A_SHL },
+-        { 125, 125, ANYMUX, ANYMUX, V3D_QPU_A_SHR },
+-        { 126, 126, ANYMUX, ANYMUX, V3D_QPU_A_ASR },
+-        { 127, 127, ANYMUX, ANYMUX, V3D_QPU_A_ROR },
++        { 0,   47,  .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADD },
++        { 0,   47,  .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FADDNF },
++        { 53,  55,  .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
++        { 56,  56,  .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ADD },
++        { 57,  59,  .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
++        { 60,  60,  .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SUB },
++        { 61,  63,  .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFPACK },
++        { 64,  111, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FSUB },
++        { 120, 120, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MIN },
++        { 121, 121, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_MAX },
++        { 122, 122, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMIN },
++        { 123, 123, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_UMAX },
++        { 124, 124, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHL },
++        { 125, 125, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_SHR },
++        { 126, 126, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ASR },
++        { 127, 127, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_ROR },
+         /* FMIN is instead FMAX depending on the order of the mux_a/mux_b. */
+-        { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMIN },
+-        { 128, 175, ANYMUX, ANYMUX, V3D_QPU_A_FMAX },
+-        { 176, 180, ANYMUX, ANYMUX, V3D_QPU_A_VFMIN },
+-
+-        { 181, 181, ANYMUX, ANYMUX, V3D_QPU_A_AND },
+-        { 182, 182, ANYMUX, ANYMUX, V3D_QPU_A_OR },
+-        { 183, 183, ANYMUX, ANYMUX, V3D_QPU_A_XOR },
+-
+-        { 184, 184, ANYMUX, ANYMUX, V3D_QPU_A_VADD },
+-        { 185, 185, ANYMUX, ANYMUX, V3D_QPU_A_VSUB },
+-        { 186, 186, 1 << 0, ANYMUX, V3D_QPU_A_NOT },
+-        { 186, 186, 1 << 1, ANYMUX, V3D_QPU_A_NEG },
+-        { 186, 186, 1 << 2, ANYMUX, V3D_QPU_A_FLAPUSH },
+-        { 186, 186, 1 << 3, ANYMUX, V3D_QPU_A_FLBPUSH },
+-        { 186, 186, 1 << 4, ANYMUX, V3D_QPU_A_FLPOP },
+-        { 186, 186, 1 << 5, ANYMUX, V3D_QPU_A_RECIP },
+-        { 186, 186, 1 << 6, ANYMUX, V3D_QPU_A_SETMSF },
+-        { 186, 186, 1 << 7, ANYMUX, V3D_QPU_A_SETREVF },
+-        { 187, 187, 1 << 0, 1 << 0, V3D_QPU_A_NOP, 0 },
+-        { 187, 187, 1 << 0, 1 << 1, V3D_QPU_A_TIDX },
+-        { 187, 187, 1 << 0, 1 << 2, V3D_QPU_A_EIDX },
+-        { 187, 187, 1 << 0, 1 << 3, V3D_QPU_A_LR },
+-        { 187, 187, 1 << 0, 1 << 4, V3D_QPU_A_VFLA },
+-        { 187, 187, 1 << 0, 1 << 5, V3D_QPU_A_VFLNA },
+-        { 187, 187, 1 << 0, 1 << 6, V3D_QPU_A_VFLB },
+-        { 187, 187, 1 << 0, 1 << 7, V3D_QPU_A_VFLNB },
+-
+-        { 187, 187, 1 << 1, MUX_MASK(0, 2), V3D_QPU_A_FXCD },
+-        { 187, 187, 1 << 1, 1 << 3, V3D_QPU_A_XCD },
+-        { 187, 187, 1 << 1, MUX_MASK(4, 6), V3D_QPU_A_FYCD },
+-        { 187, 187, 1 << 1, 1 << 7, V3D_QPU_A_YCD },
+-
+-        { 187, 187, 1 << 2, 1 << 0, V3D_QPU_A_MSF },
+-        { 187, 187, 1 << 2, 1 << 1, V3D_QPU_A_REVF },
+-        { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_VDWWT, 33 },
+-        { 187, 187, 1 << 2, 1 << 2, V3D_QPU_A_IID, 40 },
+-        { 187, 187, 1 << 2, 1 << 3, V3D_QPU_A_SAMPID, 40 },
+-        { 187, 187, 1 << 2, 1 << 4, V3D_QPU_A_BARRIERID, 40 },
+-        { 187, 187, 1 << 2, 1 << 5, V3D_QPU_A_TMUWT },
+-        { 187, 187, 1 << 2, 1 << 6, V3D_QPU_A_VPMWT },
+-        { 187, 187, 1 << 2, 1 << 7, V3D_QPU_A_FLAFIRST, 41 },
+-        { 187, 187, 1 << 3, 1 << 0, V3D_QPU_A_FLNAFIRST, 41 },
+-        { 187, 187, 1 << 3, ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
+-
+-        { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
+-        { 188, 188, 1 << 0, ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
+-        { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
+-        { 188, 188, 1 << 1, ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
+-        { 188, 188, 1 << 2, ANYMUX, V3D_QPU_A_LDVPMP, 40 },
+-        { 188, 188, 1 << 3, ANYMUX, V3D_QPU_A_RSQRT, 41 },
+-        { 188, 188, 1 << 4, ANYMUX, V3D_QPU_A_EXP, 41 },
+-        { 188, 188, 1 << 5, ANYMUX, V3D_QPU_A_LOG, 41 },
+-        { 188, 188, 1 << 6, ANYMUX, V3D_QPU_A_SIN, 41 },
+-        { 188, 188, 1 << 7, ANYMUX, V3D_QPU_A_RSQRT2, 41 },
+-        { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
+-        { 189, 189, ANYMUX, ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
++        { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMIN },
++        { 128, 175, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FMAX },
++        { 176, 180, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMIN },
++
++        { 181, 181, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_AND },
++        { 182, 182, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_OR },
++        { 183, 183, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_XOR },
++
++        { 184, 184, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VADD },
++        { 185, 185, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VSUB },
++        { 186, 186, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_NOT },
++        { 186, 186, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_NEG },
++        { 186, 186, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_FLAPUSH },
++        { 186, 186, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FLBPUSH },
++        { 186, 186, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_FLPOP },
++        { 186, 186, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_RECIP },
++        { 186, 186, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SETMSF },
++        { 186, 186, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_SETREVF },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(1), V3D_QPU_A_TIDX },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(2), V3D_QPU_A_EIDX },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(3), V3D_QPU_A_LR },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(4), V3D_QPU_A_VFLA },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VFLB },
++        { 187, 187, .mux.b_mask = OP_MASK(0), .mux.a_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
++
++        { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(0, 2), V3D_QPU_A_FXCD },
++        { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(3), V3D_QPU_A_XCD },
++        { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_RANGE(4, 6), V3D_QPU_A_FYCD },
++        { 187, 187, .mux.b_mask = OP_MASK(1), .mux.a_mask = OP_MASK(7), V3D_QPU_A_YCD },
++
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(0), V3D_QPU_A_MSF },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(1), V3D_QPU_A_REVF },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_VDWWT, 33 },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(2), V3D_QPU_A_IID, 40 },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(3), V3D_QPU_A_SAMPID, 40 },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(4), V3D_QPU_A_BARRIERID, 40 },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(5), V3D_QPU_A_TMUWT },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(6), V3D_QPU_A_VPMWT },
++        { 187, 187, .mux.b_mask = OP_MASK(2), .mux.a_mask = OP_MASK(7), V3D_QPU_A_FLAFIRST, 41 },
++        { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = OP_MASK(0), V3D_QPU_A_FLNAFIRST, 41 },
++        { 187, 187, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_VPMSETUP, 33 },
++
++        { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_IN, 40 },
++        { 188, 188, .mux.b_mask = OP_MASK(0), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMV_OUT, 40 },
++        { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_IN, 40 },
++        { 188, 188, .mux.b_mask = OP_MASK(1), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMD_OUT, 40 },
++        { 188, 188, .mux.b_mask = OP_MASK(2), .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMP, 40 },
++        { 188, 188, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT, 41 },
++        { 188, 188, .mux.b_mask = OP_MASK(4), .mux.a_mask = ANYMUX, V3D_QPU_A_EXP, 41 },
++        { 188, 188, .mux.b_mask = OP_MASK(5), .mux.a_mask = ANYMUX, V3D_QPU_A_LOG, 41 },
++        { 188, 188, .mux.b_mask = OP_MASK(6), .mux.a_mask = ANYMUX, V3D_QPU_A_SIN, 41 },
++        { 188, 188, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_RSQRT2, 41 },
++        { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_IN, 40 },
++        { 189, 189, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_LDVPMG_OUT, 40 },
+ 
+         /* FIXME: MORE COMPLICATED */
+-        /* { 190, 191, ANYMUX, ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
++        /* { 190, 191, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMOVABSNEGNAB }, */
+ 
+-        { 192, 239, ANYMUX, ANYMUX, V3D_QPU_A_FCMP },
+-        { 240, 244, ANYMUX, ANYMUX, V3D_QPU_A_VFMAX },
++        { 192, 239, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_FCMP },
++        { 240, 244, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_VFMAX },
+ 
+-        { 245, 245, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FROUND },
+-        { 245, 245, 1 << 3, ANYMUX, V3D_QPU_A_FTOIN },
+-        { 245, 245, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FTRUNC },
+-        { 245, 245, 1 << 7, ANYMUX, V3D_QPU_A_FTOIZ },
+-        { 246, 246, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FFLOOR },
+-        { 246, 246, 1 << 3, ANYMUX, V3D_QPU_A_FTOUZ },
+-        { 246, 246, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FCEIL },
+-        { 246, 246, 1 << 7, ANYMUX, V3D_QPU_A_FTOC },
++        { 245, 245, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FROUND },
++        { 245, 245, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIN },
++        { 245, 245, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FTRUNC },
++        { 245, 245, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOIZ },
++        { 246, 246, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FFLOOR },
++        { 246, 246, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOUZ },
++        { 246, 246, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FCEIL },
++        { 246, 246, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_A_FTOC },
+ 
+-        { 247, 247, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_FDX },
+-        { 247, 247, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_FDY },
++        { 247, 247, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_FDX },
++        { 247, 247, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_FDY },
+ 
+         /* The stvpms are distinguished by the waddr field. */
+-        { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMV },
+-        { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMD },
+-        { 248, 248, ANYMUX, ANYMUX, V3D_QPU_A_STVPMP },
++        { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMV },
++        { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMD },
++        { 248, 248, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_A_STVPMP },
++
++        { 252, 252, .mux.b_mask = OP_RANGE(0, 2), .mux.a_mask = ANYMUX, V3D_QPU_A_ITOF },
++        { 252, 252, .mux.b_mask = OP_MASK(3), .mux.a_mask = ANYMUX, V3D_QPU_A_CLZ },
++        { 252, 252, .mux.b_mask = OP_RANGE(4, 6), .mux.a_mask = ANYMUX, V3D_QPU_A_UTOF },
++};
++
++static const struct opcode_desc mul_ops_v33[] = {
++        { 1, 1, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_ADD },
++        { 2, 2, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SUB },
++        { 3, 3, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_UMUL24 },
++        { 4, 8, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_VFMUL },
++        { 9, 9, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_SMUL24 },
++        { 10, 10, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_MULTOP },
++        { 14, 14, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMOV, 33, 42 },
++        { 15, 15, .mux.b_mask = OP_RANGE(0, 3), ANYMUX, V3D_QPU_M_FMOV, 33, 42},
++        { 15, 15, .mux.b_mask = OP_MASK(4), .mux.a_mask = OP_MASK(0), V3D_QPU_M_NOP, 33, 42 },
++        { 15, 15, .mux.b_mask = OP_MASK(7), .mux.a_mask = ANYMUX, V3D_QPU_M_MOV, 33, 42 },
++
++        { 16, 63, .mux.b_mask = ANYMUX, .mux.a_mask = ANYMUX, V3D_QPU_M_FMUL },
++};
++
++/* Note that it would have been possible to define all the add/mul opcodes in
++ * just one table, using the first_ver/last_ver. But taking into account that
++ * for v71 there were a lot of changes, it was more tidy this way. Also right
++ * now we are doing a linear search on those tables, so this maintains the
++ * tables smaller.
++ *
++ * Just in case we merge the tables, we define the first_ver as 71 for those
++ * opcodes that changed on v71
++ */
++static const struct opcode_desc add_ops_v71[] = {
++        { 0,   47,  .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
++        { 53,  55,  .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
++        { 56,  56,  .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
++        { 57,  59,  .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
++        { 60,  60,  .raddr_mask = ANYOPMASK, V3D_QPU_A_SUB },
++        { 61,  63,  .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
++        { 64,  111, .raddr_mask = ANYOPMASK, V3D_QPU_A_FSUB },
++        { 120, 120, .raddr_mask = ANYOPMASK, V3D_QPU_A_MIN },
++        { 121, 121, .raddr_mask = ANYOPMASK, V3D_QPU_A_MAX },
++        { 122, 122, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMIN },
++        { 123, 123, .raddr_mask = ANYOPMASK, V3D_QPU_A_UMAX },
++        { 124, 124, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHL },
++        { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
++        { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
++        { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
++
++        { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
++        { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
++        { 183, 183, .raddr_mask = ANYOPMASK, V3D_QPU_A_XOR },
++        { 184, 184, .raddr_mask = ANYOPMASK, V3D_QPU_A_VADD },
++        { 185, 185, .raddr_mask = ANYOPMASK, V3D_QPU_A_VSUB },
++
++        { 186, 186, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOT },
++        { 186, 186, .raddr_mask = OP_MASK(1), V3D_QPU_A_NEG },
++        { 186, 186, .raddr_mask = OP_MASK(2), V3D_QPU_A_FLAPUSH },
++        { 186, 186, .raddr_mask = OP_MASK(3), V3D_QPU_A_FLBPUSH },
++        { 186, 186, .raddr_mask = OP_MASK(4), V3D_QPU_A_FLPOP },
++        { 186, 186, .raddr_mask = OP_MASK(5), V3D_QPU_A_CLZ },
++        { 186, 186, .raddr_mask = OP_MASK(6), V3D_QPU_A_SETMSF },
++        { 186, 186, .raddr_mask = OP_MASK(7), V3D_QPU_A_SETREVF },
++
++        { 187, 187, .raddr_mask = OP_MASK(0), V3D_QPU_A_NOP, 0 },
++        { 187, 187, .raddr_mask = OP_MASK(1), V3D_QPU_A_TIDX },
++        { 187, 187, .raddr_mask = OP_MASK(2), V3D_QPU_A_EIDX },
++        { 187, 187, .raddr_mask = OP_MASK(3), V3D_QPU_A_LR },
++        { 187, 187, .raddr_mask = OP_MASK(4), V3D_QPU_A_VFLA },
++        { 187, 187, .raddr_mask = OP_MASK(5), V3D_QPU_A_VFLNA },
++        { 187, 187, .raddr_mask = OP_MASK(6), V3D_QPU_A_VFLB },
++        { 187, 187, .raddr_mask = OP_MASK(7), V3D_QPU_A_VFLNB },
++        { 187, 187, .raddr_mask = OP_MASK(8), V3D_QPU_A_XCD },
++        { 187, 187, .raddr_mask = OP_MASK(9), V3D_QPU_A_YCD },
++        { 187, 187, .raddr_mask = OP_MASK(10), V3D_QPU_A_MSF },
++        { 187, 187, .raddr_mask = OP_MASK(11), V3D_QPU_A_REVF },
++        { 187, 187, .raddr_mask = OP_MASK(12), V3D_QPU_A_IID },
++        { 187, 187, .raddr_mask = OP_MASK(13), V3D_QPU_A_SAMPID },
++        { 187, 187, .raddr_mask = OP_MASK(14), V3D_QPU_A_BARRIERID },
++        { 187, 187, .raddr_mask = OP_MASK(15), V3D_QPU_A_TMUWT },
++        { 187, 187, .raddr_mask = OP_MASK(16), V3D_QPU_A_VPMWT },
++        { 187, 187, .raddr_mask = OP_MASK(17), V3D_QPU_A_FLAFIRST },
++        { 187, 187, .raddr_mask = OP_MASK(18), V3D_QPU_A_FLNAFIRST },
++
++        { 187, 187, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FXCD },
++        { 187, 187, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FYCD },
++
++        { 188, 188, .raddr_mask = OP_MASK(0), V3D_QPU_A_LDVPMV_IN, 71 },
++        { 188, 188, .raddr_mask = OP_MASK(1), V3D_QPU_A_LDVPMD_IN, 71 },
++        { 188, 188, .raddr_mask = OP_MASK(2), V3D_QPU_A_LDVPMP, 71 },
++
++        { 188, 188, .raddr_mask = OP_MASK(32), V3D_QPU_A_RECIP, 71 },
++        { 188, 188, .raddr_mask = OP_MASK(33), V3D_QPU_A_RSQRT, 71 },
++        { 188, 188, .raddr_mask = OP_MASK(34), V3D_QPU_A_EXP, 71 },
++        { 188, 188, .raddr_mask = OP_MASK(35), V3D_QPU_A_LOG, 71 },
++        { 188, 188, .raddr_mask = OP_MASK(36), V3D_QPU_A_SIN, 71 },
++        { 188, 188, .raddr_mask = OP_MASK(37), V3D_QPU_A_RSQRT2, 71 },
++
++        { 189, 189, .raddr_mask = ANYOPMASK, V3D_QPU_A_LDVPMG_IN, 71 },
+ 
+-        { 252, 252, MUX_MASK(0, 2), ANYMUX, V3D_QPU_A_ITOF },
+-        { 252, 252, 1 << 3, ANYMUX, V3D_QPU_A_CLZ },
+-        { 252, 252, MUX_MASK(4, 6), ANYMUX, V3D_QPU_A_UTOF },
++        /* The stvpms are distinguished by the waddr field. */
++        { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMV, 71},
++        { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMD, 71},
++        { 190, 190, .raddr_mask = ANYOPMASK, V3D_QPU_A_STVPMP, 71},
++
++        { 192, 207, .raddr_mask = ANYOPMASK, V3D_QPU_A_FCMP, 71 },
++
++        { 245, 245, .raddr_mask = OP_RANGE(0, 2),   V3D_QPU_A_FROUND, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(4, 6),   V3D_QPU_A_FROUND, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(8, 10),  V3D_QPU_A_FROUND, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FROUND, 71 },
++
++        { 245, 245, .raddr_mask = OP_MASK(3),  V3D_QPU_A_FTOIN, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(7),  V3D_QPU_A_FTOIN, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(11), V3D_QPU_A_FTOIN, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(15), V3D_QPU_A_FTOIN, 71 },
++
++        { 245, 245, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FTRUNC, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FTRUNC, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FTRUNC, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FTRUNC, 71 },
++
++        { 245, 245, .raddr_mask = OP_MASK(19), V3D_QPU_A_FTOIZ, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(23), V3D_QPU_A_FTOIZ, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(27), V3D_QPU_A_FTOIZ, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(31), V3D_QPU_A_FTOIZ, 71 },
++
++        { 245, 245, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_FFLOOR, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_FFLOOR, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(40, 42), V3D_QPU_A_FFLOOR, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(44, 46), V3D_QPU_A_FFLOOR, 71 },
++
++        { 245, 245, .raddr_mask = OP_MASK(35), V3D_QPU_A_FTOUZ, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(39), V3D_QPU_A_FTOUZ, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(43), V3D_QPU_A_FTOUZ, 71 },
++        { 245, 245, .raddr_mask = OP_MASK(47), V3D_QPU_A_FTOUZ, 71 },
++
++        { 245, 245, .raddr_mask = OP_RANGE(48, 50), V3D_QPU_A_FCEIL, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(52, 54), V3D_QPU_A_FCEIL, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(56, 58), V3D_QPU_A_FCEIL, 71 },
++        { 245, 245, .raddr_mask = OP_RANGE(60, 62), V3D_QPU_A_FCEIL, 71 },
++
++        { 245, 245, .raddr_mask = OP_MASK(51), V3D_QPU_A_FTOC },
++        { 245, 245, .raddr_mask = OP_MASK(55), V3D_QPU_A_FTOC },
++        { 245, 245, .raddr_mask = OP_MASK(59), V3D_QPU_A_FTOC },
++        { 245, 245, .raddr_mask = OP_MASK(63), V3D_QPU_A_FTOC },
++
++        { 246, 246, .raddr_mask = OP_RANGE(0, 2),   V3D_QPU_A_FDX, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(4, 6),   V3D_QPU_A_FDX, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(8, 10),  V3D_QPU_A_FDX, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FDX, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FDY, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FDY, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FDY, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(28, 30), V3D_QPU_A_FDY, 71 },
++
++        { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
++        { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+ };
+ 
+-static const struct opcode_desc mul_ops[] = {
+-        { 1, 1, ANYMUX, ANYMUX, V3D_QPU_M_ADD },
+-        { 2, 2, ANYMUX, ANYMUX, V3D_QPU_M_SUB },
+-        { 3, 3, ANYMUX, ANYMUX, V3D_QPU_M_UMUL24 },
+-        { 4, 8, ANYMUX, ANYMUX, V3D_QPU_M_VFMUL },
+-        { 9, 9, ANYMUX, ANYMUX, V3D_QPU_M_SMUL24 },
+-        { 10, 10, ANYMUX, ANYMUX, V3D_QPU_M_MULTOP },
+-        { 14, 14, ANYMUX, ANYMUX, V3D_QPU_M_FMOV },
+-        { 15, 15, MUX_MASK(0, 3), ANYMUX, V3D_QPU_M_FMOV },
+-        { 15, 15, 1 << 4, 1 << 0, V3D_QPU_M_NOP, 0 },
+-        { 15, 15, 1 << 7, ANYMUX, V3D_QPU_M_MOV },
+-        { 16, 63, ANYMUX, ANYMUX, V3D_QPU_M_FMUL },
++static const struct opcode_desc mul_ops_v71[] = {
++        /* For V3D 7.1, second mask field would be ignored */
++        { 1, 1, .raddr_mask = ANYOPMASK, V3D_QPU_M_ADD, 71 },
++        { 2, 2, .raddr_mask = ANYOPMASK, V3D_QPU_M_SUB, 71 },
++        { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 },
++        { 3, 3, .raddr_mask = ANYOPMASK, V3D_QPU_M_UMUL24, 71 },
++        { 4, 8, .raddr_mask = ANYOPMASK, V3D_QPU_M_VFMUL, 71 },
++        { 9, 9, .raddr_mask = ANYOPMASK, V3D_QPU_M_SMUL24, 71 },
++        { 10, 10, .raddr_mask = ANYOPMASK, V3D_QPU_M_MULTOP, 71 },
++
++        { 14, 14, .raddr_mask = OP_RANGE(0, 2),   V3D_QPU_M_FMOV, 71 },
++        { 14, 14, .raddr_mask = OP_RANGE(4, 6),   V3D_QPU_M_FMOV, 71 },
++        { 14, 14, .raddr_mask = OP_RANGE(8, 10),  V3D_QPU_M_FMOV, 71 },
++        { 14, 14, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_M_FMOV, 71 },
++        { 14, 14, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_M_FMOV, 71 },
++        { 14, 14, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_M_FMOV, 71 },
++
++        { 14, 14, .raddr_mask = OP_MASK(3),  V3D_QPU_M_MOV, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(7),  V3D_QPU_M_MOV, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(11), V3D_QPU_M_MOV, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
++
++        { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
++
++        { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL },
+ };
+ 
+ /* Returns true if op_desc should be filtered out based on devinfo->ver
+@@ -636,17 +812,23 @@ static const struct opcode_desc mul_ops[] = {
+  */
+ static bool
+ opcode_invalid_in_version(const struct v3d_device_info *devinfo,
+-                          const struct opcode_desc *op_desc)
++                          const uint8_t first_ver,
++                          const uint8_t last_ver)
+ {
+-        return (op_desc->first_ver != 0 && devinfo->ver < op_desc->first_ver) ||
+-                (op_desc->last_ver != 0  && devinfo->ver > op_desc->last_ver);
++        return (first_ver != 0 && devinfo->ver < first_ver) ||
++                (last_ver != 0  && devinfo->ver > last_ver);
+ }
+ 
++/* Note that we pass as parameters mux_a, mux_b and raddr, even if depending
++ * on the devinfo->ver some would be ignored. We do this way just to avoid
++ * having two really similar lookup_opcode methods
++ */
+ static const struct opcode_desc *
+ lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
+                           const struct opcode_desc *opcodes,
+                           size_t num_opcodes, uint32_t opcode,
+-                          uint32_t mux_a, uint32_t mux_b)
++                          uint32_t mux_a, uint32_t mux_b,
++                          uint32_t raddr)
+ {
+         for (int i = 0; i < num_opcodes; i++) {
+                 const struct opcode_desc *op_desc = &opcodes[i];
+@@ -655,14 +837,19 @@ lookup_opcode_from_packed(const struct v3d_device_info *devinfo,
+                     opcode > op_desc->opcode_last)
+                         continue;
+ 
+-                if (opcode_invalid_in_version(devinfo, op_desc))
++                if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
+                         continue;
+ 
+-                if (!(op_desc->mux_b_mask & (1 << mux_b)))
+-                        continue;
++                if (devinfo->ver < 71) {
++                        if (!(op_desc->mux.b_mask & (1 << mux_b)))
++                                continue;
+ 
+-                if (!(op_desc->mux_a_mask & (1 << mux_a)))
+-                        continue;
++                        if (!(op_desc->mux.a_mask & (1 << mux_a)))
++                                continue;
++                } else {
++                        if (!(op_desc->raddr_mask & ((uint64_t) 1 << raddr)))
++                                continue;
++                }
+ 
+                 return op_desc;
+         }
+@@ -784,8 +971,8 @@ v3d_qpu_float32_pack_pack(enum v3d_qpu_output_pack pack,
+ }
+ 
+ static bool
+-v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+-                   struct v3d_qpu_instr *instr)
++v3d33_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
++                     struct v3d_qpu_instr *instr)
+ {
+         uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
+         uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_ADD_A);
+@@ -802,8 +989,9 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                 map_op = (map_op - 253 + 245);
+ 
+         const struct opcode_desc *desc =
+-                lookup_opcode_from_packed(devinfo, add_ops, ARRAY_SIZE(add_ops),
+-                                          map_op, mux_a, mux_b);
++                lookup_opcode_from_packed(devinfo, add_ops_v33,
++                                          ARRAY_SIZE(add_ops_v33),
++                                          map_op, mux_a, mux_b, 0);
+ 
+         if (!desc)
+                 return false;
+@@ -939,8 +1127,160 @@ v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ }
+ 
+ static bool
+-v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
++v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
++                     struct v3d_qpu_instr *instr)
++{
++        uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_ADD);
++        uint32_t raddr_a = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_A);
++        uint32_t raddr_b = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_B);
++        uint32_t waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_A);
++        uint32_t map_op = op;
++
++        const struct opcode_desc *desc =
++                lookup_opcode_from_packed(devinfo,
++                                          add_ops_v71,
++                                          ARRAY_SIZE(add_ops_v71),
++                                          map_op, 0, 0,
++                                          raddr_b);
++        if (!desc)
++                return false;
++
++        instr->alu.add.op = desc->op;
++
++        /* Some QPU ops require a bit more than just basic opcode and raddr_b
++         * comparisons to distinguish them (the STVPM* ops are told apart by waddr).
++         */
++        switch (instr->alu.add.op) {
++        case V3D_QPU_A_STVPMV:
++        case V3D_QPU_A_STVPMD:
++        case V3D_QPU_A_STVPMP:
++                switch (waddr) {
++                case 0:
++                        instr->alu.add.op = V3D_QPU_A_STVPMV;
++                        break;
++                case 1:
++                        instr->alu.add.op = V3D_QPU_A_STVPMD;
++                        break;
++                case 2:
++                        instr->alu.add.op = V3D_QPU_A_STVPMP;
++                        break;
++                default:
++                        return false;
++                }
++                break;
++        default:
++                break;
++        }
++
++        switch (instr->alu.add.op) {
++        case V3D_QPU_A_FADD:
++        case V3D_QPU_A_FADDNF:
++        case V3D_QPU_A_FSUB:
++        case V3D_QPU_A_FMIN:
++        case V3D_QPU_A_FMAX:
++        case V3D_QPU_A_FCMP:
++        case V3D_QPU_A_VFPACK:
++                if (instr->alu.add.op != V3D_QPU_A_VFPACK &&
++                    instr->alu.add.op != V3D_QPU_A_FCMP) {
++                        instr->alu.add.output_pack = (op >> 4) & 0x3;
++                } else {
++                        instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
++                }
++
++                if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
++                                                   &instr->alu.add.a.unpack)) {
++                        return false;
++                }
++
++                if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
++                                                   &instr->alu.add.b.unpack)) {
++                        return false;
++                }
++                break;
++
++        case V3D_QPU_A_FFLOOR:
++        case V3D_QPU_A_FROUND:
++        case V3D_QPU_A_FTRUNC:
++        case V3D_QPU_A_FCEIL:
++        case V3D_QPU_A_FDX:
++        case V3D_QPU_A_FDY:
++                instr->alu.add.output_pack = raddr_b & 0x3;
++
++                if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3,
++                                                   &instr->alu.add.a.unpack)) {
++                        return false;
++                }
++                break;
++
++        case V3D_QPU_A_FTOIN:
++        case V3D_QPU_A_FTOIZ:
++        case V3D_QPU_A_FTOUZ:
++        case V3D_QPU_A_FTOC:
++                instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
++
++                if (!v3d_qpu_float32_unpack_unpack((raddr_b >> 2) & 0x3,
++                                                   &instr->alu.add.a.unpack)) {
++                        return false;
++                }
++                break;
++
++        case V3D_QPU_A_VFMIN:
++        case V3D_QPU_A_VFMAX:
++                unreachable("pending v71 update");
++                if (!v3d_qpu_float16_unpack_unpack(op & 0x7,
++                                                   &instr->alu.add.a.unpack)) {
++                        return false;
++                }
++
++                instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
++                instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
++                break;
++
++        default:
++                instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
++                instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
++                break;
++        }
++
++        instr->alu.add.a.raddr = raddr_a;
++        instr->alu.add.b.raddr = raddr_b;
++        instr->alu.add.waddr = waddr;
++
++        instr->alu.add.magic_write = false;
++        if (packed_inst & V3D_QPU_MA) {
++                switch (instr->alu.add.op) {
++                case V3D_QPU_A_LDVPMV_IN:
++                        instr->alu.add.op = V3D_QPU_A_LDVPMV_OUT;
++                        break;
++                case V3D_QPU_A_LDVPMD_IN:
++                        instr->alu.add.op = V3D_QPU_A_LDVPMD_OUT;
++                        break;
++                case V3D_QPU_A_LDVPMG_IN:
++                        instr->alu.add.op = V3D_QPU_A_LDVPMG_OUT;
++                        break;
++                default:
++                        instr->alu.add.magic_write = true;
++                        break;
++                }
++        }
++
++        return true;
++}
++
++static bool
++v3d_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+                    struct v3d_qpu_instr *instr)
++{
++        if (devinfo->ver < 71)
++                return v3d33_qpu_add_unpack(devinfo, packed_inst, instr);
++        else
++                return v3d71_qpu_add_unpack(devinfo, packed_inst, instr);
++}
++
++static bool
++v3d33_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
++                     struct v3d_qpu_instr *instr)
+ {
+         uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
+         uint32_t mux_a = QPU_GET_FIELD(packed_inst, V3D_QPU_MUL_A);
+@@ -948,9 +1288,10 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+ 
+         {
+                 const struct opcode_desc *desc =
+-                        lookup_opcode_from_packed(devinfo, mul_ops,
+-                                                  ARRAY_SIZE(mul_ops),
+-                                                  op, mux_a, mux_b);
++                        lookup_opcode_from_packed(devinfo,
++                                                  mul_ops_v33,
++                                                  ARRAY_SIZE(mul_ops_v33),
++                                                  op, mux_a, mux_b, 0);
+                 if (!desc)
+                         return false;
+ 
+@@ -1011,6 +1352,91 @@ v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
+         return true;
+ }
+ 
++static bool
++v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
++                     struct v3d_qpu_instr *instr)
++{
++        uint32_t op = QPU_GET_FIELD(packed_inst, V3D_QPU_OP_MUL);
++        uint32_t raddr_c = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_C);
++        uint32_t raddr_d = QPU_GET_FIELD(packed_inst, V3D_QPU_RADDR_D);
++
++        {
++                const struct opcode_desc *desc =
++                        lookup_opcode_from_packed(devinfo,
++                                                  mul_ops_v71,
++                                                  ARRAY_SIZE(mul_ops_v71),
++                                                  op, 0, 0,
++                                                  raddr_d);
++                if (!desc)
++                        return false;
++
++                instr->alu.mul.op = desc->op;
++        }
++
++        switch (instr->alu.mul.op) {
++        case V3D_QPU_M_FMUL:
++                instr->alu.mul.output_pack = ((op >> 4) & 0x3) - 1;
++
++                if (!v3d_qpu_float32_unpack_unpack((op >> 2) & 0x3,
++                                                   &instr->alu.mul.a.unpack)) {
++                        return false;
++                }
++
++                if (!v3d_qpu_float32_unpack_unpack((op >> 0) & 0x3,
++                                                   &instr->alu.mul.b.unpack)) {
++                        return false;
++                }
++
++                break;
++
++        case V3D_QPU_M_FMOV:
++                instr->alu.mul.output_pack = (raddr_d >> 2) & 1;
++
++                if (!v3d_qpu_float32_unpack_unpack(raddr_d & 0x3,
++                                                   &instr->alu.mul.a.unpack)) {
++                        return false;
++                }
++
++                break;
++
++        case V3D_QPU_M_VFMUL:
++                unreachable("pending v71 update");
++                instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
++
++                if (!v3d_qpu_float16_unpack_unpack(((op & 0x7) - 4) & 7,
++                                                   &instr->alu.mul.a.unpack)) {
++                        return false;
++                }
++
++                instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
++
++                break;
++
++        default:
++                instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
++                instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
++                instr->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
++                break;
++        }
++
++        instr->alu.mul.a.raddr = raddr_c;
++        instr->alu.mul.b.raddr = raddr_d;
++        instr->alu.mul.waddr = QPU_GET_FIELD(packed_inst, V3D_QPU_WADDR_M);
++        instr->alu.mul.magic_write = packed_inst & V3D_QPU_MM;
++
++        return true;
++}
++
++static bool
++v3d_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst,
++                   struct v3d_qpu_instr *instr)
++{
++        if (devinfo->ver < 71)
++                return v3d33_qpu_mul_unpack(devinfo, packed_inst, instr);
++        else
++                return v3d71_qpu_mul_unpack(devinfo, packed_inst, instr);
++}
++
+ static const struct opcode_desc *
+ lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
+                          const struct opcode_desc *opcodes, size_t num_opcodes,
+@@ -1022,7 +1448,7 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
+                 if (op_desc->op != op)
+                         continue;
+ 
+-                if (opcode_invalid_in_version(devinfo, op_desc))
++                if (opcode_invalid_in_version(devinfo, op_desc->first_ver, op_desc->last_ver))
+                         continue;
+ 
+                 return op_desc;
+@@ -1032,30 +1458,31 @@ lookup_opcode_from_instr(const struct v3d_device_info *devinfo,
+ }
+ 
+ static bool
+-v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+-                 const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
++v3d33_qpu_add_pack(const struct v3d_device_info *devinfo,
++                   const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+ {
+         uint32_t waddr = instr->alu.add.waddr;
+         uint32_t mux_a = instr->alu.add.a.mux;
+         uint32_t mux_b = instr->alu.add.b.mux;
+         int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
+         const struct opcode_desc *desc =
+-                lookup_opcode_from_instr(devinfo, add_ops, ARRAY_SIZE(add_ops),
++                lookup_opcode_from_instr(devinfo, add_ops_v33,
++                                         ARRAY_SIZE(add_ops_v33),
+                                          instr->alu.add.op);
+ 
+         if (!desc)
+                 return false;
+ 
+-        uint32_t opcode = desc->opcode_first;
++        uint32_t opcode = opcode = desc->opcode_first;
+ 
+         /* If an operation doesn't use an arg, its mux values may be used to
+          * identify the operation type.
+          */
+         if (nsrc < 2)
+-                mux_b = ffs(desc->mux_b_mask) - 1;
++                mux_b = ffs(desc->mux.b_mask) - 1;
+ 
+         if (nsrc < 1)
+-                mux_a = ffs(desc->mux_a_mask) - 1;
++                mux_a = ffs(desc->mux.a_mask) - 1;
+ 
+         bool no_magic_write = false;
+ 
+@@ -1162,8 +1589,8 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+                         return false;
+                 }
+ 
+-                opcode = (opcode & ~(1 << 2)) | (a_unpack << 2);
+-                opcode = (opcode & ~(1 << 0)) | (b_unpack << 0);
++                opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
++                opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
+ 
+                 break;
+         }
+@@ -1188,7 +1615,7 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 }
+                 if (packed == 0)
+                         return false;
+-                opcode = (opcode & ~(1 << 2)) | packed << 2;
++                opcode = (opcode & ~(0x3 << 2)) | packed << 2;
+                 break;
+         }
+ 
+@@ -1245,15 +1672,211 @@ v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
+ }
+ 
+ static bool
+-v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+-                 const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
++v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
++                   const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
++{
++        uint32_t waddr = instr->alu.add.waddr;
++        uint32_t raddr_a = instr->alu.add.a.raddr;
++        uint32_t raddr_b = instr->alu.add.b.raddr;
++
++        int nsrc = v3d_qpu_add_op_num_src(instr->alu.add.op);
++        const struct opcode_desc *desc =
++                lookup_opcode_from_instr(devinfo, add_ops_v71,
++                                         ARRAY_SIZE(add_ops_v71),
++                                         instr->alu.add.op);
++        if (!desc)
++                return false;
++
++        uint32_t opcode = desc->opcode_first;
++
++        /* If an operation doesn't use an arg, its raddr values may be used to
++         * identify the operation type.
++         */
++        if (nsrc < 2)
++                raddr_b = ffsll(desc->raddr_mask) - 1;
++
++        bool no_magic_write = false;
++
++        switch (instr->alu.add.op) {
++        case V3D_QPU_A_STVPMV:
++                waddr = 0;
++                no_magic_write = true;
++                break;
++        case V3D_QPU_A_STVPMD:
++                waddr = 1;
++                no_magic_write = true;
++                break;
++        case V3D_QPU_A_STVPMP:
++                waddr = 2;
++                no_magic_write = true;
++                break;
++
++        case V3D_QPU_A_LDVPMV_IN:
++        case V3D_QPU_A_LDVPMD_IN:
++        case V3D_QPU_A_LDVPMP:
++        case V3D_QPU_A_LDVPMG_IN:
++                assert(!instr->alu.add.magic_write);
++                break;
++
++        case V3D_QPU_A_LDVPMV_OUT:
++        case V3D_QPU_A_LDVPMD_OUT:
++        case V3D_QPU_A_LDVPMG_OUT:
++                assert(!instr->alu.add.magic_write);
++                *packed_instr |= V3D_QPU_MA;
++                break;
++
++        default:
++                break;
++        }
++
++        switch (instr->alu.add.op) {
++        case V3D_QPU_A_FADD:
++        case V3D_QPU_A_FADDNF:
++        case V3D_QPU_A_FSUB:
++        case V3D_QPU_A_FMIN:
++        case V3D_QPU_A_FMAX:
++        case V3D_QPU_A_FCMP: {
++                uint32_t output_pack;
++                uint32_t a_unpack;
++                uint32_t b_unpack;
++
++                if (instr->alu.add.op != V3D_QPU_A_FCMP) {
++                        if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
++                                                       &output_pack)) {
++                                return false;
++                        }
++                        opcode |= output_pack << 4;
++                }
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
++                                                 &a_unpack)) {
++                        return false;
++                }
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
++                                                 &b_unpack)) {
++                        return false;
++                }
++
++                opcode |= a_unpack << 2;
++                opcode |= b_unpack << 0;
++
++                break;
++        }
++
++        case V3D_QPU_A_VFPACK: {
++                uint32_t a_unpack;
++                uint32_t b_unpack;
++
++                if (instr->alu.add.a.unpack == V3D_QPU_UNPACK_ABS ||
++                    instr->alu.add.b.unpack == V3D_QPU_UNPACK_ABS) {
++                        return false;
++                }
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
++                                                 &a_unpack)) {
++                        return false;
++                }
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.b.unpack,
++                                                 &b_unpack)) {
++                        return false;
++                }
++
++                opcode = (opcode & ~(0x3 << 2)) | (a_unpack << 2);
++                opcode = (opcode & ~(0x3 << 0)) | (b_unpack << 0);
++
++                break;
++        }
++
++        case V3D_QPU_A_FFLOOR:
++        case V3D_QPU_A_FROUND:
++        case V3D_QPU_A_FTRUNC:
++        case V3D_QPU_A_FCEIL:
++        case V3D_QPU_A_FDX:
++        case V3D_QPU_A_FDY: {
++                uint32_t packed;
++
++                if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
++                                               &packed)) {
++                        return false;
++                }
++                raddr_b |= packed;
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                if (packed == 0)
++                        return false;
++                raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
++                break;
++        }
++
++        case V3D_QPU_A_FTOIN:
++        case V3D_QPU_A_FTOIZ:
++        case V3D_QPU_A_FTOUZ:
++        case V3D_QPU_A_FTOC:
++                if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
++                        return false;
++
++                uint32_t packed;
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                if (packed == 0)
++                        return false;
++
++                raddr_b = (raddr_b & ~(0x3 << 2)) | packed << 2;
++
++                break;
++
++        case V3D_QPU_A_VFMIN:
++        case V3D_QPU_A_VFMAX:
++                if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
++                    instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE) {
++                        return false;
++                }
++
++                if (!v3d_qpu_float16_unpack_pack(instr->alu.add.a.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                opcode |= packed;
++                break;
++
++        default:
++                if (instr->alu.add.op != V3D_QPU_A_NOP &&
++                    (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
++                     instr->alu.add.a.unpack != V3D_QPU_UNPACK_NONE ||
++                     instr->alu.add.b.unpack != V3D_QPU_UNPACK_NONE)) {
++                        return false;
++                }
++                break;
++        }
++
++        *packed_instr |= QPU_SET_FIELD(raddr_a, V3D_QPU_RADDR_A);
++        *packed_instr |= QPU_SET_FIELD(raddr_b, V3D_QPU_RADDR_B);
++        *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_ADD);
++        *packed_instr |= QPU_SET_FIELD(waddr, V3D_QPU_WADDR_A);
++        if (instr->alu.add.magic_write && !no_magic_write)
++                *packed_instr |= V3D_QPU_MA;
++
++        return true;
++}
++
++static bool
++v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
++                   const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
+ {
+         uint32_t mux_a = instr->alu.mul.a.mux;
+         uint32_t mux_b = instr->alu.mul.b.mux;
+         int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
+ 
+         const struct opcode_desc *desc =
+-                lookup_opcode_from_instr(devinfo, mul_ops, ARRAY_SIZE(mul_ops),
++                lookup_opcode_from_instr(devinfo, mul_ops_v33,
++                                         ARRAY_SIZE(mul_ops_v33),
+                                          instr->alu.mul.op);
+ 
+         if (!desc)
+@@ -1265,10 +1888,10 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+          * that here.  If mux a/b determine packing, it will be set below.
+          */
+         if (nsrc < 2)
+-                mux_b = ffs(desc->mux_b_mask) - 1;
++                mux_b = ffs(desc->mux.b_mask) - 1;
+ 
+         if (nsrc < 1)
+-                mux_a = ffs(desc->mux_a_mask) - 1;
++                mux_a = ffs(desc->mux.a_mask) - 1;
+ 
+         switch (instr->alu.mul.op) {
+         case V3D_QPU_M_FMUL: {
+@@ -1351,6 +1974,130 @@ v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
+         return true;
+ }
+ 
++static bool
++v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
++                   const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
++{
++        uint32_t raddr_c = instr->alu.mul.a.raddr;
++        uint32_t raddr_d = instr->alu.mul.b.raddr;
++        int nsrc = v3d_qpu_mul_op_num_src(instr->alu.mul.op);
++
++        const struct opcode_desc *desc =
++                lookup_opcode_from_instr(devinfo, mul_ops_v71,
++                                         ARRAY_SIZE(mul_ops_v71),
++                                         instr->alu.mul.op);
++        if (!desc)
++                return false;
++
++        uint32_t opcode = desc->opcode_first;
++
++        /* Some opcodes have a single valid value for their raddr_d, so set
++         * that here.  If raddr_d determines packing, it will be set below.
++         */
++        if (nsrc < 2)
++                raddr_d = ffsll(desc->raddr_mask) - 1;
++
++        switch (instr->alu.mul.op) {
++        case V3D_QPU_M_FMUL: {
++                uint32_t packed;
++
++                if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
++                                               &packed)) {
++                        return false;
++                }
++                /* No need for a +1 because desc->opcode_first has a 1 in this
++                 * field.
++                 */
++                opcode += packed << 4;
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                opcode |= packed << 2;
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.b.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                opcode |= packed << 0;
++                break;
++        }
++
++        case V3D_QPU_M_FMOV: {
++                uint32_t packed;
++
++                if (!v3d_qpu_float32_pack_pack(instr->alu.mul.output_pack,
++                                               &packed)) {
++                        return false;
++                }
++                opcode |= (packed >> 1) & 1;
++                raddr_d = (packed & 1) << 2;
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                raddr_d |= packed;
++                break;
++        }
++
++        case V3D_QPU_M_VFMUL: {
++                unreachable("pending v71 update");
++                uint32_t packed;
++
++                if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
++                        return false;
++
++                if (!v3d_qpu_float16_unpack_pack(instr->alu.mul.a.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                if (instr->alu.mul.a.unpack == V3D_QPU_UNPACK_SWAP_16)
++                        opcode = 8;
++                else
++                        opcode |= (packed + 4) & 7;
++
++                if (instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)
++                        return false;
++
++                break;
++        }
++
++        default:
++                break;
++        }
++
++        *packed_instr |= QPU_SET_FIELD(raddr_c, V3D_QPU_RADDR_C);
++        *packed_instr |= QPU_SET_FIELD(raddr_d, V3D_QPU_RADDR_D);
++        *packed_instr |= QPU_SET_FIELD(opcode, V3D_QPU_OP_MUL);
++        *packed_instr |= QPU_SET_FIELD(instr->alu.mul.waddr, V3D_QPU_WADDR_M);
++        if (instr->alu.mul.magic_write)
++                *packed_instr |= V3D_QPU_MM;
++
++        return true;
++}
++
++static bool
++v3d_qpu_add_pack(const struct v3d_device_info *devinfo,
++                 const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
++{
++        if (devinfo->ver < 71)
++                return v3d33_qpu_add_pack(devinfo, instr, packed_instr);
++        else
++                return v3d71_qpu_add_pack(devinfo, instr, packed_instr);
++}
++
++static bool
++v3d_qpu_mul_pack(const struct v3d_device_info *devinfo,
++                 const struct v3d_qpu_instr *instr, uint64_t *packed_instr)
++{
++        if (devinfo->ver < 71)
++                return v3d33_qpu_mul_pack(devinfo, instr, packed_instr);
++        else
++                return v3d71_qpu_mul_pack(devinfo, instr, packed_instr);
++}
++
+ static bool
+ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
+                          uint64_t packed_instr,
+@@ -1379,8 +2126,14 @@ v3d_qpu_instr_unpack_alu(const struct v3d_device_info *devinfo,
+                         return false;
+         }
+ 
+-        instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
+-        instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
++        if (devinfo->ver <= 71) {
++                /*
++                 * For v71 this will be set on add/mul unpack, as raddr are now
++                 * part of v3d_qpu_input
++                 */
++                instr->raddr_a = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_A);
++                instr->raddr_b = QPU_GET_FIELD(packed_instr, V3D_QPU_RADDR_B);
++        }
+ 
+         if (!v3d_qpu_add_unpack(devinfo, packed_instr, instr))
+                 return false;
+@@ -1466,8 +2219,14 @@ v3d_qpu_instr_pack_alu(const struct v3d_device_info *devinfo,
+         *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
+ 
+         if (instr->type == V3D_QPU_INSTR_TYPE_ALU) {
+-                *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
+-                *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
++                if (devinfo->ver < 71) {
++                        /*
++                         * For v71 this will be set on add/mul pack, as raddr are now
++                         * part of v3d_qpu_input
++                         */
++                        *packed_instr |= QPU_SET_FIELD(instr->raddr_a, V3D_QPU_RADDR_A);
++                        *packed_instr |= QPU_SET_FIELD(instr->raddr_b, V3D_QPU_RADDR_B);
++                }
+ 
+                 if (!v3d_qpu_add_pack(devinfo, instr, packed_instr))
+                         return false;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0017-broadcom-compiler-update-node-temp-translation-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0017-broadcom-compiler-update-node-temp-translation-for-v.patch
new file mode 100644
index 0000000000..0bf1274d45
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0017-broadcom-compiler-update-node-temp-translation-for-v.patch
@@ -0,0 +1,261 @@
+From ebba9019461083687f6afd23ff0d4646c1a667cb Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Sun, 29 Jan 2023 00:27:11 +0100
+Subject: [PATCH 017/142] broadcom/compiler: update node/temp translation for
+ v71
+
+The offset applied needs to take into account whether we have
+accumulators or not.
+---
+ src/broadcom/compiler/vir_register_allocate.c | 68 +++++++++----------
+ 1 file changed, 34 insertions(+), 34 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index b22f915d1df..aa9473d124b 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -39,30 +39,31 @@
+                            CLASS_BITS_R5)
+ 
+ static inline uint32_t
+-temp_to_node(uint32_t temp)
++temp_to_node(struct v3d_compile *c, uint32_t temp)
+ {
+-        return temp + ACC_COUNT;
++        return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0);
+ }
+ 
+ static inline uint32_t
+-node_to_temp(uint32_t node)
++node_to_temp(struct v3d_compile *c, uint32_t node)
+ {
+-        assert(node >= ACC_COUNT);
+-        return node - ACC_COUNT;
++        assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
++               (!c->devinfo->has_accumulators && node >= 0));
++        return node - (c->devinfo->has_accumulators ? ACC_COUNT : 0);
+ }
+ 
+ static inline uint8_t
+-get_temp_class_bits(struct v3d_ra_node_info *nodes,
++get_temp_class_bits(struct v3d_compile *c,
+                     uint32_t temp)
+ {
+-        return nodes->info[temp_to_node(temp)].class_bits;
++        return c->nodes.info[temp_to_node(c, temp)].class_bits;
+ }
+ 
+ static inline void
+-set_temp_class_bits(struct v3d_ra_node_info *nodes,
++set_temp_class_bits(struct v3d_compile *c,
+                     uint32_t temp, uint8_t class_bits)
+ {
+-        nodes->info[temp_to_node(temp)].class_bits = class_bits;
++        c->nodes.info[temp_to_node(c, temp)].class_bits = class_bits;
+ }
+ 
+ static struct ra_class *
+@@ -84,7 +85,7 @@ static inline struct ra_class *
+ choose_reg_class_for_temp(struct v3d_compile *c, uint32_t temp)
+ {
+         assert(temp < c->num_temps && temp < c->nodes.alloc_count);
+-        return choose_reg_class(c, get_temp_class_bits(&c->nodes, temp));
++        return choose_reg_class(c, get_temp_class_bits(c, temp));
+ }
+ 
+ static inline bool
+@@ -313,7 +314,7 @@ v3d_choose_spill_node(struct v3d_compile *c)
+ 
+         for (unsigned i = 0; i < c->num_temps; i++) {
+                 if (BITSET_TEST(c->spillable, i)) {
+-                        ra_set_node_spill_cost(c->g, temp_to_node(i),
++                        ra_set_node_spill_cost(c->g, temp_to_node(c, i),
+                                                spill_costs[i]);
+                 }
+         }
+@@ -482,7 +483,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
+                         c->temp_start[i] < ip && c->temp_end[i] >= ip :
+                         c->temp_start[i] <= ip && c->temp_end[i] > ip;
+                 if (thrsw_cross) {
+-                        ra_set_node_class(c->g, temp_to_node(i),
++                        ra_set_node_class(c->g, temp_to_node(c, i),
+                                           choose_reg_class(c, CLASS_BITS_PHYS));
+                 }
+         }
+@@ -509,8 +510,7 @@ v3d_emit_tmu_spill(struct v3d_compile *c,
+          * same register class bits as the original.
+          */
+         if (inst == position) {
+-                uint8_t class_bits = get_temp_class_bits(&c->nodes,
+-                                                         inst->dst.index);
++                uint8_t class_bits = get_temp_class_bits(c, inst->dst.index);
+                 inst->dst = vir_get_temp(c);
+                 add_node(c, inst->dst.index, class_bits);
+         } else {
+@@ -574,7 +574,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+                 reconstruct_op = orig_def->qpu.alu.add.op;
+         }
+ 
+-        uint32_t spill_node = temp_to_node(spill_temp);
++        uint32_t spill_node = temp_to_node(c, spill_temp);
+ 
+         /* We must disable the ldunif optimization if we are spilling uniforms */
+         bool had_disable_ldunif_opt = c->disable_ldunif_opt;
+@@ -739,12 +739,12 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+          * update node priorities based one new liveness data.
+          */
+         uint32_t sb_temp =c->spill_base.index;
+-        uint32_t sb_node = temp_to_node(sb_temp);
++        uint32_t sb_node = temp_to_node(c, sb_temp);
+         for (uint32_t i = 0; i < c->num_temps; i++) {
+                 if (c->temp_end[i] == -1)
+                         continue;
+ 
+-                uint32_t node_i = temp_to_node(i);
++                uint32_t node_i = temp_to_node(c, i);
+                 c->nodes.info[node_i].priority =
+                         c->temp_end[i] - c->temp_start[i];
+ 
+@@ -752,7 +752,7 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+                      j < c->num_temps; j++) {
+                         if (interferes(c->temp_start[i], c->temp_end[i],
+                                        c->temp_start[j], c->temp_end[j])) {
+-                                uint32_t node_j = temp_to_node(j);
++                                uint32_t node_j = temp_to_node(c, j);
+                                 ra_add_node_interference(c->g, node_i, node_j);
+                         }
+                 }
+@@ -958,7 +958,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                 for (int i = 0; i < c->num_temps; i++) {
+                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+                                 ra_add_node_interference(c->g,
+-                                                         temp_to_node(i),
++                                                         temp_to_node(c, i),
+                                                          acc_nodes[3]);
+                         }
+                 }
+@@ -968,7 +968,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                 for (int i = 0; i < c->num_temps; i++) {
+                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+                                 ra_add_node_interference(c->g,
+-                                                         temp_to_node(i),
++                                                         temp_to_node(c, i),
+                                                          acc_nodes[4]);
+                         }
+                 }
+@@ -987,7 +987,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                          * decides whether the LDVPM is in or out)
+                          */
+                         assert(inst->dst.file == QFILE_TEMP);
+-                        set_temp_class_bits(&c->nodes, inst->dst.index,
++                        set_temp_class_bits(c, inst->dst.index,
+                                             CLASS_BITS_PHYS);
+                         break;
+                 }
+@@ -1002,7 +1002,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                          * phys regfile.
+                          */
+                         assert(inst->dst.file == QFILE_TEMP);
+-                        set_temp_class_bits(&c->nodes, inst->dst.index,
++                        set_temp_class_bits(c, inst->dst.index,
+                                             CLASS_BITS_PHYS);
+                         break;
+                 }
+@@ -1024,7 +1024,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                          */
+                         assert(inst->qpu.alu.mul.op == V3D_QPU_M_MOV);
+                         assert(inst->dst.file == QFILE_TEMP);
+-                        uint32_t node = temp_to_node(inst->dst.index);
++                        uint32_t node = temp_to_node(c, inst->dst.index);
+                         ra_set_node_reg(c->g, node,
+                                         PHYS_INDEX + inst->src[0].index);
+                         break;
+@@ -1043,9 +1043,9 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                  */
+                 if (!inst->qpu.sig.ldunif) {
+                         uint8_t class_bits =
+-                                get_temp_class_bits(&c->nodes, inst->dst.index) &
++                                get_temp_class_bits(c, inst->dst.index) &
+                                 ~CLASS_BITS_R5;
+-                        set_temp_class_bits(&c->nodes, inst->dst.index,
++                        set_temp_class_bits(c, inst->dst.index,
+                                             class_bits);
+ 
+                 } else {
+@@ -1054,7 +1054,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                          * loads interfere with each other.
+                          */
+                         if (c->devinfo->ver < 40) {
+-                                set_temp_class_bits(&c->nodes, inst->dst.index,
++                                set_temp_class_bits(c, inst->dst.index,
+                                                     CLASS_BITS_R5);
+                         }
+                 }
+@@ -1064,7 +1064,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+         if (inst->qpu.sig.thrsw) {
+                 for (int i = 0; i < c->num_temps; i++) {
+                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+-                                set_temp_class_bits(&c->nodes, i,
++                                set_temp_class_bits(c, i,
+                                                     CLASS_BITS_PHYS);
+                         }
+                 }
+@@ -1125,7 +1125,7 @@ v3d_register_allocate(struct v3d_compile *c)
+                         c->nodes.info[i].priority = 0;
+                         c->nodes.info[i].class_bits = 0;
+                 } else {
+-                        uint32_t t = node_to_temp(i);
++                        uint32_t t = node_to_temp(c, i);
+                         c->nodes.info[i].priority =
+                                 c->temp_end[t] - c->temp_start[t];
+                         c->nodes.info[i].class_bits = CLASS_BITS_ANY;
+@@ -1143,7 +1143,7 @@ v3d_register_allocate(struct v3d_compile *c)
+ 
+         /* Set the register classes for all our temporaries in the graph */
+         for (uint32_t i = 0; i < c->num_temps; i++) {
+-                ra_set_node_class(c->g, temp_to_node(i),
++                ra_set_node_class(c->g, temp_to_node(c, i),
+                                   choose_reg_class_for_temp(c, i));
+         }
+ 
+@@ -1153,8 +1153,8 @@ v3d_register_allocate(struct v3d_compile *c)
+                         if (interferes(c->temp_start[i], c->temp_end[i],
+                                        c->temp_start[j], c->temp_end[j])) {
+                                 ra_add_node_interference(c->g,
+-                                                         temp_to_node(i),
+-                                                         temp_to_node(j));
++                                                         temp_to_node(c, i),
++                                                         temp_to_node(c, j));
+                         }
+                 }
+         }
+@@ -1171,7 +1171,7 @@ v3d_register_allocate(struct v3d_compile *c)
+                 if (c->spill_size <
+                     V3D_CHANNELS * sizeof(uint32_t) * force_register_spills) {
+                         int node = v3d_choose_spill_node(c);
+-                        uint32_t temp = node_to_temp(node);
++                        uint32_t temp = node_to_temp(c, node);
+                         if (node != -1) {
+                                 v3d_spill_reg(c, acc_nodes, temp);
+                                 continue;
+@@ -1186,7 +1186,7 @@ v3d_register_allocate(struct v3d_compile *c)
+                 if (node == -1)
+                         goto spill_fail;
+ 
+-                uint32_t temp = node_to_temp(node);
++                uint32_t temp = node_to_temp(c, node);
+                 enum temp_spill_type spill_type =
+                         get_spill_type_for_temp(c, temp);
+                 if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
+@@ -1201,7 +1201,7 @@ v3d_register_allocate(struct v3d_compile *c)
+         /* Allocation was successful, build the 'temp -> reg' map */
+         temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
+         for (uint32_t i = 0; i < c->num_temps; i++) {
+-                int ra_reg = ra_get_node_reg(c->g, temp_to_node(i));
++                int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
+                 if (ra_reg < PHYS_INDEX) {
+                         temp_registers[i].magic = true;
+                         temp_registers[i].index = (V3D_QPU_WADDR_R0 +
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0018-broadcom-compiler-phys-index-depends-on-hw-version.patch b/projects/RPi/devices/RPi5/patches/mesa/0018-broadcom-compiler-phys-index-depends-on-hw-version.patch
new file mode 100644
index 0000000000..88f753bb0b
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0018-broadcom-compiler-phys-index-depends-on-hw-version.patch
@@ -0,0 +1,144 @@
+From 9b2dfe0286212aba3687a06023cc5b4ce9944ee0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Mon, 23 Aug 2021 02:18:43 +0200
+Subject: [PATCH 018/142] broadcom/compiler: phys index depends on hw version
+
+For 7.1 there are no accumulators, so we replace the PHYS_INDEX macro
+with a function call.
+---
+ src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++-----
+ 1 file changed, 29 insertions(+), 10 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index aa9473d124b..a358b616e13 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -28,9 +28,19 @@
+ 
+ #define ACC_INDEX     0
+ #define ACC_COUNT     6
+-#define PHYS_INDEX    (ACC_INDEX + ACC_COUNT)
+-#define PHYS_COUNT    64
+ 
++#define PHYS_COUNT 64
++
++static uint8_t
++get_phys_index(const struct v3d_device_info *devinfo)
++{
++        if (devinfo->has_accumulators)
++                return ACC_INDEX + ACC_COUNT;
++        else
++                return 0;
++}
++
++/* ACC as accumulator */
+ #define CLASS_BITS_PHYS   (1 << 0)
+ #define CLASS_BITS_ACC    (1 << 1)
+ #define CLASS_BITS_R5     (1 << 4)
+@@ -771,9 +781,11 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+ }
+ 
+ struct v3d_ra_select_callback_data {
++        uint32_t phys_index;
+         uint32_t next_acc;
+         uint32_t next_phys;
+         struct v3d_ra_node_info *nodes;
++        const struct v3d_device_info *devinfo;
+ };
+ 
+ /* Choosing accumulators improves chances of merging QPU instructions
+@@ -794,7 +806,7 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
+         static const int available_rf_threshold = 5;
+         int available_rf = 0 ;
+         for (int i = 0; i < PHYS_COUNT; i++) {
+-                if (BITSET_TEST(regs, PHYS_INDEX + i))
++                if (BITSET_TEST(regs, v3d_ra->phys_index + i))
+                         available_rf++;
+                 if (available_rf >= available_rf_threshold)
+                         break;
+@@ -854,7 +866,7 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ {
+         for (int i = 0; i < PHYS_COUNT; i++) {
+                 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+-                int phys = PHYS_INDEX + phys_off;
++                int phys = v3d_ra->phys_index + phys_off;
+ 
+                 if (BITSET_TEST(regs, phys)) {
+                         v3d_ra->next_phys = phys_off + 1;
+@@ -896,8 +908,9 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
+          * register file can be divided up for fragment shader threading.
+          */
+         int max_thread_index = (compiler->devinfo->ver >= 40 ? 2 : 3);
++        uint8_t phys_index = get_phys_index(compiler->devinfo);
+ 
+-        compiler->regs = ra_alloc_reg_set(compiler, PHYS_INDEX + PHYS_COUNT,
++        compiler->regs = ra_alloc_reg_set(compiler, phys_index + PHYS_COUNT,
+                                           false);
+         if (!compiler->regs)
+                 return false;
+@@ -912,8 +925,8 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
+                 compiler->reg_class_phys[threads] =
+                         ra_alloc_contig_reg_class(compiler->regs, 1);
+ 
+-                for (int i = PHYS_INDEX;
+-                     i < PHYS_INDEX + (PHYS_COUNT >> threads); i++) {
++                for (int i = phys_index;
++                     i < phys_index + (PHYS_COUNT >> threads); i++) {
+                         ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+                         ra_class_add_reg(compiler->reg_class_phys[threads], i);
+                         ra_class_add_reg(compiler->reg_class_any[threads], i);
+@@ -1026,7 +1039,8 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                         assert(inst->dst.file == QFILE_TEMP);
+                         uint32_t node = temp_to_node(c, inst->dst.index);
+                         ra_set_node_reg(c->g, node,
+-                                        PHYS_INDEX + inst->src[0].index);
++                                        get_phys_index(c->devinfo) +
++                                        inst->src[0].index);
+                         break;
+                 }
+                 }
+@@ -1086,13 +1100,17 @@ v3d_register_allocate(struct v3d_compile *c)
+                                           c->num_temps + ACC_COUNT),
+         };
+ 
++        uint32_t phys_index = get_phys_index(c->devinfo);
++
+         struct v3d_ra_select_callback_data callback_data = {
++                .phys_index = phys_index,
+                 .next_acc = 0,
+                 /* Start at RF3, to try to keep the TLB writes from using
+                  * RF0-2.
+                  */
+                 .next_phys = 3,
+                 .nodes = &c->nodes,
++                .devinfo = c->devinfo,
+         };
+ 
+         vir_calculate_live_intervals(c);
+@@ -1139,6 +1157,7 @@ v3d_register_allocate(struct v3d_compile *c)
+         vir_for_each_inst_inorder(inst, c) {
+                 inst->ip = ip++;
+                 update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
++
+         }
+ 
+         /* Set the register classes for all our temporaries in the graph */
+@@ -1202,13 +1221,13 @@ v3d_register_allocate(struct v3d_compile *c)
+         temp_registers = calloc(c->num_temps, sizeof(*temp_registers));
+         for (uint32_t i = 0; i < c->num_temps; i++) {
+                 int ra_reg = ra_get_node_reg(c->g, temp_to_node(c, i));
+-                if (ra_reg < PHYS_INDEX) {
++                if (ra_reg < phys_index) {
+                         temp_registers[i].magic = true;
+                         temp_registers[i].index = (V3D_QPU_WADDR_R0 +
+                                                    ra_reg - ACC_INDEX);
+                 } else {
+                         temp_registers[i].magic = false;
+-                        temp_registers[i].index = ra_reg - PHYS_INDEX;
++                        temp_registers[i].index = ra_reg - phys_index;
+                 }
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0019-broadcom-compiler-don-t-favor-select-accum-registers.patch b/projects/RPi/devices/RPi5/patches/mesa/0019-broadcom-compiler-don-t-favor-select-accum-registers.patch
new file mode 100644
index 0000000000..6689d6ee7f
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0019-broadcom-compiler-don-t-favor-select-accum-registers.patch
@@ -0,0 +1,40 @@
+From da0a3deadf86a46c8323267d3f6a49e442835608 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 17 Sep 2021 01:07:06 +0200
+Subject: [PATCH 019/142] broadcom/compiler: don't favor/select accum registers
+ for hw not supporting it
+
+Note that what we do is just return false from the favor/select accum
+methods. We could avoid calling them altogether, but as the select is
+called more than once, it is just easier this way.
+---
+ src/broadcom/compiler/vir_register_allocate.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index a358b616e13..1f495180784 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -797,6 +797,9 @@ v3d_ra_favor_accum(struct v3d_ra_select_callback_data *v3d_ra,
+                    BITSET_WORD *regs,
+                    int priority)
+ {
++        if (!v3d_ra->devinfo->has_accumulators)
++                return false;
++
+         /* Favor accumulators if we have less that this number of physical
+          * registers. Accumulators have more restrictions (like being
+          * invalidated through thrsw), so running out of physical registers
+@@ -832,6 +835,9 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
+                     BITSET_WORD *regs,
+                     unsigned int *out)
+ {
++        if (!v3d_ra->devinfo->has_accumulators)
++                return false;
++
+         /* Choose r5 for our ldunifs if possible (nobody else can load to that
+          * reg, and it keeps the QPU cond field free from being occupied by
+          * ldunifrf).
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0020-broadcom-vir-implement-is_no_op_mov-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0020-broadcom-vir-implement-is_no_op_mov-for-v71.patch
new file mode 100644
index 0000000000..3085733d38
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0020-broadcom-vir-implement-is_no_op_mov-for-v71.patch
@@ -0,0 +1,105 @@
+From 6c04d7c917da6b38f8b2b4306ab03ed2ab7e6ce0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 9 Sep 2021 00:28:53 +0200
+Subject: [PATCH 020/142] broadcom/vir: implement is_no_op_mov for v71
+
+Did some refactoring/splitting.
+---
+ src/broadcom/compiler/vir_to_qpu.c | 66 ++++++++++++++++++++++++------
+ 1 file changed, 53 insertions(+), 13 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
+index c8b6e0a91a0..08970d52954 100644
+--- a/src/broadcom/compiler/vir_to_qpu.c
++++ b/src/broadcom/compiler/vir_to_qpu.c
+@@ -129,19 +129,8 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+ }
+ 
+ static bool
+-is_no_op_mov(struct qinst *qinst)
++v3d33_mov_src_and_dst_equal(struct qinst *qinst)
+ {
+-        static const struct v3d_qpu_sig no_sig = {0};
+-
+-        /* Make sure it's just a lone MOV. */
+-        if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
+-            qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
+-            qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
+-            memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
+-                return false;
+-        }
+-
+-        /* Check if it's a MOV from a register to itself. */
+         enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
+         if (qinst->qpu.alu.mul.magic_write) {
+                 if (waddr < V3D_QPU_WADDR_R0 || waddr > V3D_QPU_WADDR_R4)
+@@ -168,6 +157,57 @@ is_no_op_mov(struct qinst *qinst)
+                         return false;
+         }
+ 
++        return true;
++}
++
++static bool
++v3d71_mov_src_and_dst_equal(struct qinst *qinst)
++{
++        if (qinst->qpu.alu.mul.magic_write)
++                return false;
++
++        enum v3d_qpu_waddr waddr = qinst->qpu.alu.mul.waddr;
++        int raddr;
++
++        raddr = qinst->qpu.alu.mul.a.raddr;
++        if (raddr != waddr)
++                return false;
++
++        return true;
++}
++
++static bool
++mov_src_and_dst_equal(struct qinst *qinst,
++                      const struct v3d_device_info *devinfo)
++{
++        if (devinfo->ver < 71)
++                return v3d33_mov_src_and_dst_equal(qinst);
++        else
++                return v3d71_mov_src_and_dst_equal(qinst);
++}
++
++
++static bool
++is_no_op_mov(struct qinst *qinst,
++             const struct v3d_device_info *devinfo)
++{
++        static const struct v3d_qpu_sig no_sig = {0};
++
++        /* Make sure it's just a lone MOV. We only check for M_MOV. Although
++         * for V3D 7.x there is also A_MOV, we don't need to check for it as
++         * we always emit using M_MOV. We could use A_MOV later on the
++         * squedule to improve performance
++         */
++        if (qinst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
++            qinst->qpu.alu.mul.op != V3D_QPU_M_MOV ||
++            qinst->qpu.alu.add.op != V3D_QPU_A_NOP ||
++            memcmp(&qinst->qpu.sig, &no_sig, sizeof(no_sig)) != 0) {
++                return false;
++        }
++
++        if (!mov_src_and_dst_equal(qinst, devinfo))
++                return false;
++
+         /* No packing or flags updates, or we need to execute the
+          * instruction.
+          */
+@@ -324,7 +364,7 @@ v3d_generate_code_block(struct v3d_compile *c,
+                                 qinst->qpu.alu.mul.waddr = dst.index;
+                                 qinst->qpu.alu.mul.magic_write = dst.magic;
+ 
+-                                if (is_no_op_mov(qinst)) {
++                                if (is_no_op_mov(qinst, c->devinfo)) {
+                                         vir_remove_instruction(c, qinst);
+                                         continue;
+                                 }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0021-broadcom-compiler-update-vir_to_qpu-set_src-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0021-broadcom-compiler-update-vir_to_qpu-set_src-for-v71.patch
new file mode 100644
index 0000000000..57bd1ad620
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0021-broadcom-compiler-update-vir_to_qpu-set_src-for-v71.patch
@@ -0,0 +1,104 @@
+From 7b5be2d9b178a45c34c22db2744639a6a8a216d1 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 9 Sep 2021 01:18:54 +0200
+Subject: [PATCH 021/142] broadcom/compiler: update vir_to_qpu::set_src for v71
+
+---
+ src/broadcom/compiler/vir_to_qpu.c | 47 ++++++++++++++++++++++++++----
+ 1 file changed, 42 insertions(+), 5 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
+index 08970d52954..afc4941fdb1 100644
+--- a/src/broadcom/compiler/vir_to_qpu.c
++++ b/src/broadcom/compiler/vir_to_qpu.c
+@@ -86,12 +86,22 @@ new_qpu_nop_before(struct qinst *inst)
+         return q;
+ }
+ 
++static void
++v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
++{
++        if (src.smimm)
++                unreachable("v3d71_set_src: pending handling small immediates");
++
++        assert(!src.magic);
++        *raddr = src.index;
++}
++
+ /**
+  * Allocates the src register (accumulator or register file) into the RADDR
+  * fields of the instruction.
+  */
+ static void
+-set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
++v3d33_set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+ {
+         if (src.smimm) {
+                 assert(instr->sig.small_imm_b);
+@@ -128,6 +138,24 @@ set_src(struct v3d_qpu_instr *instr, enum v3d_qpu_mux *mux, struct qpu_reg src)
+         }
+ }
+ 
++/*
++ * The main purpose of the following wrapper is to make calling set_src
++ * cleaner. This is the reason it receives both mux and raddr pointers. Those
++ * will be filled or not based on the device version.
++ */
++static void
++set_src(struct v3d_qpu_instr *instr,
++        enum v3d_qpu_mux *mux,
++        uint8_t *raddr,
++        struct qpu_reg src,
++        const struct v3d_device_info *devinfo)
++{
++        if (devinfo->ver < 71)
++                return v3d33_set_src(instr, mux, src);
++        else
++                return v3d71_set_src(instr, raddr, src);
++}
++
+ static bool
+ v3d33_mov_src_and_dst_equal(struct qinst *qinst)
+ {
+@@ -340,13 +368,18 @@ v3d_generate_code_block(struct v3d_compile *c,
+                                 qinst->qpu.sig_magic = dst.magic;
+                         } else if (qinst->qpu.alu.add.op != V3D_QPU_A_NOP) {
+                                 assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
++
+                                 if (nsrc >= 1) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.add.a.mux, src[0]);
++                                                &qinst->qpu.alu.add.a.mux,
++                                                &qinst->qpu.alu.add.a.raddr,
++                                                src[0], c->devinfo);
+                                 }
+                                 if (nsrc >= 2) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.add.b.mux, src[1]);
++                                                &qinst->qpu.alu.add.b.mux,
++                                                &qinst->qpu.alu.add.b.raddr,
++                                                src[1], c->devinfo);
+                                 }
+ 
+                                 qinst->qpu.alu.add.waddr = dst.index;
+@@ -354,11 +387,15 @@ v3d_generate_code_block(struct v3d_compile *c,
+                         } else {
+                                 if (nsrc >= 1) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.mul.a.mux, src[0]);
++                                                &qinst->qpu.alu.mul.a.mux,
++                                                &qinst->qpu.alu.mul.a.raddr,
++                                                src[0], c->devinfo);
+                                 }
+                                 if (nsrc >= 2) {
+                                         set_src(&qinst->qpu,
+-                                                &qinst->qpu.alu.mul.b.mux, src[1]);
++                                                &qinst->qpu.alu.mul.b.mux,
++                                                &qinst->qpu.alu.mul.b.raddr,
++                                                src[1], c->devinfo);
+                                 }
+ 
+                                 qinst->qpu.alu.mul.waddr = dst.index;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0022-broadcom-qpu_schedule-add-process_raddr_deps.patch b/projects/RPi/devices/RPi5/patches/mesa/0022-broadcom-qpu_schedule-add-process_raddr_deps.patch
new file mode 100644
index 0000000000..519e72d917
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0022-broadcom-qpu_schedule-add-process_raddr_deps.patch
@@ -0,0 +1,92 @@
+From fe89703008f2a3d6bfe6e260791f712013be5e48 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 9 Sep 2021 23:59:28 +0200
+Subject: [PATCH 022/142] broadcom/qpu_schedule: add process_raddr_deps
+
+On v71 we don't have muxes, but more raddrs. Add an equivalent
+add-deps function.
+---
+ src/broadcom/compiler/qpu_schedule.c | 52 +++++++++++++++++++++++-----
+ 1 file changed, 44 insertions(+), 8 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 455fa3867be..89254643c90 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -155,6 +155,7 @@ static void
+ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
+                  enum v3d_qpu_mux mux)
+ {
++        assert(state->devinfo->ver < 71);
+         switch (mux) {
+         case V3D_QPU_MUX_A:
+                 add_read_dep(state, state->last_rf[n->inst->qpu.raddr_a], n);
+@@ -171,6 +172,17 @@ process_mux_deps(struct schedule_state *state, struct schedule_node *n,
+         }
+ }
+ 
++
++static void
++process_raddr_deps(struct schedule_state *state, struct schedule_node *n,
++                   uint8_t raddr, bool is_small_imm)
++{
++        assert(state->devinfo->ver >= 71);
++
++        if (!is_small_imm)
++                add_read_dep(state, state->last_rf[raddr], n);
++}
++
+ static bool
+ tmu_write_is_sequence_terminator(uint32_t waddr)
+ {
+@@ -305,15 +317,39 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
+ 
+         /* XXX: LOAD_IMM */
+ 
+-        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0)
+-                process_mux_deps(state, n, inst->alu.add.a.mux);
+-        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1)
+-                process_mux_deps(state, n, inst->alu.add.b.mux);
++        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
++                if (devinfo->ver < 71) {
++                        process_mux_deps(state, n, inst->alu.add.a.mux);
++                } else {
++                        process_raddr_deps(state, n, inst->alu.add.a.raddr,
++                                           inst->sig.small_imm_a);
++                }
++        }
++        if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
++                if (devinfo->ver < 71) {
++                        process_mux_deps(state, n, inst->alu.add.b.mux);
++                } else {
++                        process_raddr_deps(state, n, inst->alu.add.b.raddr,
++                                           inst->sig.small_imm_b);
++                }
++        }
+ 
+-        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0)
+-                process_mux_deps(state, n, inst->alu.mul.a.mux);
+-        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1)
+-                process_mux_deps(state, n, inst->alu.mul.b.mux);
++        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
++                if (devinfo->ver < 71) {
++                        process_mux_deps(state, n, inst->alu.mul.a.mux);
++                } else {
++                        process_raddr_deps(state, n, inst->alu.mul.a.raddr,
++                                           inst->sig.small_imm_c);
++                }
++        }
++        if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
++                if (devinfo->ver < 71) {
++                        process_mux_deps(state, n, inst->alu.mul.b.mux);
++                } else {
++                        process_raddr_deps(state, n, inst->alu.mul.b.raddr,
++                                           inst->sig.small_imm_d);
++                }
++        }
+ 
+         switch (inst->alu.add.op) {
+         case V3D_QPU_A_VPMSETUP:
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0023-broadcom-qpu-update-disasm_raddr-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0023-broadcom-qpu-update-disasm_raddr-for-v71.patch
new file mode 100644
index 0000000000..e16ff0f540
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0023-broadcom-qpu-update-disasm_raddr-for-v71.patch
@@ -0,0 +1,128 @@
+From 20ce426df1ab2546332141f4bc4531ada754cdea Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 10 Sep 2021 01:20:44 +0200
+Subject: [PATCH 023/142] broadcom/qpu: update disasm_raddr for v71
+
+---
+ src/broadcom/qpu/qpu_disasm.c | 72 ++++++++++++++++++++++++++++++++---
+ 1 file changed, 66 insertions(+), 6 deletions(-)
+
+diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
+index 588a665f770..b613de781dc 100644
+--- a/src/broadcom/qpu/qpu_disasm.c
++++ b/src/broadcom/qpu/qpu_disasm.c
+@@ -56,8 +56,9 @@ pad_to(struct disasm_state *disasm, int n)
+ 
+ 
+ static void
+-v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+-                     const struct v3d_qpu_instr *instr, uint8_t mux)
++v3d33_qpu_disasm_raddr(struct disasm_state *disasm,
++                       const struct v3d_qpu_instr *instr,
++                       enum v3d_qpu_mux mux)
+ {
+         if (mux == V3D_QPU_MUX_A) {
+                 append(disasm, "rf%d", instr->raddr_a);
+@@ -82,6 +83,65 @@ v3d_qpu_disasm_raddr(struct disasm_state *disasm,
+         }
+ }
+ 
++enum v3d_qpu_input_class {
++        V3D_QPU_ADD_A,
++        V3D_QPU_ADD_B,
++        V3D_QPU_MUL_A,
++        V3D_QPU_MUL_B
++};
++
++static void
++v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
++                       const struct v3d_qpu_instr *instr,
++                       uint8_t raddr,
++                       enum v3d_qpu_input_class input_class)
++{
++        bool is_small_imm = false;
++        switch(input_class) {
++        case V3D_QPU_ADD_A:
++                is_small_imm = instr->sig.small_imm_a;
++                break;
++        case V3D_QPU_ADD_B:
++                is_small_imm = instr->sig.small_imm_b;
++                break;
++        case V3D_QPU_MUL_A:
++                is_small_imm = instr->sig.small_imm_c;
++                break;
++        case V3D_QPU_MUL_B:
++                is_small_imm = instr->sig.small_imm_d;
++                break;
++        }
++
++        if (is_small_imm) {
++                unreachable("Pending handling small immediates");
++                uint32_t val;
++                ASSERTED bool ok =
++                        v3d_qpu_small_imm_unpack(disasm->devinfo,
++                                                 raddr,
++                                                 &val);
++
++                if ((int)val >= -16 && (int)val <= 15)
++                        append(disasm, "%d", val);
++                else
++                        append(disasm, "0x%08x", val);
++                assert(ok);
++        } else {
++                append(disasm, "rf%d", raddr);
++        }
++}
++
++static void
++v3d_qpu_disasm_raddr(struct disasm_state *disasm,
++                     const struct v3d_qpu_instr *instr,
++                     const struct v3d_qpu_input *input,
++                     enum v3d_qpu_input_class input_class)
++{
++        if (disasm->devinfo->ver < 71)
++                v3d33_qpu_disasm_raddr(disasm, instr, input->mux);
++        else
++                v3d71_qpu_disasm_raddr(disasm, instr, input->raddr, input_class);
++}
++
+ static void
+ v3d_qpu_disasm_waddr(struct disasm_state *disasm, uint32_t waddr, bool magic)
+ {
+@@ -121,14 +181,14 @@ v3d_qpu_disasm_add(struct disasm_state *disasm,
+         if (num_src >= 1) {
+                 if (has_dst)
+                         append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.a.mux);
++                v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.a, V3D_QPU_ADD_A);
+                 append(disasm, "%s",
+                        v3d_qpu_unpack_name(instr->alu.add.a.unpack));
+         }
+ 
+         if (num_src >= 2) {
+                 append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.add.b.mux);
++                v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.add.b, V3D_QPU_ADD_B);
+                 append(disasm, "%s",
+                        v3d_qpu_unpack_name(instr->alu.add.b.unpack));
+         }
+@@ -164,14 +224,14 @@ v3d_qpu_disasm_mul(struct disasm_state *disasm,
+         if (num_src >= 1) {
+                 if (has_dst)
+                         append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.a.mux);
++                v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.a, V3D_QPU_MUL_A);
+                 append(disasm, "%s",
+                        v3d_qpu_unpack_name(instr->alu.mul.a.unpack));
+         }
+ 
+         if (num_src >= 2) {
+                 append(disasm, ", ");
+-                v3d_qpu_disasm_raddr(disasm, instr, instr->alu.mul.b.mux);
++                v3d_qpu_disasm_raddr(disasm, instr, &instr->alu.mul.b, V3D_QPU_MUL_B);
+                 append(disasm, "%s",
+                        v3d_qpu_unpack_name(instr->alu.mul.b.unpack));
+         }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0024-broadcom-qpu-return-false-on-qpu_writes_accumulatorX.patch b/projects/RPi/devices/RPi5/patches/mesa/0024-broadcom-qpu-return-false-on-qpu_writes_accumulatorX.patch
new file mode 100644
index 0000000000..3b82c34ea8
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0024-broadcom-qpu-return-false-on-qpu_writes_accumulatorX.patch
@@ -0,0 +1,59 @@
+From 7263fa24a3c57b1dcd4d870670cda86ae89aa28c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 15 Sep 2021 10:55:49 +0200
+Subject: [PATCH 024/142] broadcom/qpu: return false on
+ qpu_writes_accumulatorXX helpers for v71
+
+As v71 doesn't have accumulators (devinfo->has_accumulators set to
+false), those methods always return false.
+---
+ src/broadcom/qpu/qpu_instr.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index 8de99c611d5..7ec3c867260 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -854,6 +854,9 @@ bool
+ v3d_qpu_writes_r3(const struct v3d_device_info *devinfo,
+                   const struct v3d_qpu_instr *inst)
+ {
++        if(!devinfo->has_accumulators)
++                return false;
++
+         if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R3))
+                 return true;
+ 
+@@ -864,6 +867,9 @@ bool
+ v3d_qpu_writes_r4(const struct v3d_device_info *devinfo,
+                   const struct v3d_qpu_instr *inst)
+ {
++        if (!devinfo->has_accumulators)
++                return false;
++
+         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
+                     inst->alu.add.magic_write &&
+@@ -894,6 +900,9 @@ bool
+ v3d_qpu_writes_r5(const struct v3d_device_info *devinfo,
+                   const struct v3d_qpu_instr *inst)
+ {
++        if (!devinfo->has_accumulators)
++                return false;
++
+         if (qpu_writes_magic_waddr_explicitly(devinfo, inst, V3D_QPU_WADDR_R5))
+                 return true;
+ 
+@@ -904,6 +913,9 @@ bool
+ v3d_qpu_writes_accum(const struct v3d_device_info *devinfo,
+                      const struct v3d_qpu_instr *inst)
+ {
++        if (!devinfo->has_accumulators)
++                return false;
++
+         if (v3d_qpu_writes_r5(devinfo, inst))
+                 return true;
+         if (v3d_qpu_writes_r4(devinfo, inst))
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0025-broadcom-compiler-add-support-for-varyings-on-nir-to.patch b/projects/RPi/devices/RPi5/patches/mesa/0025-broadcom-compiler-add-support-for-varyings-on-nir-to.patch
new file mode 100644
index 0000000000..2552764a9e
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0025-broadcom-compiler-add-support-for-varyings-on-nir-to.patch
@@ -0,0 +1,116 @@
+From 6a9611c5a22218388bba419174d3343e0cdf773b Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 14 Sep 2021 10:42:55 +0200
+Subject: [PATCH 025/142] broadcom/compiler: add support for varyings on nir to
+ vir generation for v71
+
+Needs an update as v71 doesn't have accumulators anymore, and ldvary now
+uses rf0 to return the value.
+---
+ src/broadcom/compiler/nir_to_vir.c | 34 +++++++++++++++++-------------
+ 1 file changed, 19 insertions(+), 15 deletions(-)
+
+diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
+index ca072971f01..79a22c3bd08 100644
+--- a/src/broadcom/compiler/nir_to_vir.c
++++ b/src/broadcom/compiler/nir_to_vir.c
+@@ -1005,32 +1005,36 @@ emit_fragcoord_input(struct v3d_compile *c, int attr)
+ 
+ static struct qreg
+ emit_smooth_varying(struct v3d_compile *c,
+-                    struct qreg vary, struct qreg w, struct qreg r5)
++                    struct qreg vary, struct qreg w, struct qreg c_reg)
+ {
+-        return vir_FADD(c, vir_FMUL(c, vary, w), r5);
++        return vir_FADD(c, vir_FMUL(c, vary, w), c_reg);
+ }
+ 
+ static struct qreg
+ emit_noperspective_varying(struct v3d_compile *c,
+-                           struct qreg vary, struct qreg r5)
++                           struct qreg vary, struct qreg c_reg)
+ {
+-        return vir_FADD(c, vir_MOV(c, vary), r5);
++        return vir_FADD(c, vir_MOV(c, vary), c_reg);
+ }
+ 
+ static struct qreg
+ emit_flat_varying(struct v3d_compile *c,
+-                  struct qreg vary, struct qreg r5)
++                  struct qreg vary, struct qreg c_reg)
+ {
+         vir_MOV_dest(c, c->undef, vary);
+-        return vir_MOV(c, r5);
++        return vir_MOV(c, c_reg);
+ }
+ 
+ static struct qreg
+ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
+                       int8_t input_idx, uint8_t swizzle, int array_index)
+ {
+-        struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
+-        struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
++        struct qreg c_reg; /* C coefficient */
++
++        if (c->devinfo->has_accumulators)
++                c_reg = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);
++        else
++                c_reg = vir_reg(QFILE_REG, 0);
+ 
+         struct qinst *ldvary = NULL;
+         struct qreg vary;
+@@ -1041,7 +1045,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
+                 vary = vir_emit_def(c, ldvary);
+         } else {
+                 vir_NOP(c)->qpu.sig.ldvary = true;
+-                vary = r3;
++                vary = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
+         }
+ 
+         /* Store the input value before interpolation so we can implement
+@@ -1050,7 +1054,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
+         if (input_idx >= 0) {
+                 assert(var);
+                 c->interp[input_idx].vp = vary;
+-                c->interp[input_idx].C = vir_MOV(c, r5);
++                c->interp[input_idx].C = vir_MOV(c, c_reg);
+                 c->interp[input_idx].mode = var->data.interpolation;
+         }
+ 
+@@ -1060,7 +1064,7 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
+          */
+         if (!var) {
+                 assert(input_idx < 0);
+-                return emit_smooth_varying(c, vary, c->payload_w, r5);
++                return emit_smooth_varying(c, vary, c->payload_w, c_reg);
+         }
+ 
+         int i = c->num_inputs++;
+@@ -1075,20 +1079,20 @@ emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
+                 if (var->data.centroid) {
+                         BITSET_SET(c->centroid_flags, i);
+                         result = emit_smooth_varying(c, vary,
+-                                                     c->payload_w_centroid, r5);
++                                                     c->payload_w_centroid, c_reg);
+                 } else {
+-                        result = emit_smooth_varying(c, vary, c->payload_w, r5);
++                        result = emit_smooth_varying(c, vary, c->payload_w, c_reg);
+                 }
+                 break;
+ 
+         case INTERP_MODE_NOPERSPECTIVE:
+                 BITSET_SET(c->noperspective_flags, i);
+-                result = emit_noperspective_varying(c, vary, r5);
++                result = emit_noperspective_varying(c, vary, c_reg);
+                 break;
+ 
+         case INTERP_MODE_FLAT:
+                 BITSET_SET(c->flat_shade_flags, i);
+-                result = emit_flat_varying(c, vary, r5);
++                result = emit_flat_varying(c, vary, c_reg);
+                 break;
+ 
+         default:
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0026-broadcom-compiler-payload_w-is-loaded-on-rf3-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0026-broadcom-compiler-payload_w-is-loaded-on-rf3-for-v71.patch
new file mode 100644
index 0000000000..7302726b66
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0026-broadcom-compiler-payload_w-is-loaded-on-rf3-for-v71.patch
@@ -0,0 +1,55 @@
+From 06af15a60f7a9c135893e5f8934b8030c1da95f9 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 15 Sep 2021 01:14:15 +0200
+Subject: [PATCH 026/142] broadcom/compiler: payload_w is loaded on rf3 for v71
+
+And in general rf0 is now used for other needs.
+---
+ src/broadcom/compiler/nir_to_vir.c            | 6 +++++-
+ src/broadcom/compiler/vir_register_allocate.c | 6 +++++-
+ 2 files changed, 10 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
+index 79a22c3bd08..1a05b279a2d 100644
+--- a/src/broadcom/compiler/nir_to_vir.c
++++ b/src/broadcom/compiler/nir_to_vir.c
+@@ -4325,7 +4325,11 @@ nir_to_vir(struct v3d_compile *c)
+ {
+         switch (c->s->info.stage) {
+         case MESA_SHADER_FRAGMENT:
+-                c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
++                if (c->devinfo->ver < 71)
++                        c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
++                else
++                        c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 3));
++
+                 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
+                 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));
+ 
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 1f495180784..eca9a6751a6 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -1034,6 +1034,11 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+         if (inst->src[0].file == QFILE_REG) {
+                 switch (inst->src[0].index) {
+                 case 0:
++                        /* V3D 7.x doesn't use rf0 for thread payload */
++                        if (c->devinfo->ver >= 71)
++                                break;
++                        else
++                                FALLTHROUGH;
+                 case 1:
+                 case 2:
+                 case 3: {
+@@ -1163,7 +1168,6 @@ v3d_register_allocate(struct v3d_compile *c)
+         vir_for_each_inst_inorder(inst, c) {
+                 inst->ip = ip++;
+                 update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
+-
+         }
+ 
+         /* Set the register classes for all our temporaries in the graph */
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0027-broadcom-qpu_schedule-update-write-deps-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0027-broadcom-qpu_schedule-update-write-deps-for-v71.patch
new file mode 100644
index 0000000000..05010aadd8
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0027-broadcom-qpu_schedule-update-write-deps-for-v71.patch
@@ -0,0 +1,30 @@
+From d38d8056903b9a4f96ab56261ac3b3c3be0af4fb Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 15 Sep 2021 11:12:59 +0200
+Subject: [PATCH 027/142] broadcom/qpu_schedule: update write deps for v71
+
+We just need to add a write dep if rf0 is written implicitly.
+
+Note that we don't need to check if we have accumulators when checking
+for r3/r4/r5, as v3d_qpu_writes_rX would return false for hw versions
+that don't have accumulators.
+---
+ src/broadcom/compiler/qpu_schedule.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 89254643c90..2fa9031d7b6 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -422,6 +422,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
+                 add_write_dep(state, &state->last_r[4], n);
+         if (v3d_qpu_writes_r5(devinfo, inst))
+                 add_write_dep(state, &state->last_r[5], n);
++        if (v3d_qpu_writes_rf0_implicitly(devinfo, inst))
++                add_write_dep(state, &state->last_rf[0], n);
+ 
+         /* If we add any more dependencies here we should consider whether we
+          * also need to update qpu_inst_after_thrsw_valid_in_delay_slot.
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0028-broadcom-compiler-update-register-classes-to-not-inc.patch b/projects/RPi/devices/RPi5/patches/mesa/0028-broadcom-compiler-update-register-classes-to-not-inc.patch
new file mode 100644
index 0000000000..76985d943a
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0028-broadcom-compiler-update-register-classes-to-not-inc.patch
@@ -0,0 +1,140 @@
+From 7e2a2be830b1672ab846389a46b5d09bad0f7a98 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 16 Sep 2021 00:49:25 +0200
+Subject: [PATCH 028/142] broadcom/compiler: update register classes to not
+ include accumulators on v71
+
+---
+ src/broadcom/compiler/vir_register_allocate.c | 56 ++++++++++++-------
+ 1 file changed, 36 insertions(+), 20 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index eca9a6751a6..7b3f6c41934 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -44,10 +44,15 @@ get_phys_index(const struct v3d_device_info *devinfo)
+ #define CLASS_BITS_PHYS   (1 << 0)
+ #define CLASS_BITS_ACC    (1 << 1)
+ #define CLASS_BITS_R5     (1 << 4)
+-#define CLASS_BITS_ANY    (CLASS_BITS_PHYS | \
+-                           CLASS_BITS_ACC | \
+-                           CLASS_BITS_R5)
+ 
++static uint8_t
++get_class_bit_any(const struct v3d_device_info *devinfo)
++{
++        if (devinfo->has_accumulators)
++                return (CLASS_BITS_PHYS | CLASS_BITS_ACC | CLASS_BITS_R5);
++        else
++                return CLASS_BITS_PHYS;
++}
+ static inline uint32_t
+ temp_to_node(struct v3d_compile *c, uint32_t temp)
+ {
+@@ -82,11 +87,13 @@ choose_reg_class(struct v3d_compile *c, uint8_t class_bits)
+         if (class_bits == CLASS_BITS_PHYS) {
+                 return c->compiler->reg_class_phys[c->thread_index];
+         } else if (class_bits == (CLASS_BITS_R5)) {
++                assert(c->devinfo->has_accumulators);
+                 return c->compiler->reg_class_r5[c->thread_index];
+         } else if (class_bits == (CLASS_BITS_PHYS | CLASS_BITS_ACC)) {
++                assert(c->devinfo->has_accumulators);
+                 return c->compiler->reg_class_phys_or_acc[c->thread_index];
+         } else {
+-                assert(class_bits == CLASS_BITS_ANY);
++                assert(class_bits == get_class_bit_any(c->devinfo));
+                 return c->compiler->reg_class_any[c->thread_index];
+         }
+ }
+@@ -447,7 +454,7 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
+          */
+         assert(c->disable_ldunif_opt);
+         struct qreg offset = vir_uniform_ui(c, spill_offset);
+-        add_node(c, offset.index, CLASS_BITS_ANY);
++        add_node(c, offset.index, get_class_bit_any(c->devinfo));
+ 
+         /* We always enable per-quad on spills/fills to ensure we spill
+          * any channels involved with helper invocations.
+@@ -645,7 +652,8 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+                                          * instruction immediately after, so
+                                          * we can use any register class for it.
+                                          */
+-                                        add_node(c, unif.index, CLASS_BITS_ANY);
++                                        add_node(c, unif.index,
++                                                 get_class_bit_any(c->devinfo));
+                                 } else if (spill_type == SPILL_TYPE_RECONSTRUCT) {
+                                         struct qreg temp =
+                                                 reconstruct_temp(c, reconstruct_op);
+@@ -924,31 +932,38 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
+         for (int threads = 0; threads < max_thread_index; threads++) {
+                 compiler->reg_class_any[threads] =
+                         ra_alloc_contig_reg_class(compiler->regs, 1);
+-                compiler->reg_class_r5[threads] =
+-                        ra_alloc_contig_reg_class(compiler->regs, 1);
+-                compiler->reg_class_phys_or_acc[threads] =
+-                        ra_alloc_contig_reg_class(compiler->regs, 1);
++                if (compiler->devinfo->has_accumulators) {
++                        compiler->reg_class_r5[threads] =
++                                ra_alloc_contig_reg_class(compiler->regs, 1);
++                        compiler->reg_class_phys_or_acc[threads] =
++                                ra_alloc_contig_reg_class(compiler->regs, 1);
++                }
+                 compiler->reg_class_phys[threads] =
+                         ra_alloc_contig_reg_class(compiler->regs, 1);
+ 
+                 for (int i = phys_index;
+                      i < phys_index + (PHYS_COUNT >> threads); i++) {
+-                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
++                        if (compiler->devinfo->has_accumulators)
++                                ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+                         ra_class_add_reg(compiler->reg_class_phys[threads], i);
+                         ra_class_add_reg(compiler->reg_class_any[threads], i);
+                 }
+ 
+-                for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+-                        ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+-                        ra_class_add_reg(compiler->reg_class_any[threads], i);
++                if (compiler->devinfo->has_accumulators) {
++                        for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
++                                ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
++                                ra_class_add_reg(compiler->reg_class_any[threads], i);
++                        }
+                 }
+                 /* r5 can only store a single 32-bit value, so not much can
+                  * use it.
+                  */
+-                ra_class_add_reg(compiler->reg_class_r5[threads],
+-                                 ACC_INDEX + 5);
+-                ra_class_add_reg(compiler->reg_class_any[threads],
+-                                 ACC_INDEX + 5);
++                if (compiler->devinfo->has_accumulators) {
++                        ra_class_add_reg(compiler->reg_class_r5[threads],
++                                         ACC_INDEX + 5);
++                        ra_class_add_reg(compiler->reg_class_any[threads],
++                                         ACC_INDEX + 5);
++                }
+         }
+ 
+         ra_set_finalize(compiler->regs, NULL);
+@@ -1086,7 +1101,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+         }
+ 
+         /* All accumulators are invalidated across a thread switch. */
+-        if (inst->qpu.sig.thrsw) {
++        if (inst->qpu.sig.thrsw && c->devinfo->has_accumulators) {
+                 for (int i = 0; i < c->num_temps; i++) {
+                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+                                 set_temp_class_bits(c, i,
+@@ -1157,7 +1172,8 @@ v3d_register_allocate(struct v3d_compile *c)
+                         uint32_t t = node_to_temp(c, i);
+                         c->nodes.info[i].priority =
+                                 c->temp_end[t] - c->temp_start[t];
+-                        c->nodes.info[i].class_bits = CLASS_BITS_ANY;
++                        c->nodes.info[i].class_bits =
++                                get_class_bit_any(c->devinfo);
+                 }
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0029-broadcom-compiler-implement-reads-writes-too-soon-ch.patch b/projects/RPi/devices/RPi5/patches/mesa/0029-broadcom-compiler-implement-reads-writes-too-soon-ch.patch
new file mode 100644
index 0000000000..4af561fa4a
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0029-broadcom-compiler-implement-reads-writes-too-soon-ch.patch
@@ -0,0 +1,109 @@
+From 0157228c729b8812dc4900fa24db63b7d27aa342 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 23 Sep 2021 11:19:58 +0200
+Subject: [PATCH 029/142] broadcom/compiler: implement "reads/writes too soon"
+ checks for v71
+
+---
+ src/broadcom/compiler/qpu_schedule.c | 65 ++++++++++++++++++++++------
+ 1 file changed, 51 insertions(+), 14 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 2fa9031d7b6..4db0c2e72da 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -562,7 +562,24 @@ mux_reads_too_soon(struct choose_scoreboard *scoreboard,
+ }
+ 
+ static bool
+-reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
++reads_too_soon(struct choose_scoreboard *scoreboard,
++               const struct v3d_qpu_instr *inst, uint8_t raddr)
++{
++        switch (raddr) {
++        case 0: /* ldvary delayed write of C coefficient to rf0 */
++                if (scoreboard->tick - scoreboard->last_ldvary_tick <= 1)
++                        return true;
++                break;
++        default:
++                break;
++        }
++
++        return false;
++}
++
++static bool
++reads_too_soon_after_write(const struct v3d_device_info *devinfo,
++                           struct choose_scoreboard *scoreboard,
+                            struct qinst *qinst)
+ {
+         const struct v3d_qpu_instr *inst = &qinst->qpu;
+@@ -574,24 +591,44 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard,
+         assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
+ 
+         if (inst->alu.add.op != V3D_QPU_A_NOP) {
+-                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux)) {
+-                        return true;
++                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 0) {
++                        if (devinfo->ver < 71) {
++                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.a.mux))
++                                        return true;
++                        } else {
++                                if (reads_too_soon(scoreboard, inst, inst->alu.add.a.raddr))
++                                        return true;
++                        }
+                 }
+-                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux)) {
+-                        return true;
++                if (v3d_qpu_add_op_num_src(inst->alu.add.op) > 1) {
++                        if (devinfo->ver < 71) {
++                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.add.b.mux))
++                                        return true;
++                        } else {
++                                if (reads_too_soon(scoreboard, inst, inst->alu.add.b.raddr))
++                                        return true;
++                        }
+                 }
+         }
+ 
+         if (inst->alu.mul.op != V3D_QPU_M_NOP) {
+-                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux)) {
+-                        return true;
++                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 0) {
++                        if (devinfo->ver < 71) {
++                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.a.mux))
++                                        return true;
++                        } else {
++                                if (reads_too_soon(scoreboard, inst, inst->alu.mul.a.raddr))
++                                        return true;
++                        }
++                }
+-                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1 &&
+-                    mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux)) {
+-                        return true;
++                if (v3d_qpu_mul_op_num_src(inst->alu.mul.op) > 1) {
++                        if (devinfo->ver < 71) {
++                                if (mux_reads_too_soon(scoreboard, inst, inst->alu.mul.b.mux))
++                                        return true;
++                        } else {
++                                if (reads_too_soon(scoreboard, inst, inst->alu.mul.b.raddr))
++                                        return true;
++                        }
+                 }
+         }
+ 
+@@ -1147,7 +1184,7 @@ retry:
+                  *  regfile A or B that was written to by the previous
+                  *  instruction."
+                  */
+-                if (reads_too_soon_after_write(scoreboard, n->inst))
++                if (reads_too_soon_after_write(c->devinfo, scoreboard, n->inst))
+                         continue;
+ 
+                 if (writes_too_soon_after_write(c->devinfo, scoreboard, n->inst))
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0030-broadcom-compiler-implement-read-stall-check-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0030-broadcom-compiler-implement-read-stall-check-for-v71.patch
new file mode 100644
index 0000000000..9704a18a6b
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0030-broadcom-compiler-implement-read-stall-check-for-v71.patch
@@ -0,0 +1,118 @@
+From 3fb3333bdf9699157cf0a2bd46ba4c25058bc5c1 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 23 Sep 2021 11:44:59 +0200
+Subject: [PATCH 030/142] broadcom/compiler: implement read stall check for v71
+
+---
+ src/broadcom/compiler/qpu_schedule.c | 32 +++++++++++++++++-----------
+ src/broadcom/qpu/qpu_instr.c         | 12 +++++++++++
+ src/broadcom/qpu/qpu_instr.h         |  2 ++
+ 3 files changed, 34 insertions(+), 12 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 4db0c2e72da..b78abe003e9 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -679,29 +679,37 @@ pixel_scoreboard_too_soon(struct v3d_compile *c,
+ }
+ 
+ static bool
+-qpu_instruction_uses_rf(const struct v3d_qpu_instr *inst,
++qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
++                        const struct v3d_qpu_instr *inst,
+                         uint32_t waddr) {
+ 
+         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+            return false;
+ 
+-        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
+-            inst->raddr_a == waddr)
+-              return true;
++        if (devinfo->ver < 71) {
++                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A) &&
++                    inst->raddr_a == waddr)
++                        return true;
+ 
+-        if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
+-            !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+-              return true;
++                if (v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B) &&
++                    !inst->sig.small_imm_b && (inst->raddr_b == waddr))
++                        return true;
++        } else {
++                /* FIXME: skip if small immediate */
++                if (v3d71_qpu_reads_raddr(inst, waddr))
++                        return true;
++        }
+ 
+         return false;
+ }
+ 
+ static bool
+-mux_read_stalls(struct choose_scoreboard *scoreboard,
+-                const struct v3d_qpu_instr *inst)
++read_stalls(const struct v3d_device_info *devinfo,
++            struct choose_scoreboard *scoreboard,
++            const struct v3d_qpu_instr *inst)
+ {
+         return scoreboard->tick == scoreboard->last_stallable_sfu_tick + 1 &&
+-                qpu_instruction_uses_rf(inst,
++                qpu_instruction_uses_rf(devinfo, inst,
+                                         scoreboard->last_stallable_sfu_reg);
+ }
+ 
+@@ -1319,7 +1327,7 @@ retry:
+ 
+                 int prio = get_instruction_priority(c->devinfo, inst);
+ 
+-                if (mux_read_stalls(scoreboard, inst)) {
++                if (read_stalls(c->devinfo, scoreboard, inst)) {
+                         /* Don't merge an instruction that stalls */
+                         if (prev_inst)
+                                 continue;
+@@ -2389,7 +2397,7 @@ schedule_instructions(struct v3d_compile *c,
+                                         }
+                                 }
+                         }
+-                        if (mux_read_stalls(scoreboard, inst))
++                        if (read_stalls(c->devinfo, scoreboard, inst))
+                                 c->qpu_inst_stalled_count++;
+                 }
+ 
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index 7ec3c867260..e8bbb2141b0 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -956,6 +956,18 @@ v3d_qpu_uses_mux(const struct v3d_qpu_instr *inst, enum v3d_qpu_mux mux)
+                 (mul_nsrc > 1 && inst->alu.mul.b.mux == mux));
+ }
+ 
++bool
++v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
++{
++        int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
++        int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
++
++        return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) ||
++               (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) ||
++               (mul_nsrc > 0 && inst->alu.mul.a.raddr == raddr) ||
++               (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
++}
++
+ bool
+ v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
+                            const struct v3d_qpu_sig *sig)
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index a25be8e0ee6..9f7582ab06d 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -494,4 +494,6 @@ bool v3d_qpu_unpacks_f32(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ 
+ bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
++
++bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
+ #endif
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0031-broadcom-compiler-add-a-v3d71_qpu_writes_waddr_expli.patch b/projects/RPi/devices/RPi5/patches/mesa/0031-broadcom-compiler-add-a-v3d71_qpu_writes_waddr_expli.patch
new file mode 100644
index 0000000000..3aec307f63
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0031-broadcom-compiler-add-a-v3d71_qpu_writes_waddr_expli.patch
@@ -0,0 +1,65 @@
+From cbe0a7a06a5fb9b3f28acba8c9cac362a6bc5324 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 6 Oct 2021 13:58:00 +0200
+Subject: [PATCH 031/142] broadcom/compiler: add a
+ v3d71_qpu_writes_waddr_explicitly helper
+
+---
+ src/broadcom/qpu/qpu_instr.c | 28 ++++++++++++++++++++++++++++
+ src/broadcom/qpu/qpu_instr.h |  3 +++
+ 2 files changed, 31 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index e8bbb2141b0..feb6b343c1c 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -968,6 +968,34 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
+                (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
+ }
+ 
++bool
++v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
++                                  const struct v3d_qpu_instr *inst,
++                                  uint8_t waddr)
++{
++        if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
++                return false;
++
++        if (v3d_qpu_add_op_has_dst(inst->alu.add.op) &&
++            !inst->alu.add.magic_write &&
++            inst->alu.add.waddr == waddr) {
++                return true;
++        }
++
++        if (v3d_qpu_mul_op_has_dst(inst->alu.mul.op) &&
++            !inst->alu.mul.magic_write &&
++            inst->alu.mul.waddr == waddr) {
++                return true;
++        }
++
++        if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
++            !inst->sig_magic && inst->sig_addr == waddr) {
++                return true;
++        }
++
++        return false;
++}
++
+ bool
+ v3d_qpu_sig_writes_address(const struct v3d_device_info *devinfo,
+                            const struct v3d_qpu_sig *sig)
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 9f7582ab06d..50a69ce8c3a 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -496,4 +496,7 @@ bool v3d_qpu_unpacks_f16(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ bool v3d_qpu_is_nop(struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ 
+ bool v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr);
++bool v3d71_qpu_writes_waddr_explicitly(const struct v3d_device_info *devinfo,
++                                       const struct v3d_qpu_instr *inst,
++                                       uint8_t waddr);
+ #endif
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0032-broadcom-compiler-prevent-rf2-3-usage-in-thread-end-.patch b/projects/RPi/devices/RPi5/patches/mesa/0032-broadcom-compiler-prevent-rf2-3-usage-in-thread-end-.patch
new file mode 100644
index 0000000000..f5e3fb5f22
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0032-broadcom-compiler-prevent-rf2-3-usage-in-thread-end-.patch
@@ -0,0 +1,67 @@
+From 92e91a9b22ae61dc9f39880e8fdaa7714789efdb Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 27 Sep 2021 11:49:24 +0200
+Subject: [PATCH 032/142] broadcom/compiler: prevent rf2-3 usage in thread end
+ delay slots for v71
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
+Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
+---
+ src/broadcom/compiler/qpu_schedule.c | 37 +++++++++++++++++++++-------
+ 1 file changed, 28 insertions(+), 9 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index b78abe003e9..839c0c62315 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -1691,16 +1691,35 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+                 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
+                         return false;
+ 
+-                /* RF0-2 might be overwritten during the delay slots by
+-                 * fragment shader setup.
+-                 */
+-                if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
+-                        return false;
++                if (c->devinfo->ver <= 42) {
++                        /* RF0-2 might be overwritten during the delay slots by
++                         * fragment shader setup.
++                         */
++                        if (inst->raddr_a < 3 && v3d_qpu_uses_mux(inst, V3D_QPU_MUX_A))
++                                return false;
+ 
+-                if (inst->raddr_b < 3 &&
+-                    !inst->sig.small_imm_b &&
+-                    v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
+-                        return false;
++                        if (inst->raddr_b < 3 &&
++                            !inst->sig.small_imm_b &&
++                            v3d_qpu_uses_mux(inst, V3D_QPU_MUX_B)) {
++                                return false;
++                        }
++                }
++
++                if (c->devinfo->ver >= 71) {
++                        /* RF2-3 might be overwritten during the delay slots by
++                         * fragment shader setup.
++                         *
++                         * FIXME: handle small immediate cases
++                         */
++                        if (v3d71_qpu_reads_raddr(inst, 2) ||
++                            v3d71_qpu_reads_raddr(inst, 3)) {
++                                return false;
++                        }
++
++                        if (v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 2) ||
++                            v3d71_qpu_writes_waddr_explicitly(c->devinfo, inst, 3)) {
++                                return false;
++                        }
+                 }
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0033-broadcom-qpu-add-new-ADD-opcodes-for-FMOV-MOV-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0033-broadcom-qpu-add-new-ADD-opcodes-for-FMOV-MOV-in-v71.patch
new file mode 100644
index 0000000000..4a2b89038b
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0033-broadcom-qpu-add-new-ADD-opcodes-for-FMOV-MOV-in-v71.patch
@@ -0,0 +1,78 @@
+From 68a1545eb973e41608534ff05a9e84a86c046453 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 27 Sep 2021 13:26:04 +0200
+Subject: [PATCH 033/142] broadcom/qpu: add new ADD opcodes for FMOV/MOV in v71
+
+---
+ src/broadcom/qpu/qpu_instr.c |  5 +++++
+ src/broadcom/qpu/qpu_instr.h |  4 ++++
+ src/broadcom/qpu/qpu_pack.c  | 15 +++++++++++++++
+ 3 files changed, 24 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index feb6b343c1c..195a0dcd232 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -177,6 +177,8 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
+                 [V3D_QPU_A_ITOF] = "itof",
+                 [V3D_QPU_A_CLZ] = "clz",
+                 [V3D_QPU_A_UTOF] = "utof",
++                [V3D_QPU_A_MOV] = "mov",
++                [V3D_QPU_A_FMOV] = "fmov",
+         };
+ 
+         if (op >= ARRAY_SIZE(op_names))
+@@ -458,6 +460,9 @@ static const uint8_t add_op_args[] = {
+         [V3D_QPU_A_ITOF] = D | A,
+         [V3D_QPU_A_CLZ] = D | A,
+         [V3D_QPU_A_UTOF] = D | A,
++
++        [V3D_QPU_A_MOV] = D | A,
++        [V3D_QPU_A_FMOV] = D | A,
+ };
+ 
+ static const uint8_t mul_op_args[] = {
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 50a69ce8c3a..c86a4119c54 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -227,6 +227,10 @@ enum v3d_qpu_add_op {
+         V3D_QPU_A_ITOF,
+         V3D_QPU_A_CLZ,
+         V3D_QPU_A_UTOF,
++
++        /* V3D 7.x */
++        V3D_QPU_A_FMOV,
++        V3D_QPU_A_MOV,
+ };
+ 
+ enum v3d_qpu_mul_op {
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 4045275cb9a..0e504e65fbf 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -776,6 +776,21 @@ static const struct opcode_desc add_ops_v71[] = {
+ 
+         { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
+         { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
++
++        { 249, 249, .raddr_mask = OP_RANGE(0, 2),   V3D_QPU_A_FMOV, 71 },
++        { 249, 249, .raddr_mask = OP_RANGE(4, 6),   V3D_QPU_A_FMOV, 71 },
++        { 249, 249, .raddr_mask = OP_RANGE(8, 10),  V3D_QPU_A_FMOV, 71 },
++        { 249, 249, .raddr_mask = OP_RANGE(12, 14), V3D_QPU_A_FMOV, 71 },
++        { 249, 249, .raddr_mask = OP_RANGE(16, 18), V3D_QPU_A_FMOV, 71 },
++        { 249, 249, .raddr_mask = OP_RANGE(20, 22), V3D_QPU_A_FMOV, 71 },
++        { 249, 249, .raddr_mask = OP_RANGE(24, 26), V3D_QPU_A_FMOV, 71 },
++
++        { 249, 249, .raddr_mask = OP_MASK(3),  V3D_QPU_A_MOV, 71 },
++        { 249, 249, .raddr_mask = OP_MASK(7),  V3D_QPU_A_MOV, 71 },
++        { 249, 249, .raddr_mask = OP_MASK(11), V3D_QPU_A_MOV, 71 },
++        { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
++        { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
++
+ };
+ 
+ static const struct opcode_desc mul_ops_v71[] = {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0034-broadcom-qpu-fix-packing-unpacking-of-fmov-variants-.patch b/projects/RPi/devices/RPi5/patches/mesa/0034-broadcom-qpu-fix-packing-unpacking-of-fmov-variants-.patch
new file mode 100644
index 0000000000..df5222700d
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0034-broadcom-qpu-fix-packing-unpacking-of-fmov-variants-.patch
@@ -0,0 +1,46 @@
+From 8dbbb7e22b694fdc62376d112b3dc6105d556c63 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 4 Oct 2021 13:07:35 +0200
+Subject: [PATCH 034/142] broadcom/qpu: fix packing/unpacking of fmov variants
+ for v71
+
+---
+ src/broadcom/qpu/qpu_pack.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 0e504e65fbf..0eb820b3f10 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -1405,9 +1405,9 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
+                 break;
+ 
+         case V3D_QPU_M_FMOV:
+-                instr->alu.mul.output_pack = (raddr_d >> 2) & 1;
++                instr->alu.mul.output_pack = raddr_d & 0x3;
+ 
+-                if (!v3d_qpu_float32_unpack_unpack(raddr_d & 0x3,
++                if (!v3d_qpu_float32_unpack_unpack((raddr_d >> 2) & 0x7,
+                                                    &instr->alu.mul.a.unpack)) {
+                         return false;
+                 }
+@@ -2046,14 +2046,13 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
+                                                &packed)) {
+                         return false;
+                 }
+-                opcode |= (packed >> 1) & 1;
+-                raddr_d = (packed & 1) << 2;
++                raddr_d |= packed;
+ 
+                 if (!v3d_qpu_float32_unpack_pack(instr->alu.mul.a.unpack,
+                                                  &packed)) {
+                         return false;
+                 }
+-                raddr_d |= packed;
++                raddr_d |= packed << 2;
+                 break;
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0035-broadcom-qpu-implement-switch-rules-for-fmin-fmax-fa.patch b/projects/RPi/devices/RPi5/patches/mesa/0035-broadcom-qpu-implement-switch-rules-for-fmin-fmax-fa.patch
new file mode 100644
index 0000000000..2e244c13dc
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0035-broadcom-qpu-implement-switch-rules-for-fmin-fmax-fa.patch
@@ -0,0 +1,107 @@
+From 63d0059ebef288afb0e2e746dadda8c2238bdfcb Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 28 Sep 2021 01:17:08 +0200
+Subject: [PATCH 035/142] broadcom/qpu: implement switch rules for fmin/fmax
+ fadd/faddnf for v71
+
+They use the same opcodes, and switch between one and the other based
+on raddr.
+
+Note that the rule also takes into account whether small_imm_a/b are
+used. That is not in place yet, so that part is hardcoded. It will be
+updated later when small immediates support for v71 gets implemented.
+---
+ src/broadcom/qpu/qpu_pack.c | 48 +++++++++++++++++++++++++++++++++++++
+ 1 file changed, 48 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 0eb820b3f10..7a262f18ac3 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -651,7 +651,9 @@ static const struct opcode_desc mul_ops_v33[] = {
+  * opcodes that changed on v71
+  */
+ static const struct opcode_desc add_ops_v71[] = {
++        /* FADD is FADDNF depending on the order of the raddr_a/raddr_b. */
+         { 0,   47,  .raddr_mask = ANYOPMASK, V3D_QPU_A_FADD },
++        { 0,   47,  .raddr_mask = ANYOPMASK, V3D_QPU_A_FADDNF },
+         { 53,  55,  .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+         { 56,  56,  .raddr_mask = ANYOPMASK, V3D_QPU_A_ADD },
+         { 57,  59,  .raddr_mask = ANYOPMASK, V3D_QPU_A_VFPACK },
+@@ -666,6 +668,10 @@ static const struct opcode_desc add_ops_v71[] = {
+         { 125, 125, .raddr_mask = ANYOPMASK, V3D_QPU_A_SHR },
+         { 126, 126, .raddr_mask = ANYOPMASK, V3D_QPU_A_ASR },
+         { 127, 127, .raddr_mask = ANYOPMASK, V3D_QPU_A_ROR },
++        /* FMIN is instead FMAX depending on the raddr_a/b order. */
++        { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMIN },
++        { 128, 175, .raddr_mask = ANYOPMASK, V3D_QPU_A_FMAX },
++        { 176, 180, .raddr_mask = ANYOPMASK, V3D_QPU_A_VFMIN },
+ 
+         { 181, 181, .raddr_mask = ANYOPMASK, V3D_QPU_A_AND },
+         { 182, 182, .raddr_mask = ANYOPMASK, V3D_QPU_A_OR },
+@@ -1162,6 +1168,22 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
+ 
+         instr->alu.add.op = desc->op;
+ 
++        /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the
++         * operands.
++         */
++        /* FIXME: for now hardcoded values, until we got the small_imm support
++         * in place
++         */
++        uint32_t small_imm_a = 0;
++        uint32_t small_imm_b = 0;
++        if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
++            small_imm_b *256 + (op & 3) * 64 + raddr_b) {
++                if (instr->alu.add.op == V3D_QPU_A_FMIN)
++                        instr->alu.add.op = V3D_QPU_A_FMAX;
++                if (instr->alu.add.op == V3D_QPU_A_FADD)
++                        instr->alu.add.op = V3D_QPU_A_FADDNF;
++        }
++
+         /* Some QPU ops require a bit more than just basic opcode and mux a/b
+          * comparisons to distinguish them.
+          */
+@@ -1754,6 +1776,11 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 uint32_t output_pack;
+                 uint32_t a_unpack;
+                 uint32_t b_unpack;
++                /* FIXME: for now hardcoded values, until we got the small_imm
++                 * support in place
++                 */
++                uint32_t small_imm_a = 0;
++                uint32_t small_imm_b = 0;
+ 
+                 if (instr->alu.add.op != V3D_QPU_A_FCMP) {
+                         if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+@@ -1773,6 +1800,27 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+                         return false;
+                 }
+ 
++                /* These operations with commutative operands are
++                 * distinguished by which order their operands come in.
++                 */
++                bool ordering =
++                        small_imm_a * 256 + a_unpack * 64 + raddr_a >
++                        small_imm_b * 256 + b_unpack * 64 + raddr_b;
++                if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
++                      instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
++                    ((instr->alu.add.op == V3D_QPU_A_FMAX ||
++                      instr->alu.add.op == V3D_QPU_A_FADDNF) && !ordering)) {
++                        uint32_t temp;
++
++                        temp = a_unpack;
++                        a_unpack = b_unpack;
++                        b_unpack = temp;
++
++                        temp = raddr_a;
++                        raddr_a = raddr_b;
++                        raddr_b = temp;
++                }
++
+                 opcode |= a_unpack << 2;
+                 opcode |= b_unpack << 0;
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0036-broadcom-compiler-make-vir_write_rX-return-false-on-.patch b/projects/RPi/devices/RPi5/patches/mesa/0036-broadcom-compiler-make-vir_write_rX-return-false-on-.patch
new file mode 100644
index 0000000000..6c80d4a9ab
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0036-broadcom-compiler-make-vir_write_rX-return-false-on-.patch
@@ -0,0 +1,37 @@
+From c9f6faa3ddc91024b3d9dc67ce2221187daac128 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 29 Sep 2021 11:54:18 +0200
+Subject: [PATCH 036/142] broadcom/compiler: make vir_write_rX return false on
+ platforms without accums
+
+---
+ src/broadcom/compiler/vir.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
+index 007cb0a941b..d75cd777b6d 100644
+--- a/src/broadcom/compiler/vir.c
++++ b/src/broadcom/compiler/vir.c
+@@ -158,6 +158,9 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
+ bool
+ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+ {
++        if (!devinfo->has_accumulators)
++                return false;
++
+         for (int i = 0; i < vir_get_nsrc(inst); i++) {
+                 switch (inst->src[i].file) {
+                 case QFILE_VPM:
+@@ -180,6 +183,9 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+ bool
+ vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
+ {
++        if (!devinfo->has_accumulators)
++                return false;
++
+         switch (inst->dst.file) {
+         case QFILE_MAGIC:
+                 switch (inst->dst.index) {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0037-broadcom-compiler-rename-vir_writes_rX-to-vir_writes.patch b/projects/RPi/devices/RPi5/patches/mesa/0037-broadcom-compiler-rename-vir_writes_rX-to-vir_writes.patch
new file mode 100644
index 0000000000..1dea74a300
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0037-broadcom-compiler-rename-vir_writes_rX-to-vir_writes.patch
@@ -0,0 +1,77 @@
+From 3d16229743e26b58735ed049ee982073f6034342 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 29 Sep 2021 12:03:50 +0200
+Subject: [PATCH 037/142] broadcom/compiler: rename vir_writes_rX to
+ vir_writes_rX_implicitly
+
+Since that represents more accurately what they check.
+---
+ src/broadcom/compiler/v3d_compiler.h          | 4 ++--
+ src/broadcom/compiler/vir.c                   | 6 ++++--
+ src/broadcom/compiler/vir_register_allocate.c | 4 ++--
+ 3 files changed, 8 insertions(+), 6 deletions(-)
+
+diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
+index eb4e692464b..7e8f3bfc1a7 100644
+--- a/src/broadcom/compiler/v3d_compiler.h
++++ b/src/broadcom/compiler/v3d_compiler.h
+@@ -1149,8 +1149,8 @@ bool vir_is_raw_mov(struct qinst *inst);
+ bool vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst);
+ bool vir_is_add(struct qinst *inst);
+ bool vir_is_mul(struct qinst *inst);
+-bool vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst);
+-bool vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst);
++bool vir_writes_r3_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
++bool vir_writes_r4_implicitly(const struct v3d_device_info *devinfo, struct qinst *inst);
+ struct qreg vir_follow_movs(struct v3d_compile *c, struct qreg reg);
+ uint8_t vir_channels_written(struct qinst *inst);
+ struct qreg ntq_get_src(struct v3d_compile *c, nir_src src, int i);
+diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
+index d75cd777b6d..aea113f050e 100644
+--- a/src/broadcom/compiler/vir.c
++++ b/src/broadcom/compiler/vir.c
+@@ -156,7 +156,8 @@ vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
+ }
+ 
+ bool
+-vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
++vir_writes_r3_implicitly(const struct v3d_device_info *devinfo,
++                         struct qinst *inst)
+ {
+         if (!devinfo->has_accumulators)
+                 return false;
+@@ -181,7 +182,8 @@ vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
+ }
+ 
+ bool
+-vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
++vir_writes_r4_implicitly(const struct v3d_device_info *devinfo,
++                         struct qinst *inst)
+ {
+         if (!devinfo->has_accumulators)
+                 return false;
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 7b3f6c41934..f2df35cd458 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -988,7 +988,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+          * result to a temp), nothing else can be stored in r3/r4 across
+          * it.
+          */
+-        if (vir_writes_r3(c->devinfo, inst)) {
++        if (vir_writes_r3_implicitly(c->devinfo, inst)) {
+                 for (int i = 0; i < c->num_temps; i++) {
+                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+                                 ra_add_node_interference(c->g,
+@@ -998,7 +998,7 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                 }
+         }
+ 
+-        if (vir_writes_r4(c->devinfo, inst)) {
++        if (vir_writes_r4_implicitly(c->devinfo, inst)) {
+                 for (int i = 0; i < c->num_temps; i++) {
+                         if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
+                                 ra_add_node_interference(c->g,
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0038-broadcom-compiler-only-handle-accumulator-classes-if.patch b/projects/RPi/devices/RPi5/patches/mesa/0038-broadcom-compiler-only-handle-accumulator-classes-if.patch
new file mode 100644
index 0000000000..b39e7bda94
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0038-broadcom-compiler-only-handle-accumulator-classes-if.patch
@@ -0,0 +1,170 @@
+From 83fae160491737e8568b8fb5eaa5be4d2c8bf3c8 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 29 Sep 2021 12:10:31 +0200
+Subject: [PATCH 038/142] broadcom/compiler: only handle accumulator classes if
+ present
+
+---
+ src/broadcom/compiler/vir_register_allocate.c | 77 ++++++++++++-------
+ 1 file changed, 49 insertions(+), 28 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index f2df35cd458..e78ccb7c6aa 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -53,6 +53,17 @@ get_class_bit_any(const struct v3d_device_info *devinfo)
+         else
+                 return CLASS_BITS_PHYS;
+ }
++
++static uint8_t
++filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
++{
++   if (!devinfo->has_accumulators) {
++      assert(class_bits & CLASS_BITS_PHYS);
++      class_bits = CLASS_BITS_PHYS;
++   }
++   return class_bits;
++}
++
+ static inline uint32_t
+ temp_to_node(struct v3d_compile *c, uint32_t temp)
+ {
+@@ -413,8 +424,10 @@ v3d_setup_spill_base(struct v3d_compile *c)
+                  */
+                 if (c->spilling) {
+                         int temp_class = CLASS_BITS_PHYS;
+-                        if (i != c->spill_base.index)
++                        if (c->devinfo->has_accumulators &&
++                            i != c->spill_base.index) {
+                                 temp_class |= CLASS_BITS_ACC;
++                        }
+                         add_node(c, i, temp_class);
+                 }
+         }
+@@ -473,14 +486,16 @@ v3d_emit_spill_tmua(struct v3d_compile *c,
+          * temp will be used immediately so just like the uniform above we
+          * can allow accumulators.
+          */
++        int temp_class =
++                filter_class_bits(c->devinfo, CLASS_BITS_PHYS | CLASS_BITS_ACC);
+         if (!fill_dst) {
+                 struct qreg dst = vir_TMUWT(c);
+                 assert(dst.file == QFILE_TEMP);
+-                add_node(c, dst.index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
++                add_node(c, dst.index, temp_class);
+         } else {
+                 *fill_dst = vir_LDTMU(c);
+                 assert(fill_dst->file == QFILE_TEMP);
+-                add_node(c, fill_dst->index, CLASS_BITS_PHYS | CLASS_BITS_ACC);
++                add_node(c, fill_dst->index, temp_class);
+         }
+ 
+         /* Temps across the thread switch we injected can't be assigned to
+@@ -662,8 +677,10 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+                                          * instruction immediately after so we
+                                          * can use ACC.
+                                          */
+-                                        add_node(c, temp.index, CLASS_BITS_PHYS |
+-                                                                CLASS_BITS_ACC);
++                                        int temp_class =
++                                                filter_class_bits(c->devinfo, CLASS_BITS_PHYS |
++                                                                              CLASS_BITS_ACC);
++                                        add_node(c, temp.index, temp_class);
+                                 } else {
+                                         /* If we have a postponed spill, we
+                                          * don't need a fill as the temp would
+@@ -941,6 +958,7 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
+                 compiler->reg_class_phys[threads] =
+                         ra_alloc_contig_reg_class(compiler->regs, 1);
+ 
++                /* Init physical regs */
+                 for (int i = phys_index;
+                      i < phys_index + (PHYS_COUNT >> threads); i++) {
+                         if (compiler->devinfo->has_accumulators)
+@@ -949,16 +967,15 @@ vir_init_reg_sets(struct v3d_compiler *compiler)
+                         ra_class_add_reg(compiler->reg_class_any[threads], i);
+                 }
+ 
++                /* Init accumulator regs */
+                 if (compiler->devinfo->has_accumulators) {
+                         for (int i = ACC_INDEX + 0; i < ACC_INDEX + ACC_COUNT - 1; i++) {
+                                 ra_class_add_reg(compiler->reg_class_phys_or_acc[threads], i);
+                                 ra_class_add_reg(compiler->reg_class_any[threads], i);
+                         }
+-                }
+-                /* r5 can only store a single 32-bit value, so not much can
+-                 * use it.
+-                 */
+-                if (compiler->devinfo->has_accumulators) {
++                        /* r5 can only store a single 32-bit value, so not much can
++                         * use it.
++                         */
+                         ra_class_add_reg(compiler->reg_class_r5[threads],
+                                          ACC_INDEX + 5);
+                         ra_class_add_reg(compiler->reg_class_any[threads],
+@@ -1081,21 +1098,23 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                  * because ldunif has usually a shorter lifespan, allowing for
+                  * more accumulator reuse and QPU merges.
+                  */
+-                if (!inst->qpu.sig.ldunif) {
+-                        uint8_t class_bits =
+-                                get_temp_class_bits(c, inst->dst.index) &
+-                                ~CLASS_BITS_R5;
+-                        set_temp_class_bits(c, inst->dst.index,
+-                                            class_bits);
+-
+-                } else {
+-                        /* Until V3D 4.x, we could only load a uniform
+-                         * to r5, so we'll need to spill if uniform
+-                         * loads interfere with each other.
+-                         */
+-                        if (c->devinfo->ver < 40) {
++                if (c->devinfo->has_accumulators) {
++                        if (!inst->qpu.sig.ldunif) {
++                                uint8_t class_bits =
++                                        get_temp_class_bits(c, inst->dst.index) &
++                                        ~CLASS_BITS_R5;
+                                 set_temp_class_bits(c, inst->dst.index,
+-                                                    CLASS_BITS_R5);
++                                                    class_bits);
++
++                        } else {
++                                /* Until V3D 4.x, we could only load a uniform
++                                 * to r5, so we'll need to spill if uniform
++                                 * loads interfere with each other.
++                                 */
++                                if (c->devinfo->ver < 40) {
++                                        set_temp_class_bits(c, inst->dst.index,
++                                                            CLASS_BITS_R5);
++                                }
+                         }
+                 }
+         }
+@@ -1152,8 +1171,10 @@ v3d_register_allocate(struct v3d_compile *c)
+                         c->thread_index--;
+         }
+ 
+-        c->g = ra_alloc_interference_graph(c->compiler->regs,
+-                                           c->num_temps + ARRAY_SIZE(acc_nodes));
++        unsigned num_ra_nodes = c->num_temps;
++        if (c->devinfo->has_accumulators)
++                num_ra_nodes += ARRAY_SIZE(acc_nodes);
++        c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
+         ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
+ 
+         /* Make some fixed nodes for the accumulators, which we will need to
+@@ -1162,8 +1183,8 @@ v3d_register_allocate(struct v3d_compile *c)
+          * live in, but the classes take up a lot of memory to set up, so we
+          * don't want to make too many.
+          */
+-        for (uint32_t i = 0; i < ACC_COUNT + c->num_temps; i++) {
+-                if (i < ACC_COUNT) {
++        for (uint32_t i = 0; i < num_ra_nodes; i++) {
++                if (c->devinfo->has_accumulators && i < ACC_COUNT) {
+                         acc_nodes[i] = i;
+                         ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
+                         c->nodes.info[i].priority = 0;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0039-broadcom-compiler-don-t-assign-rf0-to-temps-across-i.patch b/projects/RPi/devices/RPi5/patches/mesa/0039-broadcom-compiler-don-t-assign-rf0-to-temps-across-i.patch
new file mode 100644
index 0000000000..e7553a8295
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0039-broadcom-compiler-don-t-assign-rf0-to-temps-across-i.patch
@@ -0,0 +1,187 @@
+From fd77cc3204e7c69927f97ce2a1d55d2a47d77a27 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 29 Sep 2021 12:14:04 +0200
+Subject: [PATCH 039/142] broadcom/compiler: don't assign rf0 to temps across
+ implicit rf0 writes
+
+On platforms that don't have accumulators and have implicit writes to
+the register file, we need to be careful and avoid assigning a physical
+register to a temp that lives across an implicit write to that same
+physical register.
+
+For now, we have the case of implicit writes to rf0 from various
+signals, but it should be easy to extend this to include additional
+registers if needed.
+---
+ src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++----
+ 1 file changed, 57 insertions(+), 12 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index e78ccb7c6aa..e0adc1de7a4 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -29,6 +29,9 @@
+ #define ACC_INDEX     0
+ #define ACC_COUNT     6
+ 
++/* RA nodes used to track RF registers with implicit writes */
++#define IMPLICIT_RF_COUNT 1
++
+ #define PHYS_COUNT 64
+ 
+ static uint8_t
+@@ -67,15 +70,17 @@ filter_class_bits(const struct v3d_device_info *devinfo, uint8_t class_bits)
+ static inline uint32_t
+ temp_to_node(struct v3d_compile *c, uint32_t temp)
+ {
+-        return temp + (c->devinfo->has_accumulators ? ACC_COUNT : 0);
++        return temp + (c->devinfo->has_accumulators ? ACC_COUNT :
++                                                      IMPLICIT_RF_COUNT);
+ }
+ 
+ static inline uint32_t
+ node_to_temp(struct v3d_compile *c, uint32_t node)
+ {
+         assert((c->devinfo->has_accumulators && node >= ACC_COUNT) ||
+-               (!c->devinfo->has_accumulators && node >= 0));
+-        return node - (c->devinfo->has_accumulators ? ACC_COUNT : 0);
++               (!c->devinfo->has_accumulators && node >= IMPLICIT_RF_COUNT));
++        return node - (c->devinfo->has_accumulators ? ACC_COUNT :
++                                                      IMPLICIT_RF_COUNT);
+ }
+ 
+ static inline uint8_t
+@@ -360,7 +365,8 @@ ensure_nodes(struct v3d_compile *c)
+         c->nodes.info = reralloc_array_size(c,
+                                             c->nodes.info,
+                                             sizeof(c->nodes.info[0]),
+-                                            c->nodes.alloc_count + ACC_COUNT);
++                                            c->nodes.alloc_count +
++                                            MAX2(ACC_COUNT, IMPLICIT_RF_COUNT));
+ }
+ 
+ /* Creates the interference node for a new temp. We use this to keep the node
+@@ -372,7 +378,8 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+         ensure_nodes(c);
+ 
+         int node = ra_add_node(c->g, choose_reg_class(c, class_bits));
+-        assert(node == temp + ACC_COUNT);
++        assert(c->devinfo->has_accumulators ? node == temp + ACC_COUNT :
++                                              node == temp + IMPLICIT_RF_COUNT);
+ 
+         /* We fill the node priority after we are done inserting spills */
+         c->nodes.info[node].class_bits = class_bits;
+@@ -995,7 +1002,9 @@ tmu_spilling_allowed(struct v3d_compile *c)
+ }
+ 
+ static void
+-update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
++update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
++                                      int *acc_nodes,
++                                      int *implicit_rf_nodes,
+                                       struct qinst *inst)
+ {
+         int32_t ip = inst->ip;
+@@ -1025,6 +1034,19 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                 }
+         }
+ 
++        /* If any instruction writes to a physical register implicitly
++         * nothing else can write the same register across it.
++         */
++        if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
++                for (int i = 0; i < c->num_temps; i++) {
++                        if (c->temp_start[i] < ip && c->temp_end[i] > ip) {
++                                ra_add_node_interference(c->g,
++                                                         temp_to_node(c, i),
++                                                         implicit_rf_nodes[0]);
++                        }
++                }
++        }
++
+         if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
+                 switch (inst->qpu.alu.add.op) {
+                 case V3D_QPU_A_LDVPMV_IN:
+@@ -1116,6 +1138,16 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c, int *acc_nodes,
+                                                             CLASS_BITS_R5);
+                                 }
+                         }
++                } else {
++                        /* If the instruction has an implicit write
++                         * we can't allocate its dest to the same
++                         * register.
++                         */
++                        if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
++                                ra_add_node_interference(c->g,
++                                                         temp_to_node(c, inst->dst.index),
++                                                         implicit_rf_nodes[0]);
++                        }
+                 }
+         }
+ 
+@@ -1139,10 +1171,18 @@ struct qpu_reg *
+ v3d_register_allocate(struct v3d_compile *c)
+ {
+         int acc_nodes[ACC_COUNT];
++        int implicit_rf_nodes[IMPLICIT_RF_COUNT];
++
++        unsigned num_ra_nodes = c->num_temps;
++        if (c->devinfo->has_accumulators)
++                num_ra_nodes += ARRAY_SIZE(acc_nodes);
++        else
++                num_ra_nodes += ARRAY_SIZE(implicit_rf_nodes);
++
+         c->nodes = (struct v3d_ra_node_info) {
+                 .alloc_count = c->num_temps,
+                 .info = ralloc_array_size(c, sizeof(c->nodes.info[0]),
+-                                          c->num_temps + ACC_COUNT),
++                                          num_ra_nodes),
+         };
+ 
+         uint32_t phys_index = get_phys_index(c->devinfo);
+@@ -1171,9 +1211,6 @@ v3d_register_allocate(struct v3d_compile *c)
+                         c->thread_index--;
+         }
+ 
+-        unsigned num_ra_nodes = c->num_temps;
+-        if (c->devinfo->has_accumulators)
+-                num_ra_nodes += ARRAY_SIZE(acc_nodes);
+         c->g = ra_alloc_interference_graph(c->compiler->regs, num_ra_nodes);
+         ra_set_select_reg_callback(c->g, v3d_ra_select_callback, &callback_data);
+ 
+@@ -1181,7 +1218,8 @@ v3d_register_allocate(struct v3d_compile *c)
+          * interfere with when ops have implied r3/r4 writes or for the thread
+          * switches.  We could represent these as classes for the nodes to
+          * live in, but the classes take up a lot of memory to set up, so we
+-         * don't want to make too many.
++         * don't want to make too many. We use the same mechanism on platforms
++         * without accumulators that can have implicit writes to phys regs.
+          */
+         for (uint32_t i = 0; i < num_ra_nodes; i++) {
+                 if (c->devinfo->has_accumulators && i < ACC_COUNT) {
+@@ -1189,6 +1227,12 @@ v3d_register_allocate(struct v3d_compile *c)
+                         ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
+                         c->nodes.info[i].priority = 0;
+                         c->nodes.info[i].class_bits = 0;
++                } else if (!c->devinfo->has_accumulators &&
++                           i < ARRAY_SIZE(implicit_rf_nodes)) {
++                        implicit_rf_nodes[i] = i;
++                        ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
++                        c->nodes.info[i].priority = 0;
++                        c->nodes.info[i].class_bits = 0;
+                 } else {
+                         uint32_t t = node_to_temp(c, i);
+                         c->nodes.info[i].priority =
+@@ -1204,7 +1248,8 @@ v3d_register_allocate(struct v3d_compile *c)
+         int ip = 0;
+         vir_for_each_inst_inorder(inst, c) {
+                 inst->ip = ip++;
+-                update_graph_and_reg_classes_for_inst(c, acc_nodes, inst);
++                update_graph_and_reg_classes_for_inst(c, acc_nodes,
++                                                      implicit_rf_nodes, inst);
+         }
+ 
+         /* Set the register classes for all our temporaries in the graph */
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0040-broadcom-compiler-CS-payload-registers-have-changed-.patch b/projects/RPi/devices/RPi5/patches/mesa/0040-broadcom-compiler-CS-payload-registers-have-changed-.patch
new file mode 100644
index 0000000000..8eee3ac26c
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0040-broadcom-compiler-CS-payload-registers-have-changed-.patch
@@ -0,0 +1,33 @@
+From 9a08ae9f354a6da6d9d71b87800aca8b3df49e29 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 28 Sep 2021 13:37:28 +0200
+Subject: [PATCH 040/142] broadcom/compiler: CS payload registers have changed
+ in v71
+
+---
+ src/broadcom/compiler/nir_to_vir.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
+index 1a05b279a2d..220ff6bcd49 100644
+--- a/src/broadcom/compiler/nir_to_vir.c
++++ b/src/broadcom/compiler/nir_to_vir.c
+@@ -4362,8 +4362,13 @@ nir_to_vir(struct v3d_compile *c)
+                                                       V3D_QPU_WADDR_SYNC));
+                 }
+ 
+-                c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
+-                c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
++                if (c->devinfo->ver <= 42) {
++                        c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
++                        c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
++                } else if (c->devinfo->ver >= 71) {
++                        c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 3));
++                        c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));
++                }
+ 
+                 /* Set up the division between gl_LocalInvocationIndex and
+                  * wg_in_mem in the payload reg.
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0041-broadcom-compiler-don-t-schedule-rf0-writes-right-af.patch b/projects/RPi/devices/RPi5/patches/mesa/0041-broadcom-compiler-don-t-schedule-rf0-writes-right-af.patch
new file mode 100644
index 0000000000..193468668e
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0041-broadcom-compiler-don-t-schedule-rf0-writes-right-af.patch
@@ -0,0 +1,46 @@
+From 5477884196cb54a71f54fa6cad42c6d3326bde88 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Fri, 22 Oct 2021 13:39:48 +0200
+Subject: [PATCH 041/142] broadcom/compiler: don't schedule rf0 writes right
+ after ldvary
+
+ldvary writes rf0 implicitly on the next cycle, so an rf0 write scheduled
+right after it would clash. Our normal dependency tracking misses this: it
+knows nothing about delayed writes from instructions and thinks the rf0
+write happens on the same cycle ldvary is emitted.
+
+Fixes (v71):
+dEQP-VK.glsl.conversions.matrix_to_matrix.mat2x3_to_mat4x2_fragment
+---
+ src/broadcom/compiler/qpu_schedule.c | 15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 839c0c62315..870823fd2b1 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -652,6 +652,21 @@ writes_too_soon_after_write(const struct v3d_device_info *devinfo,
+             v3d_qpu_writes_r4(devinfo, inst))
+                 return true;
+ 
++        if (devinfo->ver <= 42)
++           return false;
++
++        /* Don't schedule anything that writes rf0 right after ldvary, since
++         * that would clash with the ldvary's delayed rf0 write (the exception
++         * is another ldvary, since its implicit rf0 write would also have
++         * one cycle of delay and would not clash).
++         */
++        if (scoreboard->last_ldvary_tick + 1 == scoreboard->tick &&
++            (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
++             (v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
++              !inst->sig.ldvary))) {
++            return true;
++       }
++
+         return false;
+ }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0042-broadcom-compiler-allow-instruction-merges-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0042-broadcom-compiler-allow-instruction-merges-in-v71.patch
new file mode 100644
index 0000000000..dcb28320d5
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0042-broadcom-compiler-allow-instruction-merges-in-v71.patch
@@ -0,0 +1,60 @@
+From 31623712c2f741d393767641f32d56c35150eda5 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 30 Sep 2021 13:22:48 +0200
+Subject: [PATCH 042/142] broadcom/compiler: allow instruction merges in v71
+
+In v3d 4.x there were restrictions based on the number of raddrs used
+by the combined instructions, but we don't have these restrictions in
+v3d 7.x.
+
+It should be noted that while there are no restrictions on the number
+of raddrs addressed, a QPU instruction can only address a single small
+immediate, so we should be careful about that when we add support for
+small immediates.
+---
+ src/broadcom/compiler/qpu_schedule.c | 21 +++++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 870823fd2b1..ff544fb3c1c 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -906,8 +906,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
+ static bool
+ qpu_merge_raddrs(struct v3d_qpu_instr *result,
+                  const struct v3d_qpu_instr *add_instr,
+-                 const struct v3d_qpu_instr *mul_instr)
++                 const struct v3d_qpu_instr *mul_instr,
++                 const struct v3d_device_info *devinfo)
+ {
++        assert(devinfo->ver <= 42);
++
+         uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
+         int naddrs = util_bitcount64(raddrs_used);
+ 
+@@ -1111,9 +1114,19 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
+                 add_instr = a;
+         }
+ 
+-        if (add_instr && mul_instr &&
+-            !qpu_merge_raddrs(&merge, add_instr, mul_instr)) {
+-                        return false;
++        /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
++         * they have restrictions on the number of raddrs that can be adressed
++         * in a single instruction.
++         *
++         * FIXME: for V3D 7.x we can't merge instructions if they address more
++         * than one small immediate. For now, we don't support small immediates,
++         * so it is not a problem.
++         */
++        if (devinfo->ver <= 42) {
++                if (add_instr && mul_instr &&
++                    !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
++                                return false;
++                }
+         }
+ 
+         merge.sig.thrsw |= b->sig.thrsw;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0043-broadcom-qpu-add-MOV-integer-packing-unpacking-varia.patch b/projects/RPi/devices/RPi5/patches/mesa/0043-broadcom-qpu-add-MOV-integer-packing-unpacking-varia.patch
new file mode 100644
index 0000000000..1df473d3de
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0043-broadcom-qpu-add-MOV-integer-packing-unpacking-varia.patch
@@ -0,0 +1,172 @@
+From 959a0128654c94d84fda53ffc108971d3b3a817a Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 6 Oct 2021 09:27:43 +0200
+Subject: [PATCH 043/142] broadcom/qpu: add MOV integer packing/unpacking
+ variants
+
+These are new in v71 and cover MOV on both the ADD and the MUL alus.
+---
+ src/broadcom/qpu/qpu_instr.h |  9 ++++
+ src/broadcom/qpu/qpu_pack.c  | 98 ++++++++++++++++++++++++++++++++++++
+ 2 files changed, 107 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index c86a4119c54..4b34d17bd4c 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -285,6 +285,15 @@ enum v3d_qpu_input_unpack {
+ 
+         /** Swap high and low 16 bits */
+         V3D_QPU_UNPACK_SWAP_16,
++
++        /** Convert low 16 bits from 16-bit integer to unsigned 32-bit int */
++        V3D_QPU_UNPACK_UL,
++        /** Convert high 16 bits from 16-bit integer to unsigned 32-bit int */
++        V3D_QPU_UNPACK_UH,
++        /** Convert low 16 bits from 16-bit integer to signed 32-bit int */
++        V3D_QPU_UNPACK_IL,
++        /** Convert high 16 bits from 16-bit integer to signed 32-bit int */
++        V3D_QPU_UNPACK_IH,
+ };
+ 
+ enum v3d_qpu_mux {
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 7a262f18ac3..4d677894755 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -922,6 +922,56 @@ v3d_qpu_float32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
+         }
+ }
+ 
++static bool
++v3d_qpu_int32_unpack_unpack(uint32_t packed,
++                            enum v3d_qpu_input_unpack *unpacked)
++{
++        switch (packed) {
++        case 0:
++                *unpacked = V3D_QPU_UNPACK_NONE;
++                return true;
++        case 1:
++                *unpacked = V3D_QPU_UNPACK_UL;
++                return true;
++        case 2:
++                *unpacked = V3D_QPU_UNPACK_UH;
++                return true;
++        case 3:
++                *unpacked = V3D_QPU_UNPACK_IL;
++                return true;
++        case 4:
++                *unpacked = V3D_QPU_UNPACK_IH;
++                return true;
++        default:
++                return false;
++        }
++}
++
++static bool
++v3d_qpu_int32_unpack_pack(enum v3d_qpu_input_unpack unpacked,
++                          uint32_t *packed)
++{
++        switch (unpacked) {
++        case V3D_QPU_UNPACK_NONE:
++                *packed = 0;
++                return true;
++        case V3D_QPU_UNPACK_UL:
++                *packed = 1;
++                return true;
++        case V3D_QPU_UNPACK_UH:
++                *packed = 2;
++                return true;
++        case V3D_QPU_UNPACK_IL:
++                *packed = 3;
++                return true;
++        case V3D_QPU_UNPACK_IH:
++                *packed = 4;
++                return true;
++        default:
++                return false;
++        }
++}
++
+ static bool
+ v3d_qpu_float16_unpack_unpack(uint32_t packed,
+                               enum v3d_qpu_input_unpack *unpacked)
+@@ -1273,6 +1323,15 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
+                 instr->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+                 break;
+ 
++        case V3D_QPU_A_MOV:
++                instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
++
++                if (!v3d_qpu_int32_unpack_unpack((raddr_b >> 2) & 0x7,
++                                                 &instr->alu.add.a.unpack)) {
++                        return false;
++                }
++                break;
++
+         default:
+                 instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+                 instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+@@ -1449,6 +1508,15 @@ v3d71_qpu_mul_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
+ 
+                 break;
+ 
++        case V3D_QPU_M_MOV:
++                instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
++
++                if (!v3d_qpu_int32_unpack_unpack((raddr_d >> 2) & 0x7,
++                                                 &instr->alu.mul.a.unpack)) {
++                        return false;
++                }
++                break;
++
+         default:
+                 instr->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+                 instr->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+@@ -1909,6 +1977,21 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 opcode |= packed;
+                 break;
+ 
++        case V3D_QPU_A_MOV: {
++                uint32_t packed;
++
++                if (instr->alu.add.output_pack != V3D_QPU_PACK_NONE)
++                        return false;
++
++                if (!v3d_qpu_int32_unpack_pack(instr->alu.add.a.unpack,
++                                               &packed)) {
++                        return false;
++                }
++
++                raddr_b |= packed << 2;
++                break;
++        }
++
+         default:
+                 if (instr->alu.add.op != V3D_QPU_A_NOP &&
+                     (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+@@ -2126,6 +2209,21 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
+                 break;
+         }
+ 
++        case V3D_QPU_M_MOV: {
++                uint32_t packed;
++
++                if (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE)
++                        return false;
++
++                if (!v3d_qpu_int32_unpack_pack(instr->alu.mul.a.unpack,
++                                               &packed)) {
++                        return false;
++                }
++
++                raddr_d |= packed << 2;
++                break;
++        }
++
+         default:
+                 break;
+         }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0044-broadcom-qpu-fail-packing-on-unhandled-mul-pack-unpa.patch b/projects/RPi/devices/RPi5/patches/mesa/0044-broadcom-qpu-fail-packing-on-unhandled-mul-pack-unpa.patch
new file mode 100644
index 0000000000..864966dbea
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0044-broadcom-qpu-fail-packing-on-unhandled-mul-pack-unpa.patch
@@ -0,0 +1,47 @@
+From 2e86dd0c357d7b432ce6794ae22fbfae89ad186b Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 6 Oct 2021 12:01:10 +0200
+Subject: [PATCH 044/142] broadcom/qpu: fail packing on unhandled mul
+ pack/unpack
+
+We are doing this for the ADD alu already and it may be helpful to
+identify cases where we have QPU code with pack/unpack modifiers on
+MUL opcodes that we then are not packing into the actual QPU
+instructions.
+---
+ src/broadcom/qpu/qpu_pack.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 4d677894755..180d7ab08a3 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -2106,6 +2106,12 @@ v3d33_qpu_mul_pack(const struct v3d_device_info *devinfo,
+         }
+ 
+         default:
++                if (instr->alu.mul.op != V3D_QPU_M_NOP &&
++                    (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
++                     instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
++                     instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
++                        return false;
++                }
+                 break;
+         }
+ 
+@@ -2225,6 +2231,12 @@ v3d71_qpu_mul_pack(const struct v3d_device_info *devinfo,
+         }
+ 
+         default:
++                if (instr->alu.mul.op != V3D_QPU_M_NOP &&
++                    (instr->alu.mul.output_pack != V3D_QPU_PACK_NONE ||
++                     instr->alu.mul.a.unpack != V3D_QPU_UNPACK_NONE ||
++                     instr->alu.mul.b.unpack != V3D_QPU_UNPACK_NONE)) {
++                        return false;
++                }
+                 break;
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0045-broadcom-compiler-generalize-check-for-shaders-using.patch b/projects/RPi/devices/RPi5/patches/mesa/0045-broadcom-compiler-generalize-check-for-shaders-using.patch
new file mode 100644
index 0000000000..cc4befe719
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0045-broadcom-compiler-generalize-check-for-shaders-using.patch
@@ -0,0 +1,30 @@
+From ed6bfa29d43b5a89ff070961454f1e82e23b4f45 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Fri, 8 Oct 2021 15:10:24 +0200
+Subject: [PATCH 045/142] broadcom/compiler: generalize check for shaders using
+ pixel center W
+
+V3D 4.x has pixel center W in rf0 and V3D 7.x has it in rf3. We already
+account for this when we setup the c->payload_w, so use that.
+---
+ src/broadcom/compiler/nir_to_vir.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
+index 220ff6bcd49..90fe1d1e7f0 100644
+--- a/src/broadcom/compiler/nir_to_vir.c
++++ b/src/broadcom/compiler/nir_to_vir.c
+@@ -4547,8 +4547,8 @@ vir_check_payload_w(struct v3d_compile *c)
+ 
+         vir_for_each_inst_inorder(inst, c) {
+                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
+-                        if (inst->src[i].file == QFILE_REG &&
+-                            inst->src[i].index == 0) {
++                        if (inst->src[i].file == c->payload_w.file &&
++                            inst->src[i].index == c->payload_w.index) {
+                                 c->uses_center_w = true;
+                                 return;
+                         }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0046-broadcom-compiler-v71-isn-t-affected-by-double-round.patch b/projects/RPi/devices/RPi5/patches/mesa/0046-broadcom-compiler-v71-isn-t-affected-by-double-round.patch
new file mode 100644
index 0000000000..23f70c60d3
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0046-broadcom-compiler-v71-isn-t-affected-by-double-round.patch
@@ -0,0 +1,34 @@
+From e1a0fa2c2010ef29b8cec798cd0fc99cf44f3a2d Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 14 Oct 2021 14:16:40 +0200
+Subject: [PATCH 046/142] broadcom/compiler: v71 isn't affected by
+ double-rounding of viewport X,Y coords
+
+---
+ src/broadcom/compiler/v3d_nir_lower_io.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/compiler/v3d_nir_lower_io.c b/src/broadcom/compiler/v3d_nir_lower_io.c
+index 3ef0e398228..4cdba3748a1 100644
+--- a/src/broadcom/compiler/v3d_nir_lower_io.c
++++ b/src/broadcom/compiler/v3d_nir_lower_io.c
+@@ -600,9 +600,13 @@ v3d_nir_emit_ff_vpm_outputs(struct v3d_compile *c, nir_builder *b,
+                          * The correct fix for this as recommended by Broadcom
+                          * is to convert to .8 fixed-point with ffloor().
+                          */
+-                        pos = nir_f2i32(b, nir_ffloor(b, pos));
+-                        v3d_nir_store_output(b, state->vp_vpm_offset + i,
+-                                             offset_reg, pos);
++                        if (c->devinfo->ver <= 42)
++                                 pos = nir_f2i32(b, nir_ffloor(b, pos));
++                        else
++                                 pos = nir_f2i32(b, nir_fround_even(b, pos));
++
++                       v3d_nir_store_output(b, state->vp_vpm_offset + i,
++                                            offset_reg, pos);
+                 }
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0047-broadcom-compiler-update-one-TMUWT-restriction-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0047-broadcom-compiler-update-one-TMUWT-restriction-for-v.patch
new file mode 100644
index 0000000000..45dd5fba46
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0047-broadcom-compiler-update-one-TMUWT-restriction-for-v.patch
@@ -0,0 +1,31 @@
+From 697e6cf01b781b244404872f331a778b6d4e67da Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 19 Oct 2021 11:16:43 +0200
+Subject: [PATCH 047/142] broadcom/compiler: update one TMUWT restriction for
+ v71
+
+The "TMUWT not allowed in the final instruction" restriction doesn't
+apply for v71.
+---
+ src/broadcom/compiler/qpu_schedule.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index ff544fb3c1c..25f79aa6f46 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -1700,8 +1700,10 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+ 
+         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                 /* GFXH-1625: TMUWT not allowed in the final instruction. */
+-                if (slot == 2 && inst->alu.add.op == V3D_QPU_A_TMUWT)
++                if (c->devinfo->ver <= 42 && slot == 2 &&
++                    inst->alu.add.op == V3D_QPU_A_TMUWT) {
+                         return false;
++                }
+ 
+                 /* No writing physical registers at the end. */
+                 bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0048-broadcom-compiler-update-ldunif-ldvary-comment-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0048-broadcom-compiler-update-ldunif-ldvary-comment-for-v.patch
new file mode 100644
index 0000000000..75d16def81
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0048-broadcom-compiler-update-ldunif-ldvary-comment-for-v.patch
@@ -0,0 +1,37 @@
+From 26fea727a9f34b75a3fe3f6a806accaddcc317f6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 19 Oct 2021 11:51:32 +0200
+Subject: [PATCH 048/142] broadcom/compiler: update ldunif/ldvary comment for
+ v71
+
+For v42 and below, ldunif and ldvary both write to r5, but with a
+different delay, so we need to take that into account when scheduling both.
+
+For v71 the register used is rf0, but the behaviour is the same, so
+the scheduling code can stay the same; only the comment needs updating.
+---
+ src/broadcom/compiler/qpu_schedule.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 25f79aa6f46..e8197661f89 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -1234,10 +1234,11 @@ retry:
+                 if (pixel_scoreboard_too_soon(c, scoreboard, inst))
+                         continue;
+ 
+-                /* ldunif and ldvary both write r5, but ldunif does so a tick
+-                 * sooner.  If the ldvary's r5 wasn't used, then ldunif might
++                /* ldunif and ldvary both write the same register (r5 for v42
++                 * and below, rf0 for v71), but ldunif does so a tick sooner.
++                 * If the ldvary's register wasn't used, then ldunif might
+                  * otherwise get scheduled so ldunif and ldvary try to update
+-                 * r5 in the same tick.
++                 * the register in the same tick.
+                  */
+                 if ((inst->sig.ldunif || inst->sig.ldunifa) &&
+                     scoreboard->tick == scoreboard->last_ldvary_tick + 1) {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0049-broadcom-compiler-update-payload-registers-handling-.patch b/projects/RPi/devices/RPi5/patches/mesa/0049-broadcom-compiler-update-payload-registers-handling-.patch
new file mode 100644
index 0000000000..b66dc181f4
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0049-broadcom-compiler-update-payload-registers-handling-.patch
@@ -0,0 +1,52 @@
+From 70456e27b039174f767010f96d9b649e5e42d84f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 19 Oct 2021 23:52:30 +0200
+Subject: [PATCH 049/142] broadcom/compiler: update payload registers handling
+ when computing live intervals
+
+For v71 the payload registers are not the same. Specifically, rf3 is
+now used as a payload register, so this is needed to prevent the
+register allocator from selecting rf3 as an instruction dst and
+overwriting a payload value that could still be in use.
+---
+ src/broadcom/compiler/vir_live_variables.c | 21 +++++++++++++--------
+ 1 file changed, 13 insertions(+), 8 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_live_variables.c b/src/broadcom/compiler/vir_live_variables.c
+index 575b0481dc8..87a7e2b5b81 100644
+--- a/src/broadcom/compiler/vir_live_variables.c
++++ b/src/broadcom/compiler/vir_live_variables.c
+@@ -179,17 +179,22 @@ vir_setup_def_use(struct v3d_compile *c)
+                                 flags_inst = NULL;
+                         }
+ 
+-                        /* Payload registers: r0/1/2 contain W, centroid W,
+-                         * and Z at program start.  Register allocation will
+-                         * force their nodes to R0/1/2.
++                        /* Payload registers: for fragment shaders, W,
++                         * centroid W, and Z will be initialized at r0/1/2
++                         * until v42, or r1/r2/r3 from v71.
++                         *
++                         * For compute shaders, payload would be r0/r2 until
++                         * v42, r3/r2 from v71
++                         *
++                         * Register allocation will force their nodes to those
++                         * registers.
+                          */
+                         if (inst->src[0].file == QFILE_REG) {
+-                                switch (inst->src[0].index) {
+-                                case 0:
+-                                case 1:
+-                                case 2:
++                                uint32_t min_payload_r = c->devinfo->ver >= 71 ? 1 : 0;
++                                uint32_t max_payload_r = c->devinfo->ver >= 71 ? 3 : 2;
++                                if (inst->src[0].index >= min_payload_r &&
++                                    inst->src[0].index <= max_payload_r) {
+                                         c->temp_start[inst->dst.index] = 0;
+-                                        break;
+                                 }
+                         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0050-broadcom-compiler-update-peripheral-access-restricti.patch b/projects/RPi/devices/RPi5/patches/mesa/0050-broadcom-compiler-update-peripheral-access-restricti.patch
new file mode 100644
index 0000000000..28e2ba2dec
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0050-broadcom-compiler-update-peripheral-access-restricti.patch
@@ -0,0 +1,235 @@
+From f9a76b3a1e316e5ed6387819b87eaaf60f989a2b Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 26 Oct 2021 11:43:02 +0200
+Subject: [PATCH 050/142] broadcom/compiler: update peripheral access
+ restrictions for v71
+
+In V3D 4.x only a couple of simultaneous accesses were allowed, but
+V3D 7.x is a bit more flexible, so rather than trying to check for all
+the allowed combinations it is easier to check whether we hit one of
+the disallowed ones.
+
+Shader-db (pi5):
+
+total instructions in shared programs: 11338883 -> 11307386 (-0.28%)
+instructions in affected programs: 2727201 -> 2695704 (-1.15%)
+helped: 12555
+HURT: 289
+Instructions are helped.
+
+total max-temps in shared programs: 2230199 -> 2229260 (-0.04%)
+max-temps in affected programs: 20508 -> 19569 (-4.58%)
+helped: 608
+HURT: 4
+Max-temps are helped.
+
+total sfu-stalls in shared programs: 15236 -> 15293 (0.37%)
+sfu-stalls in affected programs: 148 -> 205 (38.51%)
+helped: 38
+HURT: 64
+Inconclusive result (%-change mean confidence interval includes 0).
+
+total inst-and-stalls in shared programs: 11354119 -> 11322679 (-0.28%)
+inst-and-stalls in affected programs: 2732262 -> 2700822 (-1.15%)
+helped: 12550
+HURT: 304
+Inst-and-stalls are helped.
+
+total nops in shared programs: 273711 -> 274095 (0.14%)
+nops in affected programs: 9626 -> 10010 (3.99%)
+helped: 186
+HURT: 397
+Nops are HURT.
+---
+ src/broadcom/compiler/qpu_schedule.c | 88 +++++++++++++++++++++-------
+ src/broadcom/compiler/qpu_validate.c |  2 +-
+ src/broadcom/qpu/qpu_instr.c         | 16 +++--
+ src/broadcom/qpu/qpu_instr.h         |  2 +
+ 4 files changed, 82 insertions(+), 26 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index e8197661f89..adb501e85ce 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -790,7 +790,8 @@ enum {
+         V3D_PERIPHERAL_TMU_WAIT           = (1 << 6),
+         V3D_PERIPHERAL_TMU_WRTMUC_SIG     = (1 << 7),
+         V3D_PERIPHERAL_TSY                = (1 << 8),
+-        V3D_PERIPHERAL_TLB                = (1 << 9),
++        V3D_PERIPHERAL_TLB_READ           = (1 << 9),
++        V3D_PERIPHERAL_TLB_WRITE          = (1 << 10),
+ };
+ 
+ static uint32_t
+@@ -815,8 +816,10 @@ qpu_peripherals(const struct v3d_device_info *devinfo,
+         if (v3d_qpu_uses_sfu(inst))
+                 result |= V3D_PERIPHERAL_SFU;
+ 
+-        if (v3d_qpu_uses_tlb(inst))
+-                result |= V3D_PERIPHERAL_TLB;
++        if (v3d_qpu_reads_tlb(inst))
++                result |= V3D_PERIPHERAL_TLB_READ;
++        if (v3d_qpu_writes_tlb(inst))
++                result |= V3D_PERIPHERAL_TLB_WRITE;
+ 
+         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
+@@ -847,32 +850,75 @@ qpu_compatible_peripheral_access(const struct v3d_device_info *devinfo,
+         if (devinfo->ver < 41)
+                 return false;
+ 
+-        /* V3D 4.1+ allow WRTMUC signal with TMU register write (other than
+-         * tmuc).
++        /* V3D 4.x can't do more than one peripheral access except in a
++         * few cases:
+          */
+-        if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
+-            b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
+-                return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
++        if (devinfo->ver <= 42) {
++                /* WRTMUC signal with TMU register write (other than tmuc). */
++                if (a_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
++                    b_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
++                        return v3d_qpu_writes_tmu_not_tmuc(devinfo, b);
++                }
++                if (b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
++                    a_peripherals == V3D_PERIPHERAL_TMU_WRITE) {
++                        return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
++                }
++
++                /* TMU read with VPM read/write. */
++                if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
++                    (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
++                     b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
++                        return true;
++                }
++                if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
++                    (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
++                     a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
++                        return true;
++                }
++
++                return false;
+         }
+ 
+-        if (a_peripherals == V3D_PERIPHERAL_TMU_WRITE &&
+-            b_peripherals == V3D_PERIPHERAL_TMU_WRTMUC_SIG) {
+-                return v3d_qpu_writes_tmu_not_tmuc(devinfo, a);
++        /* V3D 7.x can't have more than one of these restricted peripherals */
++        const uint32_t restricted = V3D_PERIPHERAL_TMU_WRITE |
++                                    V3D_PERIPHERAL_TMU_WRTMUC_SIG |
++                                    V3D_PERIPHERAL_TSY |
++                                    V3D_PERIPHERAL_TLB_READ |
++                                    V3D_PERIPHERAL_SFU |
++                                    V3D_PERIPHERAL_VPM_READ |
++                                    V3D_PERIPHERAL_VPM_WRITE;
++
++        const uint32_t a_restricted = a_peripherals & restricted;
++        const uint32_t b_restricted = b_peripherals & restricted;
++        if (a_restricted && b_restricted) {
++                /* WRTMUC signal with TMU register write (other than tmuc) is
++                 * allowed though.
++                 */
++                if (!((a_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
++                       b_restricted == V3D_PERIPHERAL_TMU_WRITE &&
++                       v3d_qpu_writes_tmu_not_tmuc(devinfo, b)) ||
++                      (b_restricted == V3D_PERIPHERAL_TMU_WRTMUC_SIG &&
++                       a_restricted == V3D_PERIPHERAL_TMU_WRITE &&
++                       v3d_qpu_writes_tmu_not_tmuc(devinfo, a)))) {
++                        return false;
++                }
+         }
+ 
+-        /* V3D 4.1+ allows TMU read with VPM read/write. */
+-        if (a_peripherals == V3D_PERIPHERAL_TMU_READ &&
+-            (b_peripherals == V3D_PERIPHERAL_VPM_READ ||
+-             b_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+-                return true;
++        /* Only one TMU read per instruction */
++        if ((a_peripherals & V3D_PERIPHERAL_TMU_READ) &&
++            (b_peripherals & V3D_PERIPHERAL_TMU_READ)) {
++                return false;
+         }
+-        if (b_peripherals == V3D_PERIPHERAL_TMU_READ &&
+-            (a_peripherals == V3D_PERIPHERAL_VPM_READ ||
+-             a_peripherals == V3D_PERIPHERAL_VPM_WRITE)) {
+-                return true;
++
++        /* Only one TLB access per instruction */
++        if ((a_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
++                              V3D_PERIPHERAL_TLB_READ)) &&
++            (b_peripherals & (V3D_PERIPHERAL_TLB_WRITE |
++                              V3D_PERIPHERAL_TLB_READ))) {
++                return false;
+         }
+ 
+-        return false;
++        return true;
+ }
+ 
+ /* Compute a bitmask of which rf registers are used between
+diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
+index 12788692432..fde6695d59b 100644
+--- a/src/broadcom/compiler/qpu_validate.c
++++ b/src/broadcom/compiler/qpu_validate.c
+@@ -227,7 +227,7 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
+             vpm_writes +
+             tlb_writes +
+             tsy_writes +
+-            inst->sig.ldtmu +
++            (devinfo->ver <= 42 ? inst->sig.ldtmu : 0) +
+             inst->sig.ldtlb +
+             inst->sig.ldvpm +
+             inst->sig.ldtlbu > 1) {
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index 195a0dcd232..f54ce7210fb 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -649,12 +649,14 @@ v3d_qpu_add_op_writes_vpm(enum  v3d_qpu_add_op op)
+ }
+ 
+ bool
+-v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
++v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst)
+ {
+-        if (inst->sig.ldtlb ||
+-            inst->sig.ldtlbu)
+-                return true;
++        return inst->sig.ldtlb || inst->sig.ldtlbu;
++}
+ 
++bool
++v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst)
++{
+         if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                 if (inst->alu.add.op != V3D_QPU_A_NOP &&
+                     inst->alu.add.magic_write &&
+@@ -672,6 +674,12 @@ v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
+         return false;
+ }
+ 
++bool
++v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst)
++{
++        return  v3d_qpu_writes_tlb(inst) || v3d_qpu_reads_tlb(inst);
++}
++
+ bool
+ v3d_qpu_uses_sfu(const struct v3d_qpu_instr *inst)
+ {
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index 4b34d17bd4c..dece45c5c54 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -472,6 +472,8 @@ bool v3d_qpu_magic_waddr_is_tlb(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+ bool v3d_qpu_magic_waddr_is_vpm(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+ bool v3d_qpu_magic_waddr_is_tsy(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
+ bool v3d_qpu_magic_waddr_loads_unif(enum v3d_qpu_waddr waddr) ATTRIBUTE_CONST;
++bool v3d_qpu_reads_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
++bool v3d_qpu_writes_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ bool v3d_qpu_uses_tlb(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ bool v3d_qpu_instr_is_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+ bool v3d_qpu_instr_is_legacy_sfu(const struct v3d_qpu_instr *inst) ATTRIBUTE_CONST;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0051-broadcom-qpu-add-packing-for-fmov-on-ADD-alu.patch b/projects/RPi/devices/RPi5/patches/mesa/0051-broadcom-qpu-add-packing-for-fmov-on-ADD-alu.patch
new file mode 100644
index 0000000000..0002304bd8
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0051-broadcom-qpu-add-packing-for-fmov-on-ADD-alu.patch
@@ -0,0 +1,61 @@
+From 3520cceb87fb2f9765ba7dbe2771fbd0cadca78d Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 26 Oct 2021 08:37:54 +0200
+Subject: [PATCH 051/142] broadcom/qpu: add packing for fmov on ADD alu
+
+---
+ src/broadcom/qpu/qpu_pack.c | 31 +++++++++++++++++++++++++++++++
+ 1 file changed, 31 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 180d7ab08a3..ed5a8bc667d 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -1332,6 +1332,20 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
+                 }
+                 break;
+ 
++        case V3D_QPU_A_FMOV:
++                instr->alu.add.output_pack = raddr_b & 0x3;
++
++                /* Mul alu FMOV has one additional variant */
++                int32_t unpack = (raddr_b >> 2) & 0x7;
++                if (unpack == 7)
++                        return false;
++
++                if (!v3d_qpu_float32_unpack_unpack(unpack,
++                                                   &instr->alu.add.a.unpack)) {
++                        return false;
++                }
++                break;
++
+         default:
+                 instr->alu.add.output_pack = V3D_QPU_PACK_NONE;
+                 instr->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+@@ -1992,6 +2006,23 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 break;
+         }
+ 
++        case V3D_QPU_A_FMOV: {
++                uint32_t packed;
++
++                if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
++                                               &packed)) {
++                        return false;
++                }
++                raddr_b = packed;
++
++                if (!v3d_qpu_float32_unpack_pack(instr->alu.add.a.unpack,
++                                                 &packed)) {
++                        return false;
++                }
++                raddr_b |= packed << 2;
++                break;
++        }
++
+         default:
+                 if (instr->alu.add.op != V3D_QPU_A_NOP &&
+                     (instr->alu.add.output_pack != V3D_QPU_PACK_NONE ||
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0052-broadcom-compiler-handle-rf0-flops-storage-restricti.patch b/projects/RPi/devices/RPi5/patches/mesa/0052-broadcom-compiler-handle-rf0-flops-storage-restricti.patch
new file mode 100644
index 0000000000..f173a0f4c0
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0052-broadcom-compiler-handle-rf0-flops-storage-restricti.patch
@@ -0,0 +1,155 @@
+From 7c7ab15b3c9def4bc3bb5be492228a933c325f8a Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 6 Oct 2021 13:58:27 +0200
+Subject: [PATCH 052/142] broadcom/compiler: handle rf0 flops storage
+ restriction in v71
+
+---
+ src/broadcom/compiler/qpu_schedule.c | 81 +++++++++++++++++++++++++++-
+ 1 file changed, 79 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index adb501e85ce..7048d9257b6 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -538,6 +538,10 @@ struct choose_scoreboard {
+         int ldvary_count;
+         int pending_ldtmu_count;
+         bool first_ldtmu_after_thrsw;
++
++        /* V3D 7.x */
++        int last_implicit_rf0_write_tick;
++        bool has_rf0_flops_conflict;
+ };
+ 
+ static bool
+@@ -1499,6 +1503,62 @@ update_scoreboard_tmu_tracking(struct choose_scoreboard *scoreboard,
+         }
+ }
+ 
++static void
++set_has_rf0_flops_conflict(struct choose_scoreboard *scoreboard,
++                           const struct v3d_qpu_instr *inst,
++                           const struct v3d_device_info *devinfo)
++{
++        if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick &&
++            v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
++            !inst->sig_magic) {
++                scoreboard->has_rf0_flops_conflict = true;
++        }
++}
++
++static void
++update_scoreboard_for_rf0_flops(struct choose_scoreboard *scoreboard,
++                                const struct v3d_qpu_instr *inst,
++                                const struct v3d_device_info *devinfo)
++{
++        if (devinfo->ver < 71)
++                return;
++
++        /* Thread switch restrictions:
++         *
++         * At the point of a thread switch or thread end (when the actual
++         * thread switch or thread end happens, not when the signalling
++         * instruction is processed):
++         *
++         *    - If the most recent write to rf0 was from a ldunif, ldunifa, or
++         *      ldvary instruction in which another signal also wrote to the
++         *      register file, and the final instruction of the thread section
++         *      contained a signal which wrote to the register file, then the
++         *      value of rf0 is undefined at the start of the new section
++         *
++         * Here we use the scoreboard to track if our last rf0 implicit write
++         * happens at the same time that another signal writes the register
++         * file (has_rf0_flops_conflict). We will use that information when
++         * scheduling thrsw instructions to avoid putting anything in their
++         * last delay slot which has a signal that writes to the register file.
++         */
++
++        /* Reset tracking if we have an explicit rf0 write or we are starting
++         * a new thread section.
++         */
++        if (v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0) ||
++            scoreboard->tick - scoreboard->last_thrsw_tick == 3) {
++                scoreboard->last_implicit_rf0_write_tick = -10;
++                scoreboard->has_rf0_flops_conflict = false;
++        }
++
++        if (v3d_qpu_writes_rf0_implicitly(devinfo, inst)) {
++                scoreboard->last_implicit_rf0_write_tick = inst->sig.ldvary ?
++                        scoreboard->tick + 1 : scoreboard->tick;
++        }
++
++        set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
++}
++
+ static void
+ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
+                              const struct qinst *qinst,
+@@ -1542,6 +1602,8 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
+         if (inst->sig.ldvary)
+                 scoreboard->last_ldvary_tick = scoreboard->tick;
+ 
++        update_scoreboard_for_rf0_flops(scoreboard, inst, devinfo);
++
+         update_scoreboard_tmu_tracking(scoreboard, qinst);
+ }
+ 
+@@ -1812,6 +1874,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+  */
+ static bool
+ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
++                                          struct choose_scoreboard *scoreboard,
+                                           const struct qinst *qinst,
+                                           uint32_t slot)
+ {
+@@ -1842,6 +1905,17 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+         if (v3d_qpu_writes_unifa(c->devinfo, &qinst->qpu))
+                 return false;
+ 
++        /* See comment when we set has_rf0_flops_conflict for details */
++        if (c->devinfo->ver >= 71 &&
++            slot == 2 &&
++            v3d_qpu_sig_writes_address(c->devinfo, &qinst->qpu.sig) &&
++            !qinst->qpu.sig_magic) {
++                if (scoreboard->has_rf0_flops_conflict)
++                        return false;
++                if (scoreboard->last_implicit_rf0_write_tick == scoreboard->tick)
++                        return false;
++        }
++
+         return true;
+ }
+ 
+@@ -1874,7 +1948,7 @@ qpu_inst_after_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+          * also apply to instructions scheduled after the thrsw that we want
+          * to place in its delay slots.
+          */
+-        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
++        if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard, qinst, slot))
+                 return false;
+ 
+         /* TLB access is disallowed until scoreboard wait is executed, which
+@@ -1947,8 +2021,10 @@ valid_thrsw_sequence(struct v3d_compile *c, struct choose_scoreboard *scoreboard
+                      bool is_thrend)
+ {
+         for (int slot = 0; slot < instructions_in_sequence; slot++) {
+-                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, qinst, slot))
++                if (!qpu_inst_before_thrsw_valid_in_delay_slot(c, scoreboard,
++                                                               qinst, slot)) {
+                         return false;
++                }
+ 
+                 if (is_thrend &&
+                     !qpu_inst_valid_in_thrend_slot(c, qinst, slot)) {
+@@ -2718,6 +2794,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
+         scoreboard.last_setmsf_tick = -10;
+         scoreboard.last_stallable_sfu_tick = -10;
+         scoreboard.first_ldtmu_after_thrsw = true;
++        scoreboard.last_implicit_rf0_write_tick = - 10;
+ 
+         if (debug) {
+                 fprintf(stderr, "Pre-schedule instructions\n");
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0053-broadcom-compiler-enable-ldvary-pipelining-on-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0053-broadcom-compiler-enable-ldvary-pipelining-on-v71.patch
new file mode 100644
index 0000000000..ffd2489d53
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0053-broadcom-compiler-enable-ldvary-pipelining-on-v71.patch
@@ -0,0 +1,189 @@
+From 0c6910721eb50b38b3388c2d2344b6ecfe0fee58 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 27 Oct 2021 11:35:12 +0200
+Subject: [PATCH 053/142] broadcom/compiler: enable ldvary pipelining on v71
+
+---
+ src/broadcom/compiler/qpu_schedule.c | 121 ++++++++++++++++++---------
+ 1 file changed, 80 insertions(+), 41 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 7048d9257b6..334ffdc6d58 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -2312,46 +2312,72 @@ emit_branch(struct v3d_compile *c,
+ }
+ 
+ static bool
+-alu_reads_register(struct v3d_qpu_instr *inst,
++alu_reads_register(const struct v3d_device_info *devinfo,
++                   struct v3d_qpu_instr *inst,
+                    bool add, bool magic, uint32_t index)
+ {
+         uint32_t num_src;
+-        enum v3d_qpu_mux mux_a, mux_b;
+-
+-        if (add) {
++        if (add)
+                 num_src = v3d_qpu_add_op_num_src(inst->alu.add.op);
+-                mux_a = inst->alu.add.a.mux;
+-                mux_b = inst->alu.add.b.mux;
+-        } else {
++        else
+                 num_src = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+-                mux_a = inst->alu.mul.a.mux;
+-                mux_b = inst->alu.mul.b.mux;
+-        }
+ 
+-        for (int i = 0; i < num_src; i++) {
+-                if (magic) {
+-                        if (i == 0 && mux_a == index)
+-                                return true;
+-                        if (i == 1 && mux_b == index)
+-                                return true;
++        if (devinfo->ver <= 42) {
++                enum v3d_qpu_mux mux_a, mux_b;
++                if (add) {
++                        mux_a = inst->alu.add.a.mux;
++                        mux_b = inst->alu.add.b.mux;
+                 } else {
+-                        if (i == 0 && mux_a == V3D_QPU_MUX_A &&
+-                            inst->raddr_a == index) {
+-                                return true;
+-                        }
+-                        if (i == 0 && mux_a == V3D_QPU_MUX_B &&
+-                            inst->raddr_b == index) {
+-                                return true;
+-                        }
+-                        if (i == 1 && mux_b == V3D_QPU_MUX_A &&
+-                            inst->raddr_a == index) {
+-                                return true;
+-                        }
+-                        if (i == 1 && mux_b == V3D_QPU_MUX_B &&
+-                            inst->raddr_b == index) {
+-                                return true;
++                        mux_a = inst->alu.mul.a.mux;
++                        mux_b = inst->alu.mul.b.mux;
++                }
++
++                for (int i = 0; i < num_src; i++) {
++                        if (magic) {
++                                if (i == 0 && mux_a == index)
++                                        return true;
++                                if (i == 1 && mux_b == index)
++                                        return true;
++                        } else {
++                                if (i == 0 && mux_a == V3D_QPU_MUX_A &&
++                                    inst->raddr_a == index) {
++                                        return true;
++                                }
++                                if (i == 0 && mux_a == V3D_QPU_MUX_B &&
++                                    inst->raddr_b == index) {
++                                        return true;
++                                }
++                                if (i == 1 && mux_b == V3D_QPU_MUX_A &&
++                                    inst->raddr_a == index) {
++                                        return true;
++                                }
++                                if (i == 1 && mux_b == V3D_QPU_MUX_B &&
++                                    inst->raddr_b == index) {
++                                        return true;
++                                }
+                         }
+                 }
++
++                return false;
++        }
++
++        assert(devinfo->ver >= 71);
++        assert(!magic);
++
++        uint32_t raddr_a, raddr_b;
++        if (add) {
++                raddr_a = inst->alu.add.a.raddr;
++                raddr_b = inst->alu.add.b.raddr;
++        } else {
++                raddr_a = inst->alu.mul.a.raddr;
++                raddr_b = inst->alu.mul.b.raddr;
++        }
++
++        for (int i = 0; i < num_src; i++) {
++                if (i == 0 && raddr_a == index)
++                        return true;
++                if (i == 1 && raddr_b == index)
++                        return true;
+         }
+ 
+         return false;
+@@ -2386,6 +2412,8 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
+                        struct qblock *block,
+                        struct v3d_qpu_instr *inst)
+ {
++        const struct v3d_device_info *devinfo = c->devinfo;
++
+         /* We only call this if we have successfully merged an ldvary into a
+          * previous instruction.
+          */
+@@ -2398,9 +2426,9 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
+          * the ldvary destination, if it does, then moving the ldvary before
+          * it would overwrite it.
+          */
+-        if (alu_reads_register(inst, true, ldvary_magic, ldvary_index))
++        if (alu_reads_register(devinfo, inst, true, ldvary_magic, ldvary_index))
+                 return false;
+-        if (alu_reads_register(inst, false, ldvary_magic, ldvary_index))
++        if (alu_reads_register(devinfo, inst, false, ldvary_magic, ldvary_index))
+                 return false;
+ 
+         /* The implicit ldvary destination may not be written to by a signal
+@@ -2436,13 +2464,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
+         }
+ 
+         /* The previous instruction cannot have a conflicting signal */
+-        if (v3d_qpu_sig_writes_address(c->devinfo, &prev->qpu.sig))
++        if (v3d_qpu_sig_writes_address(devinfo, &prev->qpu.sig))
+                 return false;
+ 
+         uint32_t sig;
+         struct v3d_qpu_sig new_sig = prev->qpu.sig;
+         new_sig.ldvary = true;
+-        if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig))
++        if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
+                 return false;
+ 
+         /* The previous instruction cannot use flags since ldvary uses the
+@@ -2471,14 +2499,25 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
+         inst->sig_magic = false;
+         inst->sig_addr = 0;
+ 
+-        /* By moving ldvary to the previous instruction we make it update
+-         * r5 in the current one, so nothing else in it should write r5.
+-         * This should've been prevented by our dependency tracking, which
++        /* Update rf0 flops tracking for new ldvary delayed rf0 write tick */
++        if (devinfo->ver >= 71) {
++                scoreboard->last_implicit_rf0_write_tick = scoreboard->tick;
++                set_has_rf0_flops_conflict(scoreboard, inst, devinfo);
++        }
++
++        /* By moving ldvary to the previous instruction we make it update r5
++         * (rf0 for ver >= 71) in the current one, so nothing else in it
++         * should write this register.
++         *
++         * This should've been prevented by our depedency tracking, which
+          * would not allow ldvary to be paired up with an instruction that
+-         * writes r5 (since our dependency tracking doesn't know that the
+-         * ldvary write r5 happens in the next instruction).
++         * writes r5/rf0 (since our dependency tracking doesn't know that the
++         * ldvary write to r5/rf0 happens in the next instruction).
+          */
+-        assert(!v3d_qpu_writes_r5(c->devinfo, inst));
++        assert(!v3d_qpu_writes_r5(devinfo, inst));
++        assert(devinfo->ver <= 42 ||
++               (!v3d_qpu_writes_rf0_implicitly(devinfo, inst) &&
++                !v3d71_qpu_writes_waddr_explicitly(devinfo, inst, 0)));
+ 
+         return true;
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0054-broadcom-compiler-try-to-use-ldunif-a-instead-of-ldu.patch b/projects/RPi/devices/RPi5/patches/mesa/0054-broadcom-compiler-try-to-use-ldunif-a-instead-of-ldu.patch
new file mode 100644
index 0000000000..5e4dc3adce
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0054-broadcom-compiler-try-to-use-ldunif-a-instead-of-ldu.patch
@@ -0,0 +1,144 @@
+From 0670d642bb91fc68ce73f2d9fb88c482295a446d Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 28 Oct 2021 14:13:29 +0200
+Subject: [PATCH 054/142] broadcom/compiler: try to use ldunif(a) instead of
+ ldunif(a)rf in v71
+
+The rf variants need to encode the destination in the cond bits, which
+prevents them from being merged with any other instruction that needs those bits.
+
+In 4.x, ldunif(a) write to r5, which is a special register that only
+ldunif(a) and ldvary can write, so we have a special register class for
+it and only allow it for them. Then, when we need to choose a register
+for a node, we always use this register if it is available.
+
+In 7.x these instructions write to rf0, which can be used by any
+instruction, so instead of restricting rf0, we track the temps that
+are used as ldunif(a) destinations and use that information to favor
+rf0 for them.
+---
+ src/broadcom/compiler/v3d_compiler.h          |  3 ++
+ src/broadcom/compiler/vir_register_allocate.c | 34 ++++++++++++++++---
+ src/broadcom/compiler/vir_to_qpu.c            | 11 ++++--
+ 3 files changed, 41 insertions(+), 7 deletions(-)
+
+diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
+index 7e8f3bfc1a7..36adf8830b5 100644
+--- a/src/broadcom/compiler/v3d_compiler.h
++++ b/src/broadcom/compiler/v3d_compiler.h
+@@ -613,6 +613,9 @@ struct v3d_ra_node_info {
+         struct {
+                 uint32_t priority;
+                 uint8_t class_bits;
++
++                /* V3D 7.x */
++                bool is_ldunif_dst;
+         } *info;
+         uint32_t alloc_count;
+ };
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index e0adc1de7a4..1be091f8518 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -384,6 +384,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+         /* We fill the node priority after we are done inserting spills */
+         c->nodes.info[node].class_bits = class_bits;
+         c->nodes.info[node].priority = 0;
++        c->nodes.info[node].is_ldunif_dst = false;
+ }
+ 
+ /* The spill offset for this thread takes a bit of setup, so do it once at
+@@ -899,9 +900,22 @@ v3d_ra_select_accum(struct v3d_ra_select_callback_data *v3d_ra,
+ 
+ static bool
+ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
++                 unsigned int node,
+                  BITSET_WORD *regs,
+                  unsigned int *out)
+ {
++        /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
++         * so we can avoid turning them into ldunifrf (which uses the
++         * cond field to encode the dst and would prevent merge with
++         * instructions that use cond flags).
++         */
++        if (v3d_ra->nodes->info[node].is_ldunif_dst &&
++            BITSET_TEST(regs, v3d_ra->phys_index)) {
++                assert(v3d_ra->devinfo->ver >= 71);
++                *out = v3d_ra->phys_index;
++                return true;
++        }
++
+         for (int i = 0; i < PHYS_COUNT; i++) {
+                 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+                 int phys = v3d_ra->phys_index + phys_off;
+@@ -927,7 +941,7 @@ v3d_ra_select_callback(unsigned int n, BITSET_WORD *regs, void *data)
+                 return reg;
+         }
+ 
+-        if (v3d_ra_select_rf(v3d_ra, regs, &reg))
++        if (v3d_ra_select_rf(v3d_ra, n, regs, &reg))
+                 return reg;
+ 
+         /* If we ran out of physical registers try to assign an accumulator
+@@ -1139,15 +1153,24 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+                                 }
+                         }
+                 } else {
+-                        /* If the instruction has an implicit write
+-                         * we can't allocate its dest to the same
+-                         * register.
++                        /* Make sure we don't allocate the ldvary's
++                         * destination to rf0, since it would clash
++                         * with its implicit write to that register.
+                          */
+-                        if (v3d_qpu_writes_rf0_implicitly(c->devinfo, &inst->qpu)) {
++                        if (inst->qpu.sig.ldvary) {
+                                 ra_add_node_interference(c->g,
+                                                          temp_to_node(c, inst->dst.index),
+                                                          implicit_rf_nodes[0]);
+                         }
++                        /* Flag dst temps from ldunif(a) instructions
++                         * so we can try to assign rf0 to them and avoid
++                         * converting these to ldunif(a)rf.
++                         */
++                        if (inst->qpu.sig.ldunif || inst->qpu.sig.ldunifa) {
++                                const uint32_t dst_n =
++                                        temp_to_node(c, inst->dst.index);
++                                c->nodes.info[dst_n].is_ldunif_dst = true;
++                        }
+                 }
+         }
+ 
+@@ -1222,6 +1245,7 @@ v3d_register_allocate(struct v3d_compile *c)
+          * without accumulators that can have implicit writes to phys regs.
+          */
+         for (uint32_t i = 0; i < num_ra_nodes; i++) {
++                c->nodes.info[i].is_ldunif_dst = false;
+                 if (c->devinfo->has_accumulators && i < ACC_COUNT) {
+                         acc_nodes[i] = i;
+                         ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
+diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
+index afc4941fdb1..cbbb495592b 100644
+--- a/src/broadcom/compiler/vir_to_qpu.c
++++ b/src/broadcom/compiler/vir_to_qpu.c
+@@ -345,8 +345,15 @@ v3d_generate_code_block(struct v3d_compile *c,
+                                 assert(qinst->qpu.alu.add.op == V3D_QPU_A_NOP);
+                                 assert(qinst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+ 
+-                                if (!dst.magic ||
+-                                    dst.index != V3D_QPU_WADDR_R5) {
++                                bool use_rf;
++                                if (c->devinfo->has_accumulators) {
++                                        use_rf = !dst.magic ||
++                                                 dst.index != V3D_QPU_WADDR_R5;
++                                } else {
++                                        use_rf = dst.magic || dst.index != 0;
++                                }
++
++                                if (use_rf) {
+                                         assert(c->devinfo->ver >= 40);
+ 
+                                         if (qinst->qpu.sig.ldunif) {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0055-broadcom-compiler-don-t-assign-rf0-to-temps-that-con.patch b/projects/RPi/devices/RPi5/patches/mesa/0055-broadcom-compiler-don-t-assign-rf0-to-temps-that-con.patch
new file mode 100644
index 0000000000..d03707a3fc
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0055-broadcom-compiler-don-t-assign-rf0-to-temps-that-con.patch
@@ -0,0 +1,82 @@
+From cbed3b97394da09c9ae644c79e098e3ba8b5c3e8 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Fri, 29 Oct 2021 13:00:56 +0200
+Subject: [PATCH 055/142] broadcom/compiler: don't assign rf0 to temps that
+ conflict with ldvary
+
+ldvary writes to rf0 implicitly, so we don't want to allocate rf0 to
+any temps that are live across ldvary's rf0 live ranges.
+---
+ src/broadcom/compiler/vir_register_allocate.c | 39 ++++++++++++++++++-
+ 1 file changed, 38 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 1be091f8518..6f7b1ca0589 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -1019,6 +1019,7 @@ static void
+ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+                                       int *acc_nodes,
+                                       int *implicit_rf_nodes,
++                                      int last_ldvary_ip,
+                                       struct qinst *inst)
+ {
+         int32_t ip = inst->ip;
+@@ -1125,6 +1126,25 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+                 }
+         }
+ 
++        /* Don't allocate rf0 to temps that cross ranges where we have
++         * live implicit rf0 writes from ldvary. We can identify these
++         * by tracking the last ldvary instruction and explicit reads
++         * of rf0.
++         */
++        if (c->devinfo->ver >= 71 &&
++            ((inst->src[0].file == QFILE_REG && inst->src[0].index == 0) ||
++              (vir_get_nsrc(inst) > 1 &&
++               inst->src[1].file == QFILE_REG && inst->src[1].index == 0))) {
++                for (int i = 0; i < c->num_temps; i++) {
++                        if (c->temp_start[i] < ip &&
++                            c->temp_end[i] > last_ldvary_ip) {
++                                        ra_add_node_interference(c->g,
++                                                                 temp_to_node(c, i),
++                                                                 implicit_rf_nodes[0]);
++                        }
++                }
++        }
++
+         if (inst->dst.file == QFILE_TEMP) {
+                 /* Only a ldunif gets to write to R5, which only has a
+                  * single 32-bit channel of storage.
+@@ -1270,10 +1290,27 @@ v3d_register_allocate(struct v3d_compile *c)
+          * interferences.
+          */
+         int ip = 0;
++        int last_ldvary_ip = -1;
+         vir_for_each_inst_inorder(inst, c) {
+                 inst->ip = ip++;
++
++                /* ldunif(a) always write to a temporary, so we have
++                 * liveness info available to decide if rf0 is
++                 * available for them, however, ldvary is different:
++                 * it always writes to rf0 directly so we don't have
++                 * liveness information for its implicit rf0 write.
++                 *
++                 * That means the allocator may assign rf0 to a temp
++                 * that is defined while an implicit rf0 write from
++                 * ldvary is still live. We fix that by manually
++                 * tracking rf0 live ranges from ldvary instructions.
++                 */
++                if (inst->qpu.sig.ldvary)
++                        last_ldvary_ip = ip;
++
+                 update_graph_and_reg_classes_for_inst(c, acc_nodes,
+-                                                      implicit_rf_nodes, inst);
++                                                      implicit_rf_nodes,
++                                                      last_ldvary_ip, inst);
+         }
+ 
+         /* Set the register classes for all our temporaries in the graph */
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0056-broadcom-compiler-convert-mul-to-add-when-needed-to-.patch b/projects/RPi/devices/RPi5/patches/mesa/0056-broadcom-compiler-convert-mul-to-add-when-needed-to-.patch
new file mode 100644
index 0000000000..dac7b03bfc
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0056-broadcom-compiler-convert-mul-to-add-when-needed-to-.patch
@@ -0,0 +1,139 @@
+From cbaa469c09974c1574b16f559173694904fe1bb0 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 25 Oct 2021 09:38:57 +0200
+Subject: [PATCH 056/142] broadcom/compiler: convert mul to add when needed to
+ allow merge
+
+V3D 7.x added 'mov' opcodes to the ADD alu, so it is now possible to
+convert MUL alu movs into ADD alu movs to facilitate merging them with
+other MUL instructions.
+---
+ src/broadcom/compiler/qpu_schedule.c | 102 ++++++++++++++++++++++++---
+ 1 file changed, 94 insertions(+), 8 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 334ffdc6d58..caa84254998 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -1086,6 +1086,57 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+         inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
+ }
+ 
++static bool
++can_do_mul_as_add(const struct v3d_device_info *devinfo, enum v3d_qpu_mul_op op)
++{
++        switch (op) {
++        case V3D_QPU_M_MOV:
++        case V3D_QPU_M_FMOV:
++                return devinfo->ver >= 71;
++        default:
++                return false;
++        }
++}
++
++static enum v3d_qpu_mul_op
++mul_op_as_add_op(enum v3d_qpu_mul_op op)
++{
++        switch (op) {
++        case V3D_QPU_M_MOV:
++                return V3D_QPU_A_MOV;
++        case V3D_QPU_M_FMOV:
++                return V3D_QPU_A_FMOV;
++        default:
++                unreachable("unexpected mov opcode");
++        }
++}
++
++static void
++qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
++{
++        STATIC_ASSERT(sizeof(inst->alu.add) == sizeof(inst->alu.mul));
++        assert(inst->alu.mul.op != V3D_QPU_M_NOP);
++        assert(inst->alu.add.op == V3D_QPU_A_NOP);
++
++        memcpy(&inst->alu.add, &inst->alu.mul, sizeof(inst->alu.add));
++        inst->alu.add.op = mul_op_as_add_op(inst->alu.mul.op);
++        inst->alu.mul.op = V3D_QPU_M_NOP;
++
++        inst->flags.ac = inst->flags.mc;
++        inst->flags.apf = inst->flags.mpf;
++        inst->flags.auf = inst->flags.muf;
++        inst->flags.mc = V3D_QPU_COND_NONE;
++        inst->flags.mpf = V3D_QPU_PF_NONE;
++        inst->flags.muf = V3D_QPU_UF_NONE;
++
++        inst->alu.add.output_pack = inst->alu.mul.output_pack;
++        inst->alu.add.a.unpack = inst->alu.mul.a.unpack;
++        inst->alu.add.b.unpack = inst->alu.mul.b.unpack;
++        inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
++        inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
++        inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
++}
++
+ static bool
+ qpu_merge_inst(const struct v3d_device_info *devinfo,
+                struct v3d_qpu_instr *result,
+@@ -1151,17 +1202,52 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
+                 }
+         }
+ 
++        struct v3d_qpu_instr add_inst;
+         if (b->alu.mul.op != V3D_QPU_M_NOP) {
+-                if (a->alu.mul.op != V3D_QPU_M_NOP)
+-                        return false;
+-                merge.alu.mul = b->alu.mul;
++                if (a->alu.mul.op == V3D_QPU_M_NOP) {
++                        merge.alu.mul = b->alu.mul;
++
++                        merge.flags.mc = b->flags.mc;
++                        merge.flags.mpf = b->flags.mpf;
++                        merge.flags.muf = b->flags.muf;
++
++                        mul_instr = b;
++                        add_instr = a;
++                }
++                /* If a's mul op is used but its add op is not, then see if we
++                 * can convert either a's mul op or b's mul op to an add op
++                 * so we can merge.
++                 */
++                else if (a->alu.add.op == V3D_QPU_A_NOP &&
++                         can_do_mul_as_add(devinfo, b->alu.mul.op)) {
++                        add_inst = *b;
++                        qpu_convert_mul_to_add(&add_inst);
+ 
+-                merge.flags.mc = b->flags.mc;
+-                merge.flags.mpf = b->flags.mpf;
+-                merge.flags.muf = b->flags.muf;
++                        merge.alu.add = add_inst.alu.add;
+ 
+-                mul_instr = b;
+-                add_instr = a;
++                        merge.flags.ac = b->flags.mc;
++                        merge.flags.apf = b->flags.mpf;
++                        merge.flags.auf = b->flags.muf;
++
++                        mul_instr = a;
++                        add_instr = &add_inst;
++                } else if (a->alu.add.op == V3D_QPU_A_NOP &&
++                           can_do_mul_as_add(devinfo, a->alu.mul.op)) {
++                        add_inst = *a;
++                        qpu_convert_mul_to_add(&add_inst);
++
++                        merge = add_inst;
++                        merge.alu.mul = b->alu.mul;
++
++                        merge.flags.mc = b->flags.mc;
++                        merge.flags.mpf = b->flags.mpf;
++                        merge.flags.muf = b->flags.muf;
++
++                        mul_instr = b;
++                        add_instr = &add_inst;
++                } else {
++                        return false;
++                }
+         }
+ 
+         /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0057-broadcom-compiler-implement-small-immediates-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0057-broadcom-compiler-implement-small-immediates-for-v71.patch
new file mode 100644
index 0000000000..02310764ef
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0057-broadcom-compiler-implement-small-immediates-for-v71.patch
@@ -0,0 +1,418 @@
+From b59b3725fb16f4ab1ac0db86a5452a4ed6176074 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 3 Nov 2021 10:34:19 +0100
+Subject: [PATCH 057/142] broadcom/compiler: implement small immediates for v71
+
+---
+ src/broadcom/compiler/qpu_schedule.c          | 90 +++++++++++++------
+ src/broadcom/compiler/qpu_validate.c          | 20 ++++-
+ .../compiler/vir_opt_small_immediates.c       | 26 +++++-
+ src/broadcom/compiler/vir_to_qpu.c            | 11 ++-
+ src/broadcom/qpu/qpu_disasm.c                 |  1 -
+ src/broadcom/qpu/qpu_instr.c                  |  8 +-
+ src/broadcom/qpu/qpu_instr.h                  |  2 +-
+ src/broadcom/qpu/qpu_pack.c                   | 36 ++++----
+ 8 files changed, 139 insertions(+), 55 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index caa84254998..bd1c920848a 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -714,7 +714,6 @@ qpu_instruction_uses_rf(const struct v3d_device_info *devinfo,
+                     !inst->sig.small_imm_b && (inst->raddr_b == waddr))
+                         return true;
+         } else {
+-                /* FIXME: skip if small immediate */
+                 if (v3d71_qpu_reads_raddr(inst, waddr))
+                         return true;
+         }
+@@ -948,10 +947,11 @@ qpu_raddrs_used(const struct v3d_qpu_instr *a,
+         return raddrs_used;
+ }
+ 
+-/* Take two instructions and attempt to merge their raddr fields
+- * into one merged instruction. Returns false if the two instructions
+- * access more than two different rf registers between them, or more
+- * than one rf register and one small immediate.
++/* Takes two instructions and attempts to merge their raddr fields (including
++ * small immediates) into one merged instruction. For V3D 4.x, returns false
++ * if the two instructions access more than two different rf registers between
++ * them, or more than one rf register and one small immediate. For 7.x returns
++ * false if both instructions use small immediates.
+  */
+ static bool
+ qpu_merge_raddrs(struct v3d_qpu_instr *result,
+@@ -959,6 +959,27 @@ qpu_merge_raddrs(struct v3d_qpu_instr *result,
+                  const struct v3d_qpu_instr *mul_instr,
+                  const struct v3d_device_info *devinfo)
+ {
++        if (devinfo->ver >= 71) {
++                assert(add_instr->sig.small_imm_a +
++                       add_instr->sig.small_imm_b <= 1);
++                assert(add_instr->sig.small_imm_c +
++                       add_instr->sig.small_imm_d == 0);
++                assert(mul_instr->sig.small_imm_a +
++                       mul_instr->sig.small_imm_b == 0);
++                assert(mul_instr->sig.small_imm_c +
++                       mul_instr->sig.small_imm_d <= 1);
++
++                result->sig.small_imm_a = add_instr->sig.small_imm_a;
++                result->sig.small_imm_b = add_instr->sig.small_imm_b;
++                result->sig.small_imm_c = mul_instr->sig.small_imm_c;
++                result->sig.small_imm_d = mul_instr->sig.small_imm_d;
++
++                return (result->sig.small_imm_a +
++                        result->sig.small_imm_b +
++                        result->sig.small_imm_c +
++                        result->sig.small_imm_d) <= 1;
++        }
++
+         assert(devinfo->ver <= 42);
+ 
+         uint64_t raddrs_used = qpu_raddrs_used(add_instr, mul_instr);
+@@ -1060,7 +1081,8 @@ add_op_as_mul_op(enum v3d_qpu_add_op op)
+ }
+ 
+ static void
+-qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
++qpu_convert_add_to_mul(const struct v3d_device_info *devinfo,
++                       struct v3d_qpu_instr *inst)
+ {
+         STATIC_ASSERT(sizeof(inst->alu.mul) == sizeof(inst->alu.add));
+         assert(inst->alu.add.op != V3D_QPU_A_NOP);
+@@ -1084,6 +1106,18 @@ qpu_convert_add_to_mul(struct v3d_qpu_instr *inst)
+         inst->alu.add.output_pack = V3D_QPU_PACK_NONE;
+         inst->alu.add.a.unpack = V3D_QPU_UNPACK_NONE;
+         inst->alu.add.b.unpack = V3D_QPU_UNPACK_NONE;
++
++        if (devinfo->ver >= 71) {
++                assert(!inst->sig.small_imm_c && !inst->sig.small_imm_d);
++                assert(inst->sig.small_imm_a + inst->sig.small_imm_b <= 1);
++                if (inst->sig.small_imm_a) {
++                        inst->sig.small_imm_c = true;
++                        inst->sig.small_imm_a = false;
++                } else if (inst->sig.small_imm_b) {
++                        inst->sig.small_imm_d = true;
++                        inst->sig.small_imm_b = false;
++                }
++        }
+ }
+ 
+ static bool
+@@ -1135,6 +1169,16 @@ qpu_convert_mul_to_add(struct v3d_qpu_instr *inst)
+         inst->alu.mul.output_pack = V3D_QPU_PACK_NONE;
+         inst->alu.mul.a.unpack = V3D_QPU_UNPACK_NONE;
+         inst->alu.mul.b.unpack = V3D_QPU_UNPACK_NONE;
++
++        assert(!inst->sig.small_imm_a && !inst->sig.small_imm_b);
++        assert(inst->sig.small_imm_c + inst->sig.small_imm_d <= 1);
++        if (inst->sig.small_imm_c) {
++                inst->sig.small_imm_a = true;
++                inst->sig.small_imm_c = false;
++        } else if (inst->sig.small_imm_d) {
++                inst->sig.small_imm_b = true;
++                inst->sig.small_imm_d = false;
++        }
+ }
+ 
+ static bool
+@@ -1173,20 +1217,20 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
+                 else if (a->alu.mul.op == V3D_QPU_M_NOP &&
+                          can_do_add_as_mul(b->alu.add.op)) {
+                         mul_inst = *b;
+-                        qpu_convert_add_to_mul(&mul_inst);
++                        qpu_convert_add_to_mul(devinfo, &mul_inst);
+ 
+                         merge.alu.mul = mul_inst.alu.mul;
+ 
+-                        merge.flags.mc = b->flags.ac;
+-                        merge.flags.mpf = b->flags.apf;
+-                        merge.flags.muf = b->flags.auf;
++                        merge.flags.mc = mul_inst.flags.mc;
++                        merge.flags.mpf = mul_inst.flags.mpf;
++                        merge.flags.muf = mul_inst.flags.muf;
+ 
+                         add_instr = a;
+                         mul_instr = &mul_inst;
+                 } else if (a->alu.mul.op == V3D_QPU_M_NOP &&
+                            can_do_add_as_mul(a->alu.add.op)) {
+                         mul_inst = *a;
+-                        qpu_convert_add_to_mul(&mul_inst);
++                        qpu_convert_add_to_mul(devinfo, &mul_inst);
+ 
+                         merge = mul_inst;
+                         merge.alu.add = b->alu.add;
+@@ -1225,9 +1269,9 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
+ 
+                         merge.alu.add = add_inst.alu.add;
+ 
+-                        merge.flags.ac = b->flags.mc;
+-                        merge.flags.apf = b->flags.mpf;
+-                        merge.flags.auf = b->flags.muf;
++                        merge.flags.ac = add_inst.flags.ac;
++                        merge.flags.apf = add_inst.flags.apf;
++                        merge.flags.auf = add_inst.flags.auf;
+ 
+                         mul_instr = a;
+                         add_instr = &add_inst;
+@@ -1252,17 +1296,12 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
+ 
+         /* V3D 4.x and earlier use muxes to select the inputs for the ALUs and
+          * they have restrictions on the number of raddrs that can be adressed
+-         * in a single instruction.
+-         *
+-         * FIXME: for V3D 7.x we can't merge instructions if they address more
+-         * than one small immediate. For now, we don't support small immediates,
+-         * so it is not a problem.
++         * in a single instruction. In V3D 7.x, we don't have that restriction,
++         * but we are still limited to a single small immediate per instruction.
+          */
+-        if (devinfo->ver <= 42) {
+-                if (add_instr && mul_instr &&
+-                    !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
+-                                return false;
+-                }
++        if (add_instr && mul_instr &&
++            !qpu_merge_raddrs(&merge, add_instr, mul_instr, devinfo)) {
++                return false;
+         }
+ 
+         merge.sig.thrsw |= b->sig.thrsw;
+@@ -1273,7 +1312,6 @@ qpu_merge_inst(const struct v3d_device_info *devinfo,
+         merge.sig.ldtmu |= b->sig.ldtmu;
+         merge.sig.ldvary |= b->sig.ldvary;
+         merge.sig.ldvpm |= b->sig.ldvpm;
+-        merge.sig.small_imm_b |= b->sig.small_imm_b;
+         merge.sig.ldtlb |= b->sig.ldtlb;
+         merge.sig.ldtlbu |= b->sig.ldtlbu;
+         merge.sig.ucb |= b->sig.ucb;
+@@ -1933,8 +1971,6 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+                 if (c->devinfo->ver >= 71) {
+                         /* RF2-3 might be overwritten during the delay slots by
+                          * fragment shader setup.
+-                         *
+-                         * FIXME: handle small immediate cases
+                          */
+                         if (v3d71_qpu_reads_raddr(inst, 2) ||
+                             v3d71_qpu_reads_raddr(inst, 3)) {
+diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
+index fde6695d59b..41070484286 100644
+--- a/src/broadcom/compiler/qpu_validate.c
++++ b/src/broadcom/compiler/qpu_validate.c
+@@ -116,8 +116,24 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
+                 return;
+ 
+         if (devinfo->ver < 71) {
+-           if (inst->sig.small_imm_a || inst->sig.small_imm_c || inst->sig.small_imm_d)
+-              fail_instr(state, "small imm a/c/d added after V3D 7.1");
++                if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
++                    inst->sig.small_imm_d) {
++                        fail_instr(state, "small imm a/c/d added after V3D 7.1");
++                }
++        } else {
++                if ((inst->sig.small_imm_a || inst->sig.small_imm_b) &&
++                    !vir_is_add(qinst)) {
++                        fail_instr(state, "small imm a/b used but no ADD inst");
++                }
++                if ((inst->sig.small_imm_c || inst->sig.small_imm_d) &&
++                    !vir_is_mul(qinst)) {
++                        fail_instr(state, "small imm c/d used but no MUL inst");
++                }
++                if (inst->sig.small_imm_a + inst->sig.small_imm_b +
++                    inst->sig.small_imm_c + inst->sig.small_imm_d > 1) {
++                        fail_instr(state, "only one small immediate can be "
++                                   "enabled per instruction");
++                }
+         }
+ 
+         /* LDVARY writes r5 two instructions later and LDUNIF writes
+diff --git a/src/broadcom/compiler/vir_opt_small_immediates.c b/src/broadcom/compiler/vir_opt_small_immediates.c
+index df0d6c36c9b..ed5bc011964 100644
+--- a/src/broadcom/compiler/vir_opt_small_immediates.c
++++ b/src/broadcom/compiler/vir_opt_small_immediates.c
+@@ -44,7 +44,9 @@ vir_opt_small_immediates(struct v3d_compile *c)
+                 /* The small immediate value sits in the raddr B field, so we
+                  * can't have 2 small immediates in one instruction (unless
+                  * they're the same value, but that should be optimized away
+-                 * elsewhere).
++                 * elsewhere). Since 7.x we can encode small immediates in
++                 * any raddr field, but each instruction can still only use
++                 * one.
+                  */
+                 bool uses_small_imm = false;
+                 for (int i = 0; i < vir_get_nsrc(inst); i++) {
+@@ -80,7 +82,22 @@ vir_opt_small_immediates(struct v3d_compile *c)
+                          */
+                         struct v3d_qpu_sig new_sig = inst->qpu.sig;
+                         uint32_t sig_packed;
+-                        new_sig.small_imm_b = true;
++                        if (c->devinfo->ver <= 42) {
++                                new_sig.small_imm_b = true;
++                        } else {
++                               if (vir_is_add(inst)) {
++                                       if (i == 0)
++                                               new_sig.small_imm_a = true;
++                                       else
++                                               new_sig.small_imm_b = true;
++                               } else {
++                                       if (i == 0)
++                                               new_sig.small_imm_c = true;
++                                       else
++                                               new_sig.small_imm_d = true;
++                               }
++                        }
++
+                         if (!v3d_qpu_sig_pack(c->devinfo, &new_sig, &sig_packed))
+                                 continue;
+ 
+@@ -89,7 +106,10 @@ vir_opt_small_immediates(struct v3d_compile *c)
+                                 vir_dump_inst(c, inst);
+                                 fprintf(stderr, "\n");
+                         }
+-                        inst->qpu.sig.small_imm_b = true;
++                        inst->qpu.sig.small_imm_a = new_sig.small_imm_a;
++                        inst->qpu.sig.small_imm_b = new_sig.small_imm_b;
++                        inst->qpu.sig.small_imm_c = new_sig.small_imm_c;
++                        inst->qpu.sig.small_imm_d = new_sig.small_imm_d;
+                         inst->qpu.raddr_b = packed;
+ 
+                         inst->src[i].file = QFILE_SMALL_IMM;
+diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
+index cbbb495592b..4ed184cbbcb 100644
+--- a/src/broadcom/compiler/vir_to_qpu.c
++++ b/src/broadcom/compiler/vir_to_qpu.c
+@@ -89,8 +89,15 @@ new_qpu_nop_before(struct qinst *inst)
+ static void
+ v3d71_set_src(struct v3d_qpu_instr *instr, uint8_t *raddr, struct qpu_reg src)
+ {
+-        if (src.smimm)
+-                unreachable("v3d71_set_src: pending handling small immediates");
++        /* If we have a small immediate move it from inst->raddr_b to the
++         * corresponding raddr.
++         */
++        if (src.smimm) {
++                assert(instr->sig.small_imm_a || instr->sig.small_imm_b ||
++                       instr->sig.small_imm_c || instr->sig.small_imm_d);
++                *raddr = instr->raddr_b;
++                return;
++        }
+ 
+         assert(!src.magic);
+         *raddr = src.index;
+diff --git a/src/broadcom/qpu/qpu_disasm.c b/src/broadcom/qpu/qpu_disasm.c
+index b613de781dc..c1590a760de 100644
+--- a/src/broadcom/qpu/qpu_disasm.c
++++ b/src/broadcom/qpu/qpu_disasm.c
+@@ -113,7 +113,6 @@ v3d71_qpu_disasm_raddr(struct disasm_state *disasm,
+         }
+ 
+         if (is_small_imm) {
+-                unreachable("Pending handling small immediates");
+                 uint32_t val;
+                 ASSERTED bool ok =
+                         v3d_qpu_small_imm_unpack(disasm->devinfo,
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index f54ce7210fb..c30f4bbbccf 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -975,10 +975,10 @@ v3d71_qpu_reads_raddr(const struct v3d_qpu_instr *inst, uint8_t raddr)
+         int add_nsrc = v3d_qpu_add_op_num_src(inst->alu.add.op);
+         int mul_nsrc = v3d_qpu_mul_op_num_src(inst->alu.mul.op);
+ 
+-        return (add_nsrc > 0 && inst->alu.add.a.raddr == raddr) ||
+-               (add_nsrc > 1 && inst->alu.add.b.raddr == raddr) ||
+-               (mul_nsrc > 0 && inst->alu.mul.a.raddr == raddr) ||
+-               (mul_nsrc > 1 && inst->alu.mul.b.raddr == raddr);
++        return (add_nsrc > 0 && !inst->sig.small_imm_a && inst->alu.add.a.raddr == raddr) ||
++               (add_nsrc > 1 && !inst->sig.small_imm_b && inst->alu.add.b.raddr == raddr) ||
++               (mul_nsrc > 0 && !inst->sig.small_imm_c && inst->alu.mul.a.raddr == raddr) ||
++               (mul_nsrc > 1 && !inst->sig.small_imm_d && inst->alu.mul.b.raddr == raddr);
+ }
+ 
+ bool
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index dece45c5c54..d408fb426fa 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -402,7 +402,7 @@ struct v3d_qpu_instr {
+         uint8_t sig_addr;
+         bool sig_magic; /* If the signal writes to a magic address */
+         uint8_t raddr_a; /* V3D 4.x */
+-        uint8_t raddr_b; /* V3D 4.x*/
++        uint8_t raddr_b; /* V3D 4.x (holds packed small immediate in 7.x too) */
+         struct v3d_qpu_flags flags;
+ 
+         union {
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index ed5a8bc667d..7984712d527 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -1218,16 +1218,11 @@ v3d71_qpu_add_unpack(const struct v3d_device_info *devinfo, uint64_t packed_inst
+ 
+         instr->alu.add.op = desc->op;
+ 
+-        /* FADD/FADDNF and FMIN/FMAX are determined by the orders of the
++        /* FADD/FADDNF and FMIN/FMAX are determined by the order of the
+          * operands.
+          */
+-        /* FIXME: for now hardcoded values, until we got the small_imm support
+-         * in place
+-         */
+-        uint32_t small_imm_a = 0;
+-        uint32_t small_imm_b = 0;
+-        if (small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
+-            small_imm_b *256 + (op & 3) * 64 + raddr_b) {
++        if (instr->sig.small_imm_a * 256 + ((op >> 2) & 3) * 64 + raddr_a >
++            instr->sig.small_imm_b * 256 + (op & 3) * 64 + raddr_b) {
+                 if (instr->alu.add.op == V3D_QPU_A_FMIN)
+                         instr->alu.add.op = V3D_QPU_A_FMAX;
+                 if (instr->alu.add.op == V3D_QPU_A_FADD)
+@@ -1858,11 +1853,6 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+                 uint32_t output_pack;
+                 uint32_t a_unpack;
+                 uint32_t b_unpack;
+-                /* FIXME: for now hardcoded values, until we got the small_imm
+-                 * support in place
+-                 */
+-                uint32_t small_imm_a = 0;
+-                uint32_t small_imm_b = 0;
+ 
+                 if (instr->alu.add.op != V3D_QPU_A_FCMP) {
+                         if (!v3d_qpu_float32_pack_pack(instr->alu.add.output_pack,
+@@ -1886,8 +1876,8 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+                  * distinguished by which order their operands come in.
+                  */
+                 bool ordering =
+-                        small_imm_a * 256 + a_unpack * 64 + raddr_a >
+-                        small_imm_b * 256 + b_unpack * 64 + raddr_b;
++                        instr->sig.small_imm_a * 256 + a_unpack * 64 + raddr_a >
++                        instr->sig.small_imm_b * 256 + b_unpack * 64 + raddr_b;
+                 if (((instr->alu.add.op == V3D_QPU_A_FMIN ||
+                       instr->alu.add.op == V3D_QPU_A_FADD) && ordering) ||
+                     ((instr->alu.add.op == V3D_QPU_A_FMAX ||
+@@ -1901,6 +1891,22 @@ v3d71_qpu_add_pack(const struct v3d_device_info *devinfo,
+                         temp = raddr_a;
+                         raddr_a = raddr_b;
+                         raddr_b = temp;
++
++                        /* If we are swapping raddr_a/b we also need to swap
++                         * small_imm_a/b.
++                         */
++                        if (instr->sig.small_imm_a || instr->sig.small_imm_b) {
++                                assert(instr->sig.small_imm_a !=
++                                       instr->sig.small_imm_b);
++                                struct v3d_qpu_sig new_sig = instr->sig;
++                                new_sig.small_imm_a = !instr->sig.small_imm_a;
++                                new_sig.small_imm_b = !instr->sig.small_imm_b;
++                                uint32_t sig;
++                                if (!v3d_qpu_sig_pack(devinfo, &new_sig, &sig))
++                                    return false;
++                            *packed_instr &= ~V3D_QPU_SIG_MASK;
++                            *packed_instr |= QPU_SET_FIELD(sig, V3D_QPU_SIG);
++                        }
+                 }
+ 
+                 opcode |= a_unpack << 2;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0058-broadcom-compiler-update-thread-end-restrictions-for.patch b/projects/RPi/devices/RPi5/patches/mesa/0058-broadcom-compiler-update-thread-end-restrictions-for.patch
new file mode 100644
index 0000000000..cd5c07f5eb
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0058-broadcom-compiler-update-thread-end-restrictions-for.patch
@@ -0,0 +1,61 @@
+From 3af87d2672da7c928ecf8a0a1cd1bef8a6729364 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 22 Nov 2021 12:56:03 +0100
+Subject: [PATCH 058/142] broadcom/compiler: update thread end restrictions for
+ v7.x
+
+In 4.x it is not allowed to write to the register file in the last
+3 instructions, but in 7.x we only have this restriction in the
+thread end instruction itself, and only if the write comes from
+the ALU ports.
+---
+ src/broadcom/compiler/qpu_schedule.c | 31 ++++++++++++++++++++--------
+ 1 file changed, 22 insertions(+), 9 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index bd1c920848a..cba16c77d67 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -1938,17 +1938,30 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+                         return false;
+                 }
+ 
+-                /* No writing physical registers at the end. */
+-                bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
+-                bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
+-                if ((!add_is_nop && !inst->alu.add.magic_write) ||
+-                    (!mul_is_nop && !inst->alu.mul.magic_write)) {
+-                        return false;
++                if (c->devinfo->ver <= 42) {
++                        /* No writing physical registers at the end. */
++                        bool add_is_nop = inst->alu.add.op == V3D_QPU_A_NOP;
++                        bool mul_is_nop = inst->alu.mul.op == V3D_QPU_M_NOP;
++                        if ((!add_is_nop && !inst->alu.add.magic_write) ||
++                            (!mul_is_nop && !inst->alu.mul.magic_write)) {
++                                return false;
++                        }
++
++                        if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
++                            !inst->sig_magic) {
++                                return false;
++                        }
+                 }
+ 
+-                if (v3d_qpu_sig_writes_address(c->devinfo, &inst->sig) &&
+-                    !inst->sig_magic) {
+-                        return false;
++                if (c->devinfo->ver >= 71) {
++                        /* The thread end instruction must not write to the
++                         * register file via the add/mul ALUs.
++                         */
++                        if (slot == 0 &&
++                            (!inst->alu.add.magic_write ||
++                             !inst->alu.mul.magic_write)) {
++                                return false;
++                        }
+                 }
+ 
+                 if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0059-broadcom-compiler-update-ldvary-thread-switch-delay-.patch b/projects/RPi/devices/RPi5/patches/mesa/0059-broadcom-compiler-update-ldvary-thread-switch-delay-.patch
new file mode 100644
index 0000000000..515f12d5d5
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0059-broadcom-compiler-update-ldvary-thread-switch-delay-.patch
@@ -0,0 +1,112 @@
+From 7cfd5b808bb2f1cb17f57435cb5d411c4ac3aa6c Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 23 Nov 2021 10:04:49 +0100
+Subject: [PATCH 059/142] broadcom/compiler: update ldvary thread switch delay
+ slot restriction for v7.x
+
+V3D 7.x has no accumulators (which would not survive a thread switch), so
+the only restriction left is that ldvary can't be placed in the second
+delay slot of a thread switch.
+
+shader-db results for UnrealEngine4 shaders:
+
+total instructions in shared programs: 446458 -> 446401 (-0.01%)
+instructions in affected programs: 13492 -> 13435 (-0.42%)
+helped: 58
+HURT: 3
+Instructions are helped.
+
+total nops in shared programs: 19571 -> 19541 (-0.15%)
+nops in affected programs: 161 -> 131 (-18.63%)
+helped: 30
+HURT: 0
+Nops are helped.
+---
+ src/broadcom/compiler/qpu_schedule.c | 33 +++++++++++++++++++++-------
+ src/broadcom/compiler/qpu_validate.c | 10 +++++++--
+ 2 files changed, 33 insertions(+), 10 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index cba16c77d67..32f651851cf 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -1491,11 +1491,20 @@ retry:
+                          * ldvary now if the follow-up fixup would place
+                          * it in the delay slots of a thrsw, which is not
+                          * allowed and would prevent the fixup from being
+-                         * successful.
++                         * successful. In V3D 7.x we can allow this to happen
++                         * as long as it is not the last delay slot.
+                          */
+-                        if (inst->sig.ldvary &&
+-                            scoreboard->last_thrsw_tick + 2 >= scoreboard->tick - 1) {
+-                                continue;
++                        if (inst->sig.ldvary) {
++                                if (c->devinfo->ver <= 42 &&
++                                    scoreboard->last_thrsw_tick + 2 >=
++                                    scoreboard->tick - 1) {
++                                        continue;
++                                }
++                                if (c->devinfo->ver >= 71 &&
++                                    scoreboard->last_thrsw_tick + 2 ==
++                                    scoreboard->tick - 1) {
++                                        continue;
++                                }
+                         }
+ 
+                         /* We can emit a new tmu lookup with a previous ldtmu
+@@ -2020,8 +2029,12 @@ qpu_inst_before_thrsw_valid_in_delay_slot(struct v3d_compile *c,
+         if (slot > 0 && v3d_qpu_instr_is_legacy_sfu(&qinst->qpu))
+                 return false;
+ 
+-        if (slot > 0 && qinst->qpu.sig.ldvary)
+-                return false;
++        if (qinst->qpu.sig.ldvary) {
++                if (c->devinfo->ver <= 42 && slot > 0)
++                        return false;
++                if (c->devinfo->ver >= 71 && slot == 2)
++                        return false;
++        }
+ 
+         /* unifa and the following 3 instructions can't overlap a
+          * thread switch/end. The docs further clarify that this means
+@@ -2618,9 +2631,13 @@ fixup_pipelined_ldvary(struct v3d_compile *c,
+ 
+         /* We can't put an ldvary in the delay slots of a thrsw. We should've
+          * prevented this when pairing up the ldvary with another instruction
+-         * and flagging it for a fixup.
++         * and flagging it for a fixup. In V3D 7.x this is limited only to the
++         * second delay slot.
+          */
+-        assert(scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1);
++        assert((devinfo->ver <= 42 &&
++                scoreboard->last_thrsw_tick + 2 < scoreboard->tick - 1) ||
++               (devinfo->ver >= 71 &&
++                scoreboard->last_thrsw_tick + 2 != scoreboard->tick - 1));
+ 
+         /* Move the ldvary to the previous instruction and remove it from the
+          * current one.
+diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
+index 41070484286..4f09aa8aef4 100644
+--- a/src/broadcom/compiler/qpu_validate.c
++++ b/src/broadcom/compiler/qpu_validate.c
+@@ -215,8 +215,14 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
+                                    "SFU write started during THRSW delay slots ");
+                 }
+ 
+-                if (inst->sig.ldvary)
+-                        fail_instr(state, "LDVARY during THRSW delay slots");
++                if (inst->sig.ldvary) {
++                        if (devinfo->ver <= 42)
++                                fail_instr(state, "LDVARY during THRSW delay slots");
++                        if (devinfo->ver >= 71 &&
++                            state->ip - state->last_thrsw_ip == 2) {
++                                fail_instr(state, "LDVARY in 2nd THRSW delay slot");
++                        }
++                }
+         }
+ 
+         (void)qpu_magic_waddr_matches; /* XXX */
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0060-broadcom-compiler-lift-restriction-for-branch-msfign.patch b/projects/RPi/devices/RPi5/patches/mesa/0060-broadcom-compiler-lift-restriction-for-branch-msfign.patch
new file mode 100644
index 0000000000..7c78c6938b
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0060-broadcom-compiler-lift-restriction-for-branch-msfign.patch
@@ -0,0 +1,30 @@
+From ca4063d627cd31c589a8e8688f2876dd8211d1bc Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 25 Nov 2021 08:31:02 +0100
+Subject: [PATCH 060/142] broadcom/compiler: lift restriction for branch +
+ msfign after setmsf for v7.x
+
+---
+ src/broadcom/compiler/qpu_schedule.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 32f651851cf..476eae691ab 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -2373,10 +2373,11 @@ emit_branch(struct v3d_compile *c,
+         assert(scoreboard->last_branch_tick + 3 < branch_tick);
+         assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
+ 
+-        /* Can't place a branch with msfign != 0 and cond != 0,2,3 after
++        /* V3D 4.x can't place a branch with msfign != 0 and cond != 0,2,3 after
+          * setmsf.
+          */
+         bool is_safe_msf_branch =
++                c->devinfo->ver >= 71 ||
+                 inst->qpu.branch.msfign == V3D_QPU_MSFIGN_NONE ||
+                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_ALWAYS ||
+                 inst->qpu.branch.cond == V3D_QPU_BRANCH_COND_A0 ||
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0061-broadcom-compiler-start-allocating-from-RF-4-in-V7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0061-broadcom-compiler-start-allocating-from-RF-4-in-V7.x.patch
new file mode 100644
index 0000000000..8bff29c318
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0061-broadcom-compiler-start-allocating-from-RF-4-in-V7.x.patch
@@ -0,0 +1,38 @@
+From 167510aa43bbcf06e57a64495cee40e8cdaf5f8b Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Fri, 26 Nov 2021 10:37:05 +0100
+Subject: [PATCH 061/142] broadcom/compiler: start allocating from RF 4 in V7.x
+
+In V3D 4.x we start at RF3 so that we allocate RF0-2 only if there
+aren't any other RFs available. This is useful with small shaders
+to ensure that our TLB writes don't use these registers because
+these are the last instructions we emit in fragment shaders and
+the last instructions in a program can't write to these registers,
+so if we do, we need to emit NOPs.
+
+In V3D 7.x the registers affected by this restriction are RF2-3,
+so we choose to start at RF4.
+---
+ src/broadcom/compiler/vir_register_allocate.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 6f7b1ca0589..440b093a636 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -1234,9 +1234,10 @@ v3d_register_allocate(struct v3d_compile *c)
+                 .phys_index = phys_index,
+                 .next_acc = 0,
+                 /* Start at RF3, to try to keep the TLB writes from using
+-                 * RF0-2.
++                 * RF0-2. Start at RF4 in 7.x to prevent TLB writes from
++                 * using RF2-3.
+                  */
+-                .next_phys = 3,
++                .next_phys = c->devinfo->ver <= 42 ? 3 : 4,
+                 .nodes = &c->nodes,
+                 .devinfo = c->devinfo,
+         };
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0062-broadcom-compiler-validate-restrictions-after-TLB-Z-.patch b/projects/RPi/devices/RPi5/patches/mesa/0062-broadcom-compiler-validate-restrictions-after-TLB-Z-.patch
new file mode 100644
index 0000000000..f1f210e47a
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0062-broadcom-compiler-validate-restrictions-after-TLB-Z-.patch
@@ -0,0 +1,71 @@
+From d47ea903b96e43b07bdef21f8026da818e30fcd1 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 25 Nov 2021 13:00:34 +0100
+Subject: [PATCH 062/142] broadcom/compiler: validate restrictions after TLB Z
+ write
+
+---
+ src/broadcom/compiler/qpu_validate.c | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+
+diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
+index 4f09aa8aef4..1082fb7d50a 100644
+--- a/src/broadcom/compiler/qpu_validate.c
++++ b/src/broadcom/compiler/qpu_validate.c
+@@ -41,6 +41,7 @@ struct v3d_qpu_validate_state {
+         int last_sfu_write;
+         int last_branch_ip;
+         int last_thrsw_ip;
++        int first_tlb_z_write;
+ 
+         /* Set when we've found the last-THRSW signal, or if we were started
+          * in single-segment mode.
+@@ -110,11 +111,37 @@ static void
+ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
+ {
+         const struct v3d_device_info *devinfo = state->c->devinfo;
++
++        if (qinst->is_tlb_z_write && state->ip < state->first_tlb_z_write)
++                state->first_tlb_z_write = state->ip;
++
+         const struct v3d_qpu_instr *inst = &qinst->qpu;
+ 
++        if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH &&
++            state->first_tlb_z_write >= 0 &&
++            state->ip > state->first_tlb_z_write &&
++            inst->branch.msfign != V3D_QPU_MSFIGN_NONE &&
++            inst->branch.cond != V3D_QPU_BRANCH_COND_ALWAYS &&
++            inst->branch.cond != V3D_QPU_BRANCH_COND_A0 &&
++            inst->branch.cond != V3D_QPU_BRANCH_COND_NA0) {
++                fail_instr(state, "Implicit branch MSF read after TLB Z write");
++        }
++
+         if (inst->type != V3D_QPU_INSTR_TYPE_ALU)
+                 return;
+ 
++        if (inst->alu.add.op == V3D_QPU_A_SETMSF &&
++            state->first_tlb_z_write >= 0 &&
++            state->ip > state->first_tlb_z_write) {
++                fail_instr(state, "SETMSF after TLB Z write");
++        }
++
++        if (state->first_tlb_z_write >= 0 &&
++            state->ip > state->first_tlb_z_write &&
++            inst->alu.add.op == V3D_QPU_A_MSF) {
++                fail_instr(state, "MSF read after TLB Z write");
++        }
++
+         if (devinfo->ver < 71) {
+                 if (inst->sig.small_imm_a || inst->sig.small_imm_c ||
+                     inst->sig.small_imm_d) {
+@@ -348,6 +375,7 @@ qpu_validate(struct v3d_compile *c)
+                 .last_sfu_write = -10,
+                 .last_thrsw_ip = -10,
+                 .last_branch_ip = -10,
++                .first_tlb_z_write = INT_MAX,
+                 .ip = 0,
+ 
+                 .last_thrsw_found = !c->last_thrsw,
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0063-broadcom-compiler-lift-restriction-on-vpmwt-in-last-.patch b/projects/RPi/devices/RPi5/patches/mesa/0063-broadcom-compiler-lift-restriction-on-vpmwt-in-last-.patch
new file mode 100644
index 0000000000..7cfdab4c05
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0063-broadcom-compiler-lift-restriction-on-vpmwt-in-last-.patch
@@ -0,0 +1,26 @@
+From 6cdf01fad49489b5fc66d231b527de5245d5de32 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 29 Nov 2021 13:23:11 +0100
+Subject: [PATCH 063/142] broadcom/compiler: lift restriction on vpmwt in last
+ instruction for V3D 7.x
+
+---
+ src/broadcom/compiler/qpu_schedule.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 476eae691ab..77fb6a794e6 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -1934,7 +1934,7 @@ qpu_inst_valid_in_thrend_slot(struct v3d_compile *c,
+         if (slot > 0 && qinst->uniform != ~0)
+                 return false;
+ 
+-        if (v3d_qpu_waits_vpm(inst))
++        if (c->devinfo->ver <= 42 && v3d_qpu_waits_vpm(inst))
+                 return false;
+ 
+         if (inst->sig.ldvary)
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0064-broadcom-compiler-fix-up-copy-propagation-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0064-broadcom-compiler-fix-up-copy-propagation-for-v71.patch
new file mode 100644
index 0000000000..080764c6d0
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0064-broadcom-compiler-fix-up-copy-propagation-for-v71.patch
@@ -0,0 +1,134 @@
+From acc54637f0787ba4dc887130c25c628ccdaf4e38 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 9 Nov 2021 11:34:59 +0100
+Subject: [PATCH 064/142] broadcom/compiler: fix up copy propagation for v71
+
+Update rules for unsafe copy propagations to match v7.x.
+---
+ .../compiler/vir_opt_copy_propagate.c         | 83 +++++++++++++------
+ 1 file changed, 56 insertions(+), 27 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_opt_copy_propagate.c b/src/broadcom/compiler/vir_opt_copy_propagate.c
+index c4aa7255a17..1260838ca05 100644
+--- a/src/broadcom/compiler/vir_opt_copy_propagate.c
++++ b/src/broadcom/compiler/vir_opt_copy_propagate.c
+@@ -35,7 +35,7 @@
+ #include "v3d_compiler.h"
+ 
+ static bool
+-is_copy_mov(struct qinst *inst)
++is_copy_mov(const struct v3d_device_info *devinfo, struct qinst *inst)
+ {
+         if (!inst)
+                 return false;
+@@ -62,36 +62,65 @@ is_copy_mov(struct qinst *inst)
+                 return false;
+         }
+ 
+-        switch (inst->src[0].file) {
+-        case QFILE_MAGIC:
+-                /* No copy propagating from R3/R4/R5 -- the MOVs from those
+-                 * are there to register allocate values produced into R3/4/5
+-                 * to other regs (though hopefully r3/4/5).
+-                 */
+-                switch (inst->src[0].index) {
+-                case V3D_QPU_WADDR_R3:
+-                case V3D_QPU_WADDR_R4:
+-                case V3D_QPU_WADDR_R5:
+-                        return false;
++        if (devinfo->ver <= 42) {
++                switch (inst->src[0].file) {
++                case QFILE_MAGIC:
++                        /* No copy propagating from R3/R4/R5 -- the MOVs from
++                         * those are there to register allocate values produced
++                         * into R3/4/5 to other regs (though hopefully r3/4/5).
++                         */
++                        switch (inst->src[0].index) {
++                        case V3D_QPU_WADDR_R3:
++                        case V3D_QPU_WADDR_R4:
++                        case V3D_QPU_WADDR_R5:
++                                return false;
++                        default:
++                                break;
++                        }
++                        break;
++
++                case QFILE_REG:
++                        switch (inst->src[0].index) {
++                        case 0:
++                        case 1:
++                        case 2:
++                                /* MOVs from rf0/1/2 are only to track the live
++                                 * intervals for W/centroid W/Z.
++                                 */
++                                return false;
++                        }
++                        break;
++
+                 default:
+                         break;
+                 }
+-                break;
+-
+-        case QFILE_REG:
+-                switch (inst->src[0].index) {
+-                case 0:
+-                case 1:
+-                case 2:
+-                        /* MOVs from rf0/1/2 are only to track the live
++        } else {
++                assert(devinfo->ver >= 71);
++                switch (inst->src[0].file) {
++                case QFILE_REG:
++                        switch (inst->src[0].index) {
++                        /* MOVs from rf1/2/3 are only to track the live
+                          * intervals for W/centroid W/Z.
++                         *
++                         * Note: rf0 can be implicitly written by ldvary
++                         * (no temp involved), so it is not an SSA value and
++                         * could clash with writes to other temps that are
++                         * also allocated to rf0. In theory, that would mean
++                         * that we can't copy propagate from it, but we handle
++                         * this at register allocation time, preventing temps
++                         * from being allocated to rf0 while the rf0 value from
++                         * ldvary is still live.
+                          */
+-                        return false;
+-                }
+-                break;
++                        case 1:
++                        case 2:
++                        case 3:
++                                return false;
++                        }
++                        break;
+ 
+-        default:
+-                break;
++                default:
++                        break;
++                }
+         }
+ 
+         return true;
+@@ -135,7 +164,7 @@ try_copy_prop(struct v3d_compile *c, struct qinst *inst, struct qinst **movs)
+                  */
+                 struct qinst *mov = movs[inst->src[i].index];
+                 if (!mov) {
+-                        if (!is_copy_mov(c->defs[inst->src[i].index]))
++                        if (!is_copy_mov(c->devinfo, c->defs[inst->src[i].index]))
+                                 continue;
+                         mov = c->defs[inst->src[i].index];
+ 
+@@ -245,7 +274,7 @@ vir_opt_copy_propagate(struct v3d_compile *c)
+ 
+                         apply_kills(c, movs, inst);
+ 
+-                        if (is_copy_mov(inst))
++                        if (is_copy_mov(c->devinfo, inst))
+                                 movs[inst->dst.index] = inst;
+                 }
+         }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0065-broadcom-qpu-new-packing-conversion-v71-instructions.patch b/projects/RPi/devices/RPi5/patches/mesa/0065-broadcom-qpu-new-packing-conversion-v71-instructions.patch
new file mode 100644
index 0000000000..5bd7e35514
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0065-broadcom-qpu-new-packing-conversion-v71-instructions.patch
@@ -0,0 +1,150 @@
+From c340f7f1eb4a1e5c0fafe1ea2f801f2ebaf82d8d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 26 Nov 2021 01:24:12 +0100
+Subject: [PATCH 065/142] broadcom/qpu: new packing/conversion v71 instructions
+
+This commit adds the qpu definitions for several new v71
+instructions.
+
+Packing:
+  * vpack does a 2x32 to 2x16 bit integer pack
+  * v8pack: Pack 2 x 2x16 bit integers into 4x8 bits
+  * v10pack packs parts of 2 2x16 bit integer into r10g10b10a2.
+  * v11fpack packs parts of 2 2x16 bit float into r11g11b10 rounding
+    to nearest
+
+Conversion to unorm/snorm:
+  * vftounorm8/vftosnorm8: converts from 2x16-bit floating point
+    to 2x8 bit unorm/snorm.
+  * ftounorm16/ftosnorm16: converts floating point to 16-bit
+    unorm/snorm
+  * vftounorm10lo: Convert 2x16-bit floating point to 2x10-bit unorm
+  * vftounorm10hi: Convert 2x16-bit floating point to one 2-bit and one 10-bit unorm
+---
+ src/broadcom/qpu/qpu_instr.c | 20 ++++++++++++++++++++
+ src/broadcom/qpu/qpu_instr.h | 12 ++++++++++++
+ src/broadcom/qpu/qpu_pack.c  | 12 ++++++++++++
+ 3 files changed, 44 insertions(+)
+
+diff --git a/src/broadcom/qpu/qpu_instr.c b/src/broadcom/qpu/qpu_instr.c
+index c30f4bbbccf..44f20618a5a 100644
+--- a/src/broadcom/qpu/qpu_instr.c
++++ b/src/broadcom/qpu/qpu_instr.c
+@@ -179,6 +179,10 @@ v3d_qpu_add_op_name(enum v3d_qpu_add_op op)
+                 [V3D_QPU_A_UTOF] = "utof",
+                 [V3D_QPU_A_MOV] = "mov",
+                 [V3D_QPU_A_FMOV] = "fmov",
++                [V3D_QPU_A_VPACK] = "vpack",
++                [V3D_QPU_A_V8PACK] = "v8pack",
++                [V3D_QPU_A_V10PACK] = "v10pack",
++                [V3D_QPU_A_V11FPACK] = "v11fpack",
+         };
+ 
+         if (op >= ARRAY_SIZE(op_names))
+@@ -201,6 +205,12 @@ v3d_qpu_mul_op_name(enum v3d_qpu_mul_op op)
+                 [V3D_QPU_M_MOV] = "mov",
+                 [V3D_QPU_M_NOP] = "nop",
+                 [V3D_QPU_M_FMUL] = "fmul",
++                [V3D_QPU_M_FTOUNORM16] = "ftounorm16",
++                [V3D_QPU_M_FTOSNORM16] = "ftosnorm16",
++                [V3D_QPU_M_VFTOUNORM8] = "vftounorm8",
++                [V3D_QPU_M_VFTOSNORM8] = "vftosnorm8",
++                [V3D_QPU_M_VFTOUNORM10LO] = "vftounorm10lo",
++                [V3D_QPU_M_VFTOUNORM10HI] = "vftounorm10hi",
+         };
+ 
+         if (op >= ARRAY_SIZE(op_names))
+@@ -463,6 +473,10 @@ static const uint8_t add_op_args[] = {
+ 
+         [V3D_QPU_A_MOV] = D | A,
+         [V3D_QPU_A_FMOV] = D | A,
++        [V3D_QPU_A_VPACK] = D | A | B,
++        [V3D_QPU_A_V8PACK] = D | A | B,
++        [V3D_QPU_A_V10PACK] = D | A | B,
++        [V3D_QPU_A_V11FPACK] = D | A | B,
+ };
+ 
+ static const uint8_t mul_op_args[] = {
+@@ -476,6 +490,12 @@ static const uint8_t mul_op_args[] = {
+         [V3D_QPU_M_NOP] = 0,
+         [V3D_QPU_M_MOV] = D | A,
+         [V3D_QPU_M_FMUL] = D | A | B,
++        [V3D_QPU_M_FTOUNORM16] = D | A,
++        [V3D_QPU_M_FTOSNORM16] = D | A,
++        [V3D_QPU_M_VFTOUNORM8] = D | A,
++        [V3D_QPU_M_VFTOSNORM8] = D | A,
++        [V3D_QPU_M_VFTOUNORM10LO] = D | A,
++        [V3D_QPU_M_VFTOUNORM10HI] = D | A,
+ };
+ 
+ bool
+diff --git a/src/broadcom/qpu/qpu_instr.h b/src/broadcom/qpu/qpu_instr.h
+index d408fb426fa..56eee9f9cac 100644
+--- a/src/broadcom/qpu/qpu_instr.h
++++ b/src/broadcom/qpu/qpu_instr.h
+@@ -231,6 +231,10 @@ enum v3d_qpu_add_op {
+         /* V3D 7.x */
+         V3D_QPU_A_FMOV,
+         V3D_QPU_A_MOV,
++        V3D_QPU_A_VPACK,
++        V3D_QPU_A_V8PACK,
++        V3D_QPU_A_V10PACK,
++        V3D_QPU_A_V11FPACK,
+ };
+ 
+ enum v3d_qpu_mul_op {
+@@ -244,6 +248,14 @@ enum v3d_qpu_mul_op {
+         V3D_QPU_M_MOV,
+         V3D_QPU_M_NOP,
+         V3D_QPU_M_FMUL,
++
++        /* V3D 7.x */
++        V3D_QPU_M_FTOUNORM16,
++        V3D_QPU_M_FTOSNORM16,
++        V3D_QPU_M_VFTOUNORM8,
++        V3D_QPU_M_VFTOSNORM8,
++        V3D_QPU_M_VFTOUNORM10LO,
++        V3D_QPU_M_VFTOUNORM10HI,
+ };
+ 
+ enum v3d_qpu_output_pack {
+diff --git a/src/broadcom/qpu/qpu_pack.c b/src/broadcom/qpu/qpu_pack.c
+index 7984712d527..6cd75adac6d 100644
+--- a/src/broadcom/qpu/qpu_pack.c
++++ b/src/broadcom/qpu/qpu_pack.c
+@@ -783,6 +783,9 @@ static const struct opcode_desc add_ops_v71[] = {
+         { 246, 246, .raddr_mask = OP_RANGE(32, 34), V3D_QPU_A_ITOF, 71 },
+         { 246, 246, .raddr_mask = OP_RANGE(36, 38), V3D_QPU_A_UTOF, 71 },
+ 
++        { 247, 247, .raddr_mask = ANYOPMASK, V3D_QPU_A_VPACK, 71 },
++        { 248, 248, .raddr_mask = ANYOPMASK, V3D_QPU_A_V8PACK, 71 },
++
+         { 249, 249, .raddr_mask = OP_RANGE(0, 2),   V3D_QPU_A_FMOV, 71 },
+         { 249, 249, .raddr_mask = OP_RANGE(4, 6),   V3D_QPU_A_FMOV, 71 },
+         { 249, 249, .raddr_mask = OP_RANGE(8, 10),  V3D_QPU_A_FMOV, 71 },
+@@ -797,6 +800,8 @@ static const struct opcode_desc add_ops_v71[] = {
+         { 249, 249, .raddr_mask = OP_MASK(15), V3D_QPU_A_MOV, 71 },
+         { 249, 249, .raddr_mask = OP_MASK(19), V3D_QPU_A_MOV, 71 },
+ 
++        { 250, 250, .raddr_mask = ANYOPMASK, V3D_QPU_A_V10PACK, 71 },
++        { 251, 251, .raddr_mask = ANYOPMASK, V3D_QPU_A_V11FPACK, 71 },
+ };
+ 
+ static const struct opcode_desc mul_ops_v71[] = {
+@@ -822,6 +827,13 @@ static const struct opcode_desc mul_ops_v71[] = {
+         { 14, 14, .raddr_mask = OP_MASK(15), V3D_QPU_M_MOV, 71 },
+         { 14, 14, .raddr_mask = OP_MASK(19), V3D_QPU_M_MOV, 71 },
+ 
++        { 14, 14, .raddr_mask = OP_MASK(32), V3D_QPU_M_FTOUNORM16, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(33), V3D_QPU_M_FTOSNORM16, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(34), V3D_QPU_M_VFTOUNORM8, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(35), V3D_QPU_M_VFTOSNORM8, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(48), V3D_QPU_M_VFTOUNORM10LO, 71 },
++        { 14, 14, .raddr_mask = OP_MASK(49), V3D_QPU_M_VFTOUNORM10HI, 71 },
++
+         { 14, 14, .raddr_mask = OP_MASK(63), V3D_QPU_M_NOP, 71 },
+ 
+         { 16, 63, .raddr_mask = ANYOPMASK, V3D_QPU_M_FMUL },
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0066-nir-add-new-opcodes-to-map-new-v71-packing-conversio.patch b/projects/RPi/devices/RPi5/patches/mesa/0066-nir-add-new-opcodes-to-map-new-v71-packing-conversio.patch
new file mode 100644
index 0000000000..ee65de9a53
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0066-nir-add-new-opcodes-to-map-new-v71-packing-conversio.patch
@@ -0,0 +1,221 @@
+From 4f33de7771621e15aae3e3c60c09fd5a2f29bdac Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 30 Nov 2021 02:39:20 +0100
+Subject: [PATCH 066/142] nir: add new opcodes to map new v71
+ packing/conversion instructions
+
+Since v71, Broadcom hardware includes specific packing/conversion
+instructions, so this commit adds opcodes to be able to make use of
+them, especially for image stores:
+
+   * vftounorm8/vftosnorm8: 2x16-bit floating point to 2x8-bit
+     unorm/snorm
+   * ftounorm16/ftosnorm16: floating point to 16-bit unorm/snorm
+   * vftounorm10lo/vftounorm10hi: used to convert a floating point to
+     a r10g10b10a2 unorm
+
+   * v11fpack: packs 2 2x16 FP into R11G11B10.
+   * v10pack: pack 2 2x16 integer into R10G10B10A2
+   * v8pack: packs 2 2x16 bit integer into 4x8 bits.
+   * vpack: 2x32 bit to 2x16 integer pack
+
+The latter can easily be confused with the existing, general
+pack_32_2x16_split. Note, however, that pack_32_2x16_split receives two
+16-bit integers and packs them into a 32-bit integer, whereas the Broadcom
+opcode takes two 32-bit integers, takes the lower halfword of each, and
+packs them as 2x16 into a 32-bit integer.
+
+Interestingly broadcom also defines a similar one that packs the
+higher halfword. Not used yet.
+
+FIXME: vftounorm10lo/hi constant expression implementation is somewhat
+convoluted. It is likely that it could be implemented in a simpler
+way. But it works (passing the tests added with CTS issue #3372,
+created with this change in mind).
+---
+ src/compiler/nir/nir_constant_expressions.py | 106 +++++++++++++++++++
+ src/compiler/nir/nir_opcodes.py              |  44 ++++++++
+ 2 files changed, 150 insertions(+)
+
+diff --git a/src/compiler/nir/nir_constant_expressions.py b/src/compiler/nir/nir_constant_expressions.py
+index e6383b67737..46395d79a89 100644
+--- a/src/compiler/nir/nir_constant_expressions.py
++++ b/src/compiler/nir/nir_constant_expressions.py
+@@ -62,6 +62,8 @@ template = """\
+ #include "util/softfloat.h"
+ #include "util/bigmath.h"
+ #include "util/format/format_utils.h"
++#include "util/format_r11g11b10f.h"
++#include "util/u_math.h"
+ #include "nir_constant_expressions.h"
+ 
+ /**
+@@ -277,6 +279,110 @@ unpack_half_1x16(uint16_t u)
+    return _mesa_half_to_float(u);
+ }
+ 
++/* Broadcom v3d specific instructions */
++/**
++ * Packs parts of two 2x16-bit half-float sources into an r11g11b10f value
++ */
++static uint32_t v11fpack_v3d(const uint32_t src0,
++                             const uint32_t src1)
++{
++   float rgb[3];
++
++   rgb[0] = unpack_half_1x16((src0 & 0xffff));
++   rgb[1] = unpack_half_1x16((src0 >> 16));
++   rgb[2] = unpack_half_1x16((src1 & 0xffff));
++
++   return float3_to_r11g11b10f(rgb);
++}
++
++/**
++  * The three helpers below are basically wrappers over pack_s/unorm_1x8/1x16
++  * that receive the value as raw integer bits instead of as a float
++  */
++static uint8_t _mesa_half_to_snorm8(uint16_t val)
++{
++   float x = _mesa_half_to_float(val);
++
++   return pack_snorm_1x8(x);
++}
++
++static uint16_t _mesa_float_to_snorm16(uint32_t val)
++{
++   union fi aux;
++   aux.ui = val;
++   return pack_snorm_1x16(aux.f);
++}
++
++static uint16_t _mesa_float_to_unorm16(uint32_t val)
++{
++   union fi aux;
++   aux.ui = val;
++   return pack_unorm_1x16(aux.f);
++}
++
++/* FIXME: the implementation below of vftounorm10hi/lo is somewhat too
++ * verbose. It is likely that there would be a simpler way to implement
++ * it.
++ */
++static uint32_t float_pack16_v3d(uint32_t f32)
++{
++   float f = uif(f32);
++   return _mesa_float_to_half(f);
++}
++
++static uint32_t float_unpack16_v3d(uint32_t f16)
++{
++   float f = _mesa_half_to_float(f16);
++   return fui(f);
++}
++
++static uint32_t vfpack_v3d(uint32_t a, uint32_t b)
++{
++   return float_pack16_v3d(b) << 16 | float_pack16_v3d(a);
++}
++
++static  uint32_t vfsat_v3d(uint32_t a)
++{
++   return vfpack_v3d(
++      fui(SATURATE(_mesa_half_to_float(a & 0xffff))),
++      fui(SATURATE(_mesa_half_to_float(a >> 16))));
++}
++
++static uint32_t fmul_v3d(uint32_t a, uint32_t b)
++{
++   float f = uif(a);
++   float g = uif(b);
++
++   float x = f * g;
++
++   return fui(x);
++}
++
++#define L(x) float_unpack16_v3d((x) & 0xffff)
++#define H(x) float_unpack16_v3d((x) >> 16)
++#define V(f,a,b) vfpack_v3d(f(L(a), L(b)), f(H(a), H(b)))
++
++static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
++{
++   return V(fmul_v3d, a, b);
++}
++
++/* Convert 2x16-bit floating point to 2x10-bit unorm */
++static uint32_t vftounorm10lo(uint32_t src0)
++{
++   return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff);
++}
++
++/*
++ * Convert 2x16-bit floating point to one 2-bit and one
++ * 10-bit unorm
++ */
++static uint32_t vftounorm10hi(uint32_t src0)
++{
++   return vfmul_v3d(vfsat_v3d(src0), 0x000303ff);
++}
++
++
+ /* Some typed vector structures to make things like src0.y work */
+ typedef int8_t int1_t;
+ typedef uint8_t uint1_t;
+diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
+index e4d87aa6126..63aa7cfa315 100644
+--- a/src/compiler/nir/nir_opcodes.py
++++ b/src/compiler/nir/nir_opcodes.py
+@@ -1393,6 +1393,50 @@ for (int i = 0; i < 32; i += 8) {
+ }
+ """)
+ 
++# v3d-specific opcodes
++
++# v3d-specific (v71) instruction that packs parts of 2 2x16 floating point into
++# r11g11b10 bits, rounding to nearest even
++binop_convert("v11fpack_v3d", tuint32, tuint32, "",
++              "v11fpack_v3d(src0, src1)")
++
++# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
++# difference with pack_32_2x16_split is that the sources are 32-bit too: it
++# receives two 32-bit integers and packs the lower halfword of each as 2x16
++# into a 32-bit result.
++binop_horiz("vpack_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
++            "(src0.x & 0xffff) | (src1.x << 16)")
++
++# v3d-specific (v71) instruction that packs parts of 2 2x16 integers into r10g10b10a2
++binop_convert("v10pack_v3d", tuint32, tuint32, "",
++              "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) << 20 | ((src1 >> 16) & 0x3ff) << 30")
++
++# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
++#   dst[7:0]   = src0[7:0]
++#   dst[15:8]  = src0[23:16]
++#   dst[23:16] = src1[7:0]
++#   dst[31:24] = src1[23:16]
++opcode("v8pack_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
++       False, "",
++       "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) << 16 | (src1 & 0x00ff0000) << 8")
++
++# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit unorm/snorm
++unop("vftounorm8_v3d", tuint32,
++     "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 8) << 16)")
++unop("vftosnorm8_v3d", tuint32,
++     "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 8) << 16)")
++
++# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit unorm/snorm
++unop("ftounorm16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
++unop("ftosnorm16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")
++
++# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 bit unorm
++unop("vftounorm10lo_v3d", tuint32, "vftounorm10lo(src0)")
++
++# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 2-bit
++# and one 10 bit unorm
++unop("vftounorm10hi_v3d", tuint32, "vftounorm10hi(src0)")
++
+ # Mali-specific opcodes
+ unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
+ unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0067-broadcom-compiler-update-image-store-lowering-to-use.patch b/projects/RPi/devices/RPi5/patches/mesa/0067-broadcom-compiler-update-image-store-lowering-to-use.patch
new file mode 100644
index 0000000000..911dd462a8
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0067-broadcom-compiler-update-image-store-lowering-to-use.patch
@@ -0,0 +1,452 @@
+From 381c29e3ff5237c89380cc53eb2271d1985f4e34 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 2 Dec 2021 13:26:43 +0100
+Subject: [PATCH 067/142] broadcom/compiler: update image store lowering to use
+ v71 new packing/conversion instructions
+
+Vulkan shaderdb stats with pattern dEQP-VK.image.*.with_format.*.*:
+   total instructions in shared programs: 35993 -> 33245 (-7.63%)
+   instructions in affected programs: 21153 -> 18405 (-12.99%)
+   helped: 394
+   HURT: 1
+   Instructions are helped.
+
+   total uniforms in shared programs: 8550 -> 7418 (-13.24%)
+   uniforms in affected programs: 5136 -> 4004 (-22.04%)
+   helped: 399
+   HURT: 0
+   Uniforms are helped.
+
+   total max-temps in shared programs: 6014 -> 5905 (-1.81%)
+   max-temps in affected programs: 473 -> 364 (-23.04%)
+   helped: 58
+   HURT: 0
+   Max-temps are helped.
+
+   total nops in shared programs: 1515 -> 1504 (-0.73%)
+   nops in affected programs: 46 -> 35 (-23.91%)
+   helped: 14
+   HURT: 2
+   Inconclusive result (%-change mean confidence interval includes 0).
+
+FWIW, that one HURT on the instructions count is for just one
+instruction.
+---
+ src/broadcom/compiler/nir_to_vir.c            |  39 +++
+ src/broadcom/compiler/v3d_compiler.h          |  16 +-
+ .../compiler/v3d_nir_lower_image_load_store.c | 246 +++++++++++++++++-
+ src/broadcom/compiler/vir.c                   |   2 +-
+ 4 files changed, 294 insertions(+), 9 deletions(-)
+
+diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
+index 90fe1d1e7f0..a8cf02dd386 100644
+--- a/src/broadcom/compiler/nir_to_vir.c
++++ b/src/broadcom/compiler/nir_to_vir.c
+@@ -1689,6 +1689,22 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
+                 result = vir_VFPACK(c, src[0], src[1]);
+                 break;
+ 
++        case nir_op_vpack_v3d:
++                result = vir_VPACK(c, src[0], src[1]);
++                break;
++
++        case nir_op_v11fpack_v3d:
++                result = vir_V11FPACK(c, src[0], src[1]);
++                break;
++
++        case nir_op_v10pack_v3d:
++                result = vir_V10PACK(c, src[0], src[1]);
++                break;
++
++        case nir_op_v8pack_v3d:
++                result = vir_V8PACK(c, src[0], src[1]);
++                break;
++
+         case nir_op_unpack_half_2x16_split_x:
+                 result = vir_FMOV(c, src[0]);
+                 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
+@@ -1719,6 +1735,29 @@ ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
+                 result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero));
+                 break;
+         }
++        case nir_op_vftounorm8_v3d:
++                result = vir_VFTOUNORM8(c, src[0]);
++                break;
++
++        case nir_op_vftosnorm8_v3d:
++                result = vir_VFTOSNORM8(c, src[0]);
++                break;
++
++        case nir_op_vftounorm10lo_v3d:
++                result = vir_VFTOUNORM10LO(c, src[0]);
++                break;
++
++        case nir_op_vftounorm10hi_v3d:
++                result = vir_VFTOUNORM10HI(c, src[0]);
++                break;
++
++        case nir_op_ftounorm16_v3d:
++                result = vir_FTOUNORM16(c, src[0]);
++                break;
++
++        case nir_op_ftosnorm16_v3d:
++                result = vir_FTOSNORM16(c, src[0]);
++                break;
+ 
+         default:
+                 fprintf(stderr, "unknown NIR ALU inst: ");
+diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
+index 36adf8830b5..425ab0cdf9d 100644
+--- a/src/broadcom/compiler/v3d_compiler.h
++++ b/src/broadcom/compiler/v3d_compiler.h
+@@ -1186,7 +1186,7 @@ bool v3d_nir_lower_line_smooth(nir_shader *shader);
+ bool v3d_nir_lower_logic_ops(nir_shader *s, struct v3d_compile *c);
+ bool v3d_nir_lower_scratch(nir_shader *s);
+ bool v3d_nir_lower_txf_ms(nir_shader *s);
+-bool v3d_nir_lower_image_load_store(nir_shader *s);
++bool v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c);
+ bool v3d_nir_lower_load_store_bitsize(nir_shader *s);
+ 
+ void v3d33_vir_vpm_read_setup(struct v3d_compile *c, int num_components);
+@@ -1427,6 +1427,20 @@ VIR_SFU(LOG)
+ VIR_SFU(SIN)
+ VIR_SFU(RSQRT2)
+ 
++VIR_A_ALU2(VPACK)
++VIR_A_ALU2(V8PACK)
++VIR_A_ALU2(V10PACK)
++VIR_A_ALU2(V11FPACK)
++
++VIR_M_ALU1(FTOUNORM16)
++VIR_M_ALU1(FTOSNORM16)
++
++VIR_M_ALU1(VFTOUNORM8)
++VIR_M_ALU1(VFTOSNORM8)
++
++VIR_M_ALU1(VFTOUNORM10LO)
++VIR_M_ALU1(VFTOUNORM10HI)
++
+ static inline struct qinst *
+ vir_MOV_cond(struct v3d_compile *c, enum v3d_qpu_cond cond,
+              struct qreg dest, struct qreg src)
+diff --git a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+index 2900a29817f..bbb55be4a14 100644
+--- a/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
++++ b/src/broadcom/compiler/v3d_nir_lower_image_load_store.c
+@@ -40,6 +40,10 @@
+  * calculations and load/store using the TMU general memory access path.
+  */
+ 
++static const unsigned bits_8[4] = {8, 8, 8, 8};
++static const unsigned bits_16[4] = {16, 16, 16, 16};
++static const unsigned bits_1010102[4] = {10, 10, 10, 2};
++
+ bool
+ v3d_gl_format_is_return_32(enum pipe_format format)
+ {
+@@ -59,6 +63,8 @@ v3d_gl_format_is_return_32(enum pipe_format format)
+ 
+ /* Packs a 32-bit vector of colors in the range [0, (1 << bits[i]) - 1] to a
+  * 32-bit SSA value, with as many channels as necessary to store all the bits
++ *
++ * This is the generic helper, using all common nir operations.
+  */
+ static nir_ssa_def *
+ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
+@@ -91,8 +97,185 @@ pack_bits(nir_builder *b, nir_ssa_def *color, const unsigned *bits,
+         return nir_vec(b, results, DIV_ROUND_UP(offset, 32));
+ }
+ 
++/* Utility wrapper as half_2x16_split is mapped to vfpack, and sometimes it is
++ * just easier to read vfpack on the code, specially while using the PRM as
++ * reference
++ */
++static nir_ssa_def *
++nir_vfpack(nir_builder *b, nir_ssa_def *p1, nir_ssa_def *p2)
++{
++        return nir_pack_half_2x16_split(b, p1, p2);
++}
++
++static inline nir_ssa_def *
++pack_11f11f10f(nir_builder *b, nir_ssa_def *color)
++{
++        nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
++                                     nir_channel(b, color, 1));
++        /* FIXME: we noted that we could just use p2 again as the second
++         * element to pack, and CTS tests still works. Just using undef as is
++         * slightly more correct
++         */
++        nir_ssa_def *undef = nir_ssa_undef(b, 1, color->bit_size);
++        nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2), undef);
++
++        return nir_v11fpack_v3d(b, p1, p2);
++}
++
++static inline nir_ssa_def *
++pack_r10g10b10a2_uint(nir_builder *b, nir_ssa_def *color)
++{
++        nir_ssa_def *p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
++                                        nir_channel(b, color, 1));
++        nir_ssa_def *p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
++                                        nir_channel(b, color, 3));
++
++        return nir_v10pack_v3d(b, p1, p2);
++}
++
++static inline nir_ssa_def *
++pack_r10g10b10a2_unorm(nir_builder *b, nir_ssa_def *color)
++{
++        nir_ssa_def *p1 = nir_vfpack(b, nir_channel(b, color, 0),
++                                     nir_channel(b, color, 1));
++        p1 = nir_vftounorm10lo_v3d(b, p1);
++
++        nir_ssa_def *p2 = nir_vfpack(b, nir_channel(b, color, 2),
++                                     nir_channel(b, color, 3));
++        p2 = nir_vftounorm10hi_v3d(b, p2);
++
++        return nir_v10pack_v3d(b, p1, p2);
++}
++
++enum hw_conversion {
++        NONE,
++        TO_SNORM,
++        TO_UNORM
++};
++
++static inline nir_ssa_def *
++pack_8bit(nir_builder *b, nir_ssa_def *color,
++                        unsigned num_components,
++                        enum hw_conversion conversion)
++{
++        /* Note that usually you should not use this method (that relies on
++         * custom packing) for 1 component if we are not doing any
++         * conversion. But we support also that case, and let the caller
++         * decide which method to use.
++         */
++        nir_ssa_def *p1;
++        nir_ssa_def *p2;
++
++        if (conversion == NONE) {
++                p1 = nir_vpack_v3d(b, nir_channel(b, color, 0),
++                                   nir_channel(b, color, num_components == 1 ? 0 : 1));
++        } else {
++                p1 = nir_vfpack(b, nir_channel(b, color, 0),
++                                nir_channel(b, color, num_components == 1 ? 0 : 1));
++                p1 = (conversion == TO_UNORM) ?
++                   nir_vftounorm8_v3d(b, p1) : nir_vftosnorm8_v3d(b, p1);
++        }
++        if (num_components == 4) {
++                if (conversion == NONE) {
++                        p2 = nir_vpack_v3d(b, nir_channel(b, color, 2),
++                                           nir_channel(b, color, 3));
++                } else {
++                        p2 = nir_vfpack(b, nir_channel(b, color, 2),
++                                        nir_channel(b, color, 3));
++                        p2 = (conversion == TO_UNORM) ?
++                           nir_vftounorm8_v3d(b, p2) : nir_vftosnorm8_v3d(b, p2);
++                }
++        } else {
++                /* As mentioned on the comment before, using an undef here
++                 * would be more correct. But for this case we are getting
++                 * worse values, and in fact even some worse instruction count
++                 * with some CTS tests, so we just reuse the first packing
++                 */
++                p2 = p1;
++        }
++
++        return nir_v8pack_v3d(b, p1, p2);
++}
++
++static inline nir_ssa_def *
++pack_16bit(nir_builder *b, nir_ssa_def *color,
++                         unsigned num_components,
++                         enum hw_conversion conversion)
++{
++        nir_ssa_def *results[2];
++        nir_ssa_def *channels[4];
++
++        /* Note that usually you should not use this method (that relies on
++         * custom packing) if we are not doing any conversion. But we support
++         * also that case, and let the caller decide which method to use.
++         */
++
++        for (unsigned i = 0; i < num_components; i++) {
++                channels[i] = nir_channel(b, color, i);
++                switch (conversion) {
++                case TO_SNORM:
++                        channels[i] = nir_ftosnorm16_v3d(b, channels[i]);
++                        break;
++                case TO_UNORM:
++                        channels[i] = nir_ftounorm16_v3d(b, channels[i]);
++                        break;
++                default:
++                        break;
++                }
++        }
++
++        switch (num_components) {
++        case 1:
++                results[0] = channels[0];
++                break;
++        case 4:
++                results[1] = nir_vpack_v3d(b, channels[2], channels[3]);
++                FALLTHROUGH;
++        case 2:
++                results[0] = nir_vpack_v3d(b, channels[0], channels[1]);
++                break;
++        }
++
++        return nir_vec(b, results, DIV_ROUND_UP(num_components, 2));
++}
++
++static inline nir_ssa_def *
++pack_xbit(nir_builder *b, nir_ssa_def *color,
++          unsigned num_components,
++          const struct util_format_channel_description *r_chan)
++{
++        bool pack_mask = (r_chan->type == UTIL_FORMAT_TYPE_SIGNED);
++        enum hw_conversion conversion = NONE;
++        if (r_chan->normalized) {
++                conversion =
++                        (r_chan->type == UTIL_FORMAT_TYPE_UNSIGNED) ? TO_UNORM : TO_SNORM;
++        }
++
++        switch (r_chan->size) {
++        case 8:
++                if (conversion == NONE && num_components < 2)
++                        return pack_bits(b, color, bits_8, num_components, pack_mask);
++                else
++                        return pack_8bit(b, color, num_components, conversion);
++                break;
++        case 16:
++                /* pack_mask implies that the generic packing method would
++                 * need to include extra operations to handle negative values,
++                 * so in that case, even without a conversion, it is better to
++                 * use the packing using custom hw operations.
++                 */
++                if (conversion == NONE && !pack_mask)
++                        return pack_bits(b, color, bits_16, num_components, pack_mask);
++                else
++                        return pack_16bit(b, color, num_components, conversion);
++                break;
++        default:
++                unreachable("unrecognized bits");
++        }
++}
++
+ static bool
+-v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
++v3d_nir_lower_image_store_v42(nir_builder *b, nir_intrinsic_instr *instr)
+ {
+         enum pipe_format format = nir_intrinsic_format(instr);
+         assert(format != PIPE_FORMAT_NONE);
+@@ -118,9 +301,6 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+                  */
+                 formatted = color;
+         } else {
+-                static const unsigned bits_8[4] = {8, 8, 8, 8};
+-                static const unsigned bits_16[4] = {16, 16, 16, 16};
+-                static const unsigned bits_1010102[4] = {10, 10, 10, 2};
+                 const unsigned *bits;
+ 
+                 switch (r_chan->size) {
+@@ -171,6 +351,52 @@ v3d_nir_lower_image_store(nir_builder *b, nir_intrinsic_instr *instr)
+         return true;
+ }
+ 
++
++static bool
++v3d_nir_lower_image_store_v71(nir_builder *b, nir_intrinsic_instr *instr)
++{
++        enum pipe_format format = nir_intrinsic_format(instr);
++        assert(format != PIPE_FORMAT_NONE);
++        const struct util_format_description *desc =
++                util_format_description(format);
++        const struct util_format_channel_description *r_chan = &desc->channel[0];
++        unsigned num_components = util_format_get_nr_components(format);
++        b->cursor = nir_before_instr(&instr->instr);
++
++        nir_ssa_def *color = nir_channels(b,
++                                          nir_ssa_for_src(b, instr->src[3], 4),
++                                          (1 << num_components) - 1);
++        nir_ssa_def *formatted = NULL;
++        if (format == PIPE_FORMAT_R9G9B9E5_FLOAT) {
++                formatted = nir_format_pack_r9g9b9e5(b, color);
++        } else if (format == PIPE_FORMAT_R11G11B10_FLOAT) {
++                formatted = pack_11f11f10f(b, color);
++        } else if (format == PIPE_FORMAT_R10G10B10A2_UINT) {
++                formatted = pack_r10g10b10a2_uint(b, color);
++        } else if (format == PIPE_FORMAT_R10G10B10A2_UNORM) {
++                formatted = pack_r10g10b10a2_unorm(b, color);
++        } else if (r_chan->size == 32) {
++                /* For 32-bit formats, we just have to move the vector
++                 * across (possibly reducing the number of channels).
++                 */
++                formatted = color;
++        } else if (r_chan->type == UTIL_FORMAT_TYPE_FLOAT) {
++                assert(r_chan->size == 16);
++                formatted = nir_format_float_to_half(b, color);
++                formatted = pack_bits(b, formatted, bits_16, num_components,
++                                      false);
++        } else {
++                assert(r_chan->size == 8 || r_chan->size == 16);
++                formatted = pack_xbit(b, color, num_components, r_chan);
++        }
++
++        nir_instr_rewrite_src(&instr->instr, &instr->src[3],
++                              nir_src_for_ssa(formatted));
++        instr->num_components = formatted->num_components;
++
++        return true;
++}
++
+ static bool
+ v3d_nir_lower_image_load(nir_builder *b, nir_intrinsic_instr *instr)
+ {
+@@ -215,11 +441,17 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
+         nir_intrinsic_instr *intr =
+                 nir_instr_as_intrinsic(instr);
+ 
++        struct v3d_compile *c = (struct v3d_compile *) _state;
++
+         switch (intr->intrinsic) {
+         case nir_intrinsic_image_load:
+                 return v3d_nir_lower_image_load(b, intr);
+         case nir_intrinsic_image_store:
+-                return v3d_nir_lower_image_store(b, intr);
++                if (c->devinfo->ver >= 71)
++                        return v3d_nir_lower_image_store_v71(b, intr);
++                else
++                        return v3d_nir_lower_image_store_v42(b, intr);
++                break;
+         default:
+                 return false;
+         }
+@@ -228,9 +460,9 @@ v3d_nir_lower_image_load_store_cb(nir_builder *b,
+ }
+ 
+ bool
+-v3d_nir_lower_image_load_store(nir_shader *s)
++v3d_nir_lower_image_load_store(nir_shader *s, struct v3d_compile *c)
+ {
+         return nir_shader_instructions_pass(s, v3d_nir_lower_image_load_store_cb,
+                                             nir_metadata_block_index |
+-                                            nir_metadata_dominance, NULL);
++                                            nir_metadata_dominance, c);
+ }
+diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
+index aea113f050e..7612eed7130 100644
+--- a/src/broadcom/compiler/vir.c
++++ b/src/broadcom/compiler/vir.c
+@@ -1576,7 +1576,7 @@ v3d_attempt_compile(struct v3d_compile *c)
+ 
+         NIR_PASS(_, c->s, v3d_nir_lower_io, c);
+         NIR_PASS(_, c->s, v3d_nir_lower_txf_ms);
+-        NIR_PASS(_, c->s, v3d_nir_lower_image_load_store);
++        NIR_PASS(_, c->s, v3d_nir_lower_image_load_store, c);
+ 
+         NIR_PASS(_, c->s, nir_opt_idiv_const, 8);
+         nir_lower_idiv_options idiv_options = {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0068-broadcom-compiler-don-t-allocate-spill-base-to-rf0-i.patch b/projects/RPi/devices/RPi5/patches/mesa/0068-broadcom-compiler-don-t-allocate-spill-base-to-rf0-i.patch
new file mode 100644
index 0000000000..1fe43abf8f
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0068-broadcom-compiler-don-t-allocate-spill-base-to-rf0-i.patch
@@ -0,0 +1,68 @@
+From f6082e941a3454c8735df2ff2713ae49b3daa74f Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 18 Apr 2023 08:50:13 +0200
+Subject: [PATCH 068/142] broadcom/compiler: don't allocate spill base to rf0
+ in V3D 7.x
+
+Otherwise it can be stomped by instructions doing implicit rf0 writes.
+---
+ src/broadcom/compiler/vir_register_allocate.c | 21 +++++++++++++++----
+ 1 file changed, 17 insertions(+), 4 deletions(-)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 440b093a636..121c9b2794f 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -582,7 +582,8 @@ interferes(int32_t t0_start, int32_t t0_end, int32_t t1_start, int32_t t1_end)
+ }
+ 
+ static void
+-v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
++v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int *implicit_rf_nodes,
++              int spill_temp)
+ {
+         c->spill_start_num_temps = c->num_temps;
+         c->spilling = true;
+@@ -594,8 +595,20 @@ v3d_spill_reg(struct v3d_compile *c, int *acc_nodes, int spill_temp)
+                 spill_offset = c->spill_size;
+                 c->spill_size += V3D_CHANNELS * sizeof(uint32_t);
+ 
+-                if (spill_offset == 0)
++                if (spill_offset == 0) {
+                         v3d_setup_spill_base(c);
++
++                        /* Don't allocate our spill base to rf0 to avoid
++                         * conflicts with instructions doing implicit writes
++                         * to that register.
++                         */
++                        if (!c->devinfo->has_accumulators) {
++                                ra_add_node_interference(
++                                        c->g,
++                                        temp_to_node(c, c->spill_base.index),
++                                        implicit_rf_nodes[0]);
++                        }
++                }
+         }
+ 
+         struct qinst *last_thrsw = c->last_thrsw;
+@@ -1346,7 +1359,7 @@ v3d_register_allocate(struct v3d_compile *c)
+                         int node = v3d_choose_spill_node(c);
+                         uint32_t temp = node_to_temp(c, node);
+                         if (node != -1) {
+-                                v3d_spill_reg(c, acc_nodes, temp);
++                                v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
+                                 continue;
+                         }
+                 }
+@@ -1363,7 +1376,7 @@ v3d_register_allocate(struct v3d_compile *c)
+                 enum temp_spill_type spill_type =
+                         get_spill_type_for_temp(c, temp);
+                 if (spill_type != SPILL_TYPE_TMU || tmu_spilling_allowed(c)) {
+-                        v3d_spill_reg(c, acc_nodes, temp);
++                        v3d_spill_reg(c, acc_nodes, implicit_rf_nodes, temp);
+                         if (c->spills + c->fills > c->max_tmu_spills)
+                                 goto spill_fail;
+                 } else {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0069-broadcom-compiler-improve-allocation-for-final-progr.patch b/projects/RPi/devices/RPi5/patches/mesa/0069-broadcom-compiler-improve-allocation-for-final-progr.patch
new file mode 100644
index 0000000000..fb73352b1a
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0069-broadcom-compiler-improve-allocation-for-final-progr.patch
@@ -0,0 +1,186 @@
+From 0e9577fbb18a026390f653ca22f5a98a69a5fe59 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 2 May 2023 10:12:37 +0200
+Subject: [PATCH 069/142] broadcom/compiler: improve allocation for final
+ program instructions
+
+The last 3 instructions can't use specific registers so flag all the
+nodes for temps used in the last program instructions and try to
+avoid assigning any of these. This may help us avoid injecting nops
+for the last thread switch instruction.
+
+Because register allocation needs to happen before QPU scheduling
+and instruction merging we can't tell exactly what the last 3
+instructions will be, so we do this for a few more instructions than
+just 3.
+
+We only do this for fragment shaders because other shader stages
+always end with VPM store instructions that take a small immediate
+and therefore will never allow us to merge the final thread switch
+earlier, so limiting allocation for these shaders will never improve
+anything and might instead be detrimental.
+
+total instructions in shared programs: 11471389 -> 11464335 (-0.06%)
+instructions in affected programs: 582908 -> 575854 (-1.21%)
+helped: 4669
+HURT: 578
+Instructions are helped.
+
+total max-temps in shared programs: 2230497 -> 2230150 (-0.02%)
+max-temps in affected programs: 5662 -> 5315 (-6.13%)
+helped: 344
+HURT: 44
+Max-temps are helped.
+
+total sfu-stalls in shared programs: 18068 -> 18077 (0.05%)
+sfu-stalls in affected programs: 264 -> 273 (3.41%)
+helped: 37
+HURT: 48
+Inconclusive result (value mean confidence interval includes 0).
+
+total inst-and-stalls in shared programs: 11489457 -> 11482412 (-0.06%)
+inst-and-stalls in affected programs: 585180 -> 578135 (-1.20%)
+helped: 4659
+HURT: 588
+Inst-and-stalls are helped.
+
+total nops in shared programs: 301738 -> 298140 (-1.19%)
+nops in affected programs: 14680 -> 11082 (-24.51%)
+helped: 3252
+HURT: 108
+Nops are helped.
+---
+ src/broadcom/compiler/v3d_compiler.h          |  1 +
+ src/broadcom/compiler/vir_register_allocate.c | 69 +++++++++++++++++--
+ 2 files changed, 66 insertions(+), 4 deletions(-)
+
+diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
+index 425ab0cdf9d..2642d23b629 100644
+--- a/src/broadcom/compiler/v3d_compiler.h
++++ b/src/broadcom/compiler/v3d_compiler.h
+@@ -613,6 +613,7 @@ struct v3d_ra_node_info {
+         struct {
+                 uint32_t priority;
+                 uint8_t class_bits;
++                bool is_program_end;
+ 
+                 /* V3D 7.x */
+                 bool is_ldunif_dst;
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 121c9b2794f..495644bb557 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -385,6 +385,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+         c->nodes.info[node].class_bits = class_bits;
+         c->nodes.info[node].priority = 0;
+         c->nodes.info[node].is_ldunif_dst = false;
++        c->nodes.info[node].is_program_end = false;
+ }
+ 
+ /* The spill offset for this thread takes a bit of setup, so do it once at
+@@ -929,6 +930,17 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+                 return true;
+         }
+ 
++        /* The last 3 instructions in a shader can't use some specific registers
++         * (usually early rf registers, depends on v3d version) so try to
++         * avoid allocating these to registers used by the last instructions
++         * in the shader.
++         */
++        const uint32_t safe_rf_start = v3d_ra->devinfo->ver <= 42 ? 3 : 4;
++        if (v3d_ra->nodes->info[node].is_program_end &&
++            v3d_ra->next_phys < safe_rf_start) {
++                v3d_ra->next_phys = safe_rf_start;
++        }
++
+         for (int i = 0; i < PHYS_COUNT; i++) {
+                 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
+                 int phys = v3d_ra->phys_index + phys_off;
+@@ -1218,6 +1230,44 @@ update_graph_and_reg_classes_for_inst(struct v3d_compile *c,
+         }
+ }
+ 
++static void
++flag_program_end_nodes(struct v3d_compile *c)
++{
++        /* Only look for registers used in this many instructions */
++        uint32_t last_set_count = 6;
++
++        struct qblock *last_block = vir_exit_block(c);
++        list_for_each_entry_rev(struct qinst, inst, &last_block->instructions, link) {
++                /* Skip non-ALU instructions: only alu ops are read below. */
++                if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU)
++                        continue;
++
++                int num_src = v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
++                for (int i = 0; i < num_src; i++) {
++                        if (inst->src[i].file == QFILE_TEMP) {
++                                int node = temp_to_node(c, inst->src[i].index);
++                                c->nodes.info[node].is_program_end = true;
++                        }
++                }
++
++                num_src = v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
++                for (int i = 0; i < num_src; i++) {
++                       if (inst->src[i].file == QFILE_TEMP) {
++                                int node = temp_to_node(c, inst->src[i].index);
++                                c->nodes.info[node].is_program_end = true;
++                        }
++                }
++
++                if (inst->dst.file == QFILE_TEMP) {
++                        int node = temp_to_node(c, inst->dst.index);
++                        c->nodes.info[node].is_program_end = true;
++                }
++
++                if (--last_set_count == 0)
++                        break;
++        }
++}
++
+ /**
+  * Returns a mapping from QFILE_TEMP indices to struct qpu_regs.
+  *
+@@ -1280,17 +1330,16 @@ v3d_register_allocate(struct v3d_compile *c)
+          */
+         for (uint32_t i = 0; i < num_ra_nodes; i++) {
+                 c->nodes.info[i].is_ldunif_dst = false;
++                c->nodes.info[i].is_program_end = false;
++                c->nodes.info[i].priority = 0;
++                c->nodes.info[i].class_bits = 0;
+                 if (c->devinfo->has_accumulators && i < ACC_COUNT) {
+                         acc_nodes[i] = i;
+                         ra_set_node_reg(c->g, acc_nodes[i], ACC_INDEX + i);
+-                        c->nodes.info[i].priority = 0;
+-                        c->nodes.info[i].class_bits = 0;
+                 } else if (!c->devinfo->has_accumulators &&
+                            i < ARRAY_SIZE(implicit_rf_nodes)) {
+                         implicit_rf_nodes[i] = i;
+                         ra_set_node_reg(c->g, implicit_rf_nodes[i], phys_index + i);
+-                        c->nodes.info[i].priority = 0;
+-                        c->nodes.info[i].class_bits = 0;
+                 } else {
+                         uint32_t t = node_to_temp(c, i);
+                         c->nodes.info[i].priority =
+@@ -1327,6 +1376,18 @@ v3d_register_allocate(struct v3d_compile *c)
+                                                       last_ldvary_ip, inst);
+         }
+ 
++        /* Flag the nodes that are used in the last instructions of the program
++         * (there are some registers that cannot be used in the last 3
++         * instructions). We only do this for fragment shaders, because the idea
++         * is that by avoiding this conflict we may be able to emit the last
++         * thread switch earlier in some cases, however, in non-fragment shaders
++         * this won't happen because the last instructions are always VPM stores
++         * with a small immediate, which conflicts with other signals,
++         * preventing us from ever moving the thrsw earlier.
++         */
++        if (c->s->info.stage == MESA_SHADER_FRAGMENT)
++                flag_program_end_nodes(c);
++
+         /* Set the register classes for all our temporaries in the graph */
+         for (uint32_t i = 0; i < c->num_temps; i++) {
+                 ra_set_node_class(c->g, temp_to_node(c, i),
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0070-broadcom-compiler-don-t-assign-registers-to-unused-n.patch b/projects/RPi/devices/RPi5/patches/mesa/0070-broadcom-compiler-don-t-assign-registers-to-unused-n.patch
new file mode 100644
index 0000000000..1b29439b82
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0070-broadcom-compiler-don-t-assign-registers-to-unused-n.patch
@@ -0,0 +1,105 @@
+From 645fe451bcecbe3345a144222306d06fb39f6b9f Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 2 May 2023 10:17:47 +0200
+Subject: [PATCH 070/142] broadcom/compiler: don't assign registers to unused
+ nodes/temps
+
+In programs with a lot of unused temps, if we don't do this, we may
+end up recycling previously used rfs more often, which can be
+detrimental to instruction pairing.
+
+total instructions in shared programs: 11464335 -> 11444136 (-0.18%)
+instructions in affected programs: 8976743 -> 8956544 (-0.23%)
+helped: 33196
+HURT: 33778
+Inconclusive result
+
+total max-temps in shared programs: 2230150 -> 2229445 (-0.03%)
+max-temps in affected programs: 86413 -> 85708 (-0.82%)
+helped: 2217
+HURT: 1523
+Max-temps are helped.
+
+total sfu-stalls in shared programs: 18077 -> 17104 (-5.38%)
+sfu-stalls in affected programs: 8669 -> 7696 (-11.22%)
+helped: 2657
+HURT: 2182
+Sfu-stalls are helped.
+
+total inst-and-stalls in shared programs: 11482412 -> 11461240 (-0.18%)
+inst-and-stalls in affected programs: 8995697 -> 8974525 (-0.24%)
+helped: 33319
+HURT: 33708
+Inconclusive result
+
+total nops in shared programs: 298140 -> 296185 (-0.66%)
+nops in affected programs: 52805 -> 50850 (-3.70%)
+helped: 3797
+HURT: 2662
+Inconclusive result
+---
+ src/broadcom/compiler/v3d_compiler.h          |  1 +
+ src/broadcom/compiler/vir_register_allocate.c | 14 ++++++++++++++
+ 2 files changed, 15 insertions(+)
+
+diff --git a/src/broadcom/compiler/v3d_compiler.h b/src/broadcom/compiler/v3d_compiler.h
+index 2642d23b629..f1a807e38fd 100644
+--- a/src/broadcom/compiler/v3d_compiler.h
++++ b/src/broadcom/compiler/v3d_compiler.h
+@@ -614,6 +614,7 @@ struct v3d_ra_node_info {
+                 uint32_t priority;
+                 uint8_t class_bits;
+                 bool is_program_end;
++                bool unused;
+ 
+                 /* V3D 7.x */
+                 bool is_ldunif_dst;
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 495644bb557..0ab0474424f 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -386,6 +386,7 @@ add_node(struct v3d_compile *c, uint32_t temp, uint8_t class_bits)
+         c->nodes.info[node].priority = 0;
+         c->nodes.info[node].is_ldunif_dst = false;
+         c->nodes.info[node].is_program_end = false;
++        c->nodes.info[node].unused = false;
+ }
+ 
+ /* The spill offset for this thread takes a bit of setup, so do it once at
+@@ -918,6 +919,12 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+                  BITSET_WORD *regs,
+                  unsigned int *out)
+ {
++        /* If this node is for an unused temp, ignore. */
++        if (v3d_ra->nodes->info[node].unused) {
++                *out = 0;
++                return true;
++        }
++
+         /* In V3D 7.x, try to assign rf0 to temps used as ldunif's dst
+          * so we can avoid turning them into ldunifrf (which uses the
+          * cond field to encode the dst and would prevent merge with
+@@ -1331,6 +1338,7 @@ v3d_register_allocate(struct v3d_compile *c)
+         for (uint32_t i = 0; i < num_ra_nodes; i++) {
+                 c->nodes.info[i].is_ldunif_dst = false;
+                 c->nodes.info[i].is_program_end = false;
++                c->nodes.info[i].unused = false;
+                 c->nodes.info[i].priority = 0;
+                 c->nodes.info[i].class_bits = 0;
+                 if (c->devinfo->has_accumulators && i < ACC_COUNT) {
+@@ -1396,6 +1404,12 @@ v3d_register_allocate(struct v3d_compile *c)
+ 
+         /* Add register interferences based on liveness data */
+         for (uint32_t i = 0; i < c->num_temps; i++) {
++                /* And while we are here, let's also flag nodes for
++                 * unused temps.
++                 */
++                if (c->temp_start[i] > c->temp_end[i])
++                        c->nodes.info[temp_to_node(c, i)].unused = true;
++
+                 for (uint32_t j = i + 1; j < c->num_temps; j++) {
+                         if (interferes(c->temp_start[i], c->temp_end[i],
+                                        c->temp_start[j], c->temp_end[j])) {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0071-broadcom-compiler-only-assign-rf0-as-last-resort-in-.patch b/projects/RPi/devices/RPi5/patches/mesa/0071-broadcom-compiler-only-assign-rf0-as-last-resort-in-.patch
new file mode 100644
index 0000000000..1ff6366faa
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0071-broadcom-compiler-only-assign-rf0-as-last-resort-in-.patch
@@ -0,0 +1,83 @@
+From 851704169d59e28c5429b06d05e5ef952be893a2 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 15 May 2023 10:02:10 +0200
+Subject: [PATCH 071/142] broadcom/compiler: only assign rf0 as last resort in
+ V3D 7.x
+
+So we can use it for ldunif(a) and avoid generating ldunif(a)rf which
+can't be paired with conditional instructions.
+
+shader-db (pi5):
+
+total instructions in shared programs: 11357802 -> 11338883 (-0.17%)
+instructions in affected programs: 7117889 -> 7098970 (-0.27%)
+helped: 24264
+HURT: 17574
+Instructions are helped.
+
+total uniforms in shared programs: 3857808 -> 3857815 (<.01%)
+uniforms in affected programs: 92 -> 99 (7.61%)
+helped: 0
+HURT: 1
+
+total max-temps in shared programs: 2230904 -> 2230199 (-0.03%)
+max-temps in affected programs: 52309 -> 51604 (-1.35%)
+helped: 1219
+HURT: 725
+Max-temps are helped.
+
+total sfu-stalls in shared programs: 15021 -> 15236 (1.43%)
+sfu-stalls in affected programs: 6848 -> 7063 (3.14%)
+helped: 1866
+HURT: 1704
+Inconclusive result
+
+total inst-and-stalls in shared programs: 11372823 -> 11354119 (-0.16%)
+inst-and-stalls in affected programs: 7149177 -> 7130473 (-0.26%)
+helped: 24315
+HURT: 17561
+Inst-and-stalls are helped.
+
+total nops in shared programs: 273624 -> 273711 (0.03%)
+nops in affected programs: 31562 -> 31649 (0.28%)
+helped: 1619
+HURT: 1854
+Inconclusive result (value mean confidence interval includes 0).
+---
+ src/broadcom/compiler/vir_register_allocate.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/src/broadcom/compiler/vir_register_allocate.c b/src/broadcom/compiler/vir_register_allocate.c
+index 0ab0474424f..8eac2b75bd7 100644
+--- a/src/broadcom/compiler/vir_register_allocate.c
++++ b/src/broadcom/compiler/vir_register_allocate.c
+@@ -950,6 +950,11 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+ 
+         for (int i = 0; i < PHYS_COUNT; i++) {
+                 int phys_off = (v3d_ra->next_phys + i) % PHYS_COUNT;
++
++                /* Try to keep rf0 available for ldunif in 7.x (see above). */
++                if (v3d_ra->devinfo->ver >= 71 && phys_off == 0)
++                        continue;
++
+                 int phys = v3d_ra->phys_index + phys_off;
+ 
+                 if (BITSET_TEST(regs, phys)) {
+@@ -959,6 +964,14 @@ v3d_ra_select_rf(struct v3d_ra_select_callback_data *v3d_ra,
+                 }
+         }
+ 
++        /* If we couldn't allocate, do try to assign rf0 if it is available. */
++        if (v3d_ra->devinfo->ver >= 71 &&
++            BITSET_TEST(regs, v3d_ra->phys_index)) {
++                v3d_ra->next_phys = 1;
++                *out = v3d_ra->phys_index;
++                return true;
++        }
++
+         return false;
+ }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0072-v3dv-recover-non-conformant-warning-for-not-fully-su.patch b/projects/RPi/devices/RPi5/patches/mesa/0072-v3dv-recover-non-conformant-warning-for-not-fully-su.patch
new file mode 100644
index 0000000000..2fcd20415f
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0072-v3dv-recover-non-conformant-warning-for-not-fully-su.patch
@@ -0,0 +1,30 @@
+From 0d3fd30d67ffc0195b0783e30ab6afbbe403310a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 28 Apr 2021 14:31:38 +0200
+Subject: [PATCH 072/142] v3dv: recover non-conformant warning for not fully
+ supported hw
+
+---
+ src/broadcom/vulkan/v3dv_device.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index d5de3517670..d29ffad3531 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -1212,6 +1212,12 @@ create_physical_device(struct v3dv_instance *instance,
+ 
+    list_addtail(&device->vk.link, &instance->vk.physical_devices.list);
+ 
++   if (device->devinfo.ver != 42) {
++      fprintf(stderr, "WARNING: v3dv support for hw version %i is neither "
++              "a complete nor a conformant Vulkan implementation. Testing "
++              "use only.\n", device->devinfo.ver);
++   }
++
+    return VK_SUCCESS;
+ 
+ fail:
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0073-v3dv-meson-add-v71-hw-generation.patch b/projects/RPi/devices/RPi5/patches/mesa/0073-v3dv-meson-add-v71-hw-generation.patch
new file mode 100644
index 0000000000..8023c45736
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0073-v3dv-meson-add-v71-hw-generation.patch
@@ -0,0 +1,504 @@
+From 52b5ac62b367ae89574c8031fdcf7c1dae05c942 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 29 Jun 2021 11:59:53 +0200
+Subject: [PATCH 073/142] v3dv/meson: add v71 hw generation
+
+Starting point for v71 version inclusion.
+
+This just adds it as one of the versions to be compiled (on meson),
+updates the v3dX/v3dv_X macros, and update the code enough to get it
+compiling when building using the two versions. For any packet not
+available on v71 we just provide a generic asserted placeholder of
+generation not supported.
+
+Any real v71 support will be implemented on following commits.
+---
+ src/broadcom/vulkan/meson.build         |  6 +-
+ src/broadcom/vulkan/v3dv_private.h      |  7 +++
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c  | 75 +++++++++++++++++++++++--
+ src/broadcom/vulkan/v3dvx_image.c       | 16 +++++-
+ src/broadcom/vulkan/v3dvx_meta_common.c | 32 +++++++++++
+ src/broadcom/vulkan/v3dvx_pipeline.c    |  5 ++
+ src/broadcom/vulkan/v3dvx_queue.c       | 11 ++++
+ 7 files changed, 142 insertions(+), 10 deletions(-)
+
+diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
+index ad032d832ad..3da7364686f 100644
+--- a/src/broadcom/vulkan/meson.build
++++ b/src/broadcom/vulkan/meson.build
+@@ -27,6 +27,7 @@ v3dv_entrypoints = custom_target(
+     '--out-h', '@OUTPUT0@', '--out-c', '@OUTPUT1@', '--prefix', 'v3dv',
+     '--beta', with_vulkan_beta.to_string(),
+     '--device-prefix', 'ver42',
++    '--device-prefix', 'ver71',
+   ],
+   depend_files : vk_entrypoints_gen_depend_files,
+ )
+@@ -67,10 +68,7 @@ files_per_version = files(
+   'v3dvx_queue.c',
+ )
+ 
+-# The vulkan driver only supports version >= 42, which is the version present in
+-# Rpi4. We need to explicitly set it as we are reusing pieces from the GL v3d
+-# driver.
+-v3d_versions = ['42']
++v3d_versions = ['42', '71']
+ 
+ v3dv_flags = []
+ 
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index c6707211529..6bdf338c67b 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -2608,6 +2608,9 @@ u64_compare(const void *key1, const void *key2)
+    case 42:                                           \
+       v3d_X_thing = &v3d42_##thing;                   \
+       break;                                          \
++   case 71:                                           \
++      v3d_X_thing = &v3d71_##thing;                   \
++      break;                                          \
+    default:                                           \
+       unreachable("Unsupported hardware generation"); \
+    }                                                  \
+@@ -2626,6 +2629,10 @@ u64_compare(const void *key1, const void *key2)
+ #  define v3dX(x) v3d42_##x
+ #  include "v3dvx_private.h"
+ #  undef v3dX
++
++#  define v3dX(x) v3d71_##x
++#  include "v3dvx_private.h"
++#  undef v3dX
+ #endif
+ 
+ #ifdef ANDROID
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index f182b790d36..b958e634c82 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -56,10 +56,15 @@ v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
+    };
+    config.width_in_pixels = tiling->width;
+    config.height_in_pixels = tiling->height;
++#if V3D_VERSION == 42
+    config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
+    config.multisample_mode_4x = tiling->msaa;
+    config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+    config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
++#endif
++#if V3D_VERSION >= 71
++      unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+    uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
+    cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
+@@ -82,10 +87,15 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
+    cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+       config.width_in_pixels = tiling->width;
+       config.height_in_pixels = tiling->height;
++#if V3D_VERSION == 42
+       config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
+       config.multisample_mode_4x = tiling->msaa;
+       config.double_buffer_in_non_ms_mode = tiling->double_buffer;
+       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
++#endif
++#if V3D_VERSION >= 71
++      unreachable("HW generation 71 not supported yet.");
++#endif
+    }
+ 
+    /* There's definitely nothing in the VCD cache we want. */
+@@ -649,10 +659,15 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
+     * bit and instead we have to emit a single clear of all tile buffers.
+     */
+    if (use_global_zs_clear || use_global_rt_clear) {
++#if V3D_VERSION == 42
+       cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
+          clear.clear_z_stencil_buffer = use_global_zs_clear;
+          clear.clear_all_render_targets = use_global_rt_clear;
+       }
++#endif
++#if V3D_VERSION >= 71
++      unreachable("Hardware generation 71 not supported yet.");
++#endif
+    }
+ }
+ 
+@@ -824,7 +839,12 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+       config.number_of_render_targets = MAX2(subpass->color_count, 1);
+       config.multisample_mode_4x = tiling->msaa;
+       config.double_buffer_in_non_ms_mode = tiling->double_buffer;
++#if V3D_VERSION == 42
+       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
++#endif
++#if V3D_VERSION >= 71
++      unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+       if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
+          const struct v3dv_image_view *iview =
+@@ -920,7 +940,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+       const struct v3d_resource_slice *slice =
+          &image->planes[plane].slices[iview->vk.base_mip_level];
+ 
+-      const uint32_t *clear_color =
++      UNUSED const uint32_t *clear_color =
+          &state->attachments[attachment_idx].clear_value.color[0];
+ 
+       uint32_t clear_pad = 0;
+@@ -937,13 +957,19 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+          }
+       }
+ 
++#if V3D_VERSION == 42
+       cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
+          clear.clear_color_low_32_bits = clear_color[0];
+          clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
+          clear.render_target_number = i;
+       };
++#endif
++#if V3D_VERSION >= 71
++         unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+       if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
++#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
+             clear.clear_color_mid_low_32_bits =
+                ((clear_color[1] >> 24) | (clear_color[2] << 8));
+@@ -951,17 +977,28 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+                ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
+             clear.render_target_number = i;
+          };
++#endif
++#if V3D_VERSION >= 71
++         unreachable("HW generation 71 not supported yet.");
++#endif
++
+       }
+ 
+       if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
++#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
+             clear.uif_padded_height_in_uif_blocks = clear_pad;
+             clear.clear_color_high_16_bits = clear_color[3] >> 16;
+             clear.render_target_number = i;
+          };
++#endif
++#if V3D_VERSION >= 71
++         unreachable("HW generation 71 not supported yet.");
++#endif
+       }
+    }
+ 
++#if V3D_VERSION == 42
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
+       v3dX(cmd_buffer_render_pass_setup_render_target)
+          (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
+@@ -976,6 +1013,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+          (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
+           &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
+    }
++#endif
++#if V3D_VERSION >= 71
++   unreachable("Hardware generation 71 not supported yet.");
++#endif
+ 
+    /* Ends rendering mode config. */
+    if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
+@@ -1036,10 +1077,15 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+       }
+       if (cmd_buffer->state.tile_aligned_render_area &&
+           (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
++#if V3D_VERSION == 42
+          cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
+             clear.clear_z_stencil_buffer = !job->early_zs_clear;
+             clear.clear_all_render_targets = true;
+          }
++#endif
++#if V3D_VERSION >= 71
++         unreachable("HW generation 71 not supported yet.");
++#endif
+       }
+       cl_emit(rcl, END_OF_TILE_MARKER, end);
+    }
+@@ -1065,7 +1111,9 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
+     * now, would need to change if we allow multiple viewports
+     */
+    float *vptranslate = dynamic->viewport.translate[0];
++#if V3D_VERSION == 42
+    float *vpscale = dynamic->viewport.scale[0];
++#endif
+ 
+    struct v3dv_job *job = cmd_buffer->state.job;
+    assert(job);
+@@ -1078,10 +1126,15 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
+    v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
+    v3dv_return_if_oom(cmd_buffer, NULL);
+ 
++#if V3D_VERSION == 42
+    cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+       clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
+       clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
+    }
++#endif
++#if V3D_VERSION >= 71
++   unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+    float translate_z, scale_z;
+    v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
+@@ -1591,16 +1644,20 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
+    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+    assert(pipeline);
+ 
+-   bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
+-
+    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
+    v3dv_return_if_oom(cmd_buffer, NULL);
+ 
++#if V3D_VERSION == 42
++   bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
+    cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
+       config.early_z_enable = enable_ez;
+       config.early_z_updates_enable = config.early_z_enable &&
+          pipeline->z_updates_enable;
+    }
++#endif
++#if V3D_VERSION >= 71
++   unreachable("HW generation 71 not supported yet.");
++#endif
+ }
+ 
+ void
+@@ -2031,10 +2088,12 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
+                                 pipeline->vpm_cfg.Gv);
+    }
+ 
++#if V3D_VERSION == 42
+    struct v3dv_bo *default_attribute_values =
+       pipeline->default_attribute_values != NULL ?
+       pipeline->default_attribute_values :
+       pipeline->device->default_attribute_float;
++#endif
+ 
+    cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
+                           pipeline->shader_state_record, shader) {
+@@ -2060,8 +2119,10 @@ v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
+       shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
+       shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
+ 
++#if V3D_VERSION == 42
+       shader.address_of_default_attribute_values =
+          v3dv_cl_address(default_attribute_values, 0);
++#endif
+ 
+       shader.any_shader_reads_hardware_written_primitive_id =
+          (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
+@@ -2399,11 +2460,17 @@ v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buf
+ 
+    assert(iview->plane_count == 1);
+    *rt_bpp = iview->planes[0].internal_bpp;
+-   *rt_type = iview->planes[0].internal_type;
+    if (vk_format_is_int(iview->vk.view_format))
++#if V3D_VERSION == 42
++   *rt_type = iview->planes[0].internal_type;
++   if (vk_format_is_int(iview->vk.format))
+       *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
+    else if (vk_format_is_srgb(iview->vk.view_format))
+       *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
+    else
+       *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
++#endif
++#if V3D_VERSION >= 71
++   unreachable("HW generation 71 not supported yet.");
++#endif
+ }
+diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
+index 80a3e5bfde8..dac6ff2741f 100644
+--- a/src/broadcom/vulkan/v3dvx_image.c
++++ b/src/broadcom/vulkan/v3dvx_image.c
+@@ -76,8 +76,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
+          tex.swizzle_b = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[2]);
+          tex.swizzle_a = v3d_translate_pipe_swizzle(image_view->planes[plane].swizzle[3]);
+ 
+-         tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
+-
+          tex.texture_type = image_view->format->planes[plane].tex_type;
+ 
+          if (image->vk.image_type == VK_IMAGE_TYPE_3D) {
+@@ -110,7 +108,16 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
+ 
+          tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+ 
++#if V3D_VERSION == 42
++         tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
++#endif
++
++#if V3D_VERSION == 42
+          tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
++#endif
++#if V3D_VERSION >= 71
++      unreachable("Hardware generation 71 not supported yet.");
++#endif
+ 
+          /* At this point we don't have the job. That's the reason the first
+           * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
+@@ -166,7 +173,12 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
+ 
+       assert(buffer_view->format->plane_count == 1);
+       tex.texture_type = buffer_view->format->planes[0].tex_type;
++#if V3D_VERSION == 42
+       tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
++#endif
++#if V3D_VERSION >= 71
++      unreachable("Hardware generation 71 not supported yet.");
++#endif
+ 
+       /* At this point we don't have the job. That's the reason the first
+        * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
+diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
+index 04147b82cbd..2db07ea7427 100644
+--- a/src/broadcom/vulkan/v3dvx_meta_common.c
++++ b/src/broadcom/vulkan/v3dvx_meta_common.c
+@@ -58,7 +58,12 @@ emit_rcl_prologue(struct v3dv_job *job,
+       config.number_of_render_targets = 1;
+       config.multisample_mode_4x = tiling->msaa;
+       config.double_buffer_in_non_ms_mode = tiling->double_buffer;
++#if V3D_VERSION == 42
+       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
++#endif
++#if V3D_VERSION >= 71
++      unreachable("Hardware generation 71 not supported yet.");
++#endif
+       config.internal_depth_type = fb->internal_depth_type;
+    }
+ 
+@@ -88,14 +93,20 @@ emit_rcl_prologue(struct v3dv_job *job,
+          }
+       }
+ 
++#if V3D_VERSION == 42
+       const uint32_t *color = &clear_info->clear_value->color[0];
+       cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
+          clear.clear_color_low_32_bits = color[0];
+          clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
+          clear.render_target_number = 0;
+       };
++#endif
++#if V3D_VERSION >= 71
++   unreachable("Hardware generation 71 not supported yet.");
++#endif
+ 
+       if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
++#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
+             clear.clear_color_mid_low_32_bits =
+               ((color[1] >> 24) | (color[2] << 8));
+@@ -103,22 +114,37 @@ emit_rcl_prologue(struct v3dv_job *job,
+               ((color[2] >> 24) | ((color[3] & 0xffff) << 8));
+             clear.render_target_number = 0;
+          };
++#endif
++#if V3D_VERSION >= 71
++   unreachable("Hardware generation 71 not supported yet.");
++#endif
++
+       }
+ 
+       if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
++#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
+             clear.uif_padded_height_in_uif_blocks = clear_pad;
+             clear.clear_color_high_16_bits = color[3] >> 16;
+             clear.render_target_number = 0;
+          };
++#endif
++#if V3D_VERSION >= 71
++   unreachable("Hardware generation 71 not supported yet.");
++#endif
+       }
+    }
+ 
++#if V3D_VERSION == 42
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
+       rt.render_target_0_internal_bpp = tiling->internal_bpp;
+       rt.render_target_0_internal_type = fb->internal_type;
+       rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+    }
++#endif
++#if V3D_VERSION >= 71
++   unreachable("Hardware generation 71 not supported yet.");
++#endif
+ 
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
+       clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
+@@ -179,10 +205,16 @@ emit_frame_setup(struct v3dv_job *job,
+        */
+       if (clear_value &&
+           (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
++#if V3D_VERSION == 42
+          cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
+             clear.clear_z_stencil_buffer = true;
+             clear.clear_all_render_targets = true;
+          }
++#endif
++#if V3D_VERSION >= 71
++      unreachable("Hardware generation 71 not supported yet.");
++#endif
++
+       }
+       cl_emit(rcl, END_OF_TILE_MARKER, end);
+    }
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index 5d32d414ed8..922698b08a2 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -447,10 +447,15 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
+       /* FIXME: Use combined input/output size flag in the common case (also
+        * on v3d, see v3dx_draw).
+        */
++#if V3D_VERSION == 42
+       shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
+          prog_data_vs_bin->separate_segments;
+       shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+          prog_data_vs->separate_segments;
++#endif
++#if V3D_VERSION >= 71
++      unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+       shader.coordinate_shader_input_vpm_segment_size =
+          prog_data_vs_bin->separate_segments ?
+diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
+index efe63de425c..1a26d04aef7 100644
+--- a/src/broadcom/vulkan/v3dvx_queue.c
++++ b/src/broadcom/vulkan/v3dvx_queue.c
+@@ -42,14 +42,25 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
+       config.image_height_pixels = 1;
+       config.number_of_render_targets = 1;
+       config.multisample_mode_4x = false;
++#if V3D_VERSION == 42
+       config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
++#endif
++#if V3D_VERSION >= 71
++      unreachable("HW generation 71 not supported yet.");
++#endif
+    }
+ 
++#if V3D_VERSION == 42
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
+       rt.render_target_0_internal_bpp = V3D_INTERNAL_BPP_32;
+       rt.render_target_0_internal_type = V3D_INTERNAL_TYPE_8;
+       rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+    }
++#endif
++#if V3D_VERSION >= 71
++   unreachable("Hardware generation 71 not supported yet.");
++#endif
++
+ 
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
+       clear.z_clear_value = 1.0f;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0074-v3dv-expose-V3D-revision-number-in-device-name.patch b/projects/RPi/devices/RPi5/patches/mesa/0074-v3dv-expose-V3D-revision-number-in-device-name.patch
new file mode 100644
index 0000000000..3b3626dda1
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0074-v3dv-expose-V3D-revision-number-in-device-name.patch
@@ -0,0 +1,29 @@
+From 7aa016bca8bb1bf449ea79505692353c0bd174b8 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 10 Nov 2021 10:06:50 +0100
+Subject: [PATCH 074/142] v3dv: expose V3D revision number in device name
+
+---
+ src/broadcom/vulkan/v3dv_device.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index d29ffad3531..3034b561480 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -1123,8 +1123,10 @@ create_physical_device(struct v3dv_instance *instance,
+    device->next_program_id = 0;
+ 
+    ASSERTED int len =
+-      asprintf(&device->name, "V3D %d.%d",
+-               device->devinfo.ver / 10, device->devinfo.ver % 10);
++      asprintf(&device->name, "V3D %d.%d.%d",
++               device->devinfo.ver / 10,
++               device->devinfo.ver % 10,
++               device->devinfo.rev);
+    assert(len != -1);
+ 
+    v3dv_physical_device_init_disk_cache(device);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0075-v3dv-device-handle-new-rpi5-device-bcm2712.patch b/projects/RPi/devices/RPi5/patches/mesa/0075-v3dv-device-handle-new-rpi5-device-bcm2712.patch
new file mode 100644
index 0000000000..249a11c141
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0075-v3dv-device-handle-new-rpi5-device-bcm2712.patch
@@ -0,0 +1,54 @@
+From fb9e95b7e1d5987fd25e914635c4e09d81ea9561 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 10 Nov 2021 07:54:35 +0100
+Subject: [PATCH 075/142] v3dv/device: handle new rpi5 device (bcm2712)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+This includes both render and primary devices.
+
+Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
+Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
+---
+ src/broadcom/vulkan/v3dv_device.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index 3034b561480..c8719d33f15 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -1287,7 +1287,8 @@ enumerate_devices(struct vk_instance *vk_instance)
+       if (devices[i]->available_nodes & 1 << DRM_NODE_RENDER) {
+          char **compat = devices[i]->deviceinfo.platform->compatible;
+          while (*compat) {
+-            if (strncmp(*compat, "brcm,2711-v3d", 13) == 0) {
++            if (strncmp(*compat, "brcm,2711-v3d", 13) == 0 ||
++                strncmp(*compat, "brcm,2712-v3d", 13) == 0) {
+                v3d_idx = i;
+                break;
+             }
+@@ -1296,8 +1297,9 @@ enumerate_devices(struct vk_instance *vk_instance)
+       } else if (devices[i]->available_nodes & 1 << DRM_NODE_PRIMARY) {
+          char **compat = devices[i]->deviceinfo.platform->compatible;
+          while (*compat) {
+-            if (strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
+-                strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0 ) {
++            if (strncmp(*compat, "brcm,bcm2712-vc6", 16) == 0 ||
++                strncmp(*compat, "brcm,bcm2711-vc5", 16) == 0 ||
++                strncmp(*compat, "brcm,bcm2835-vc4", 16) == 0) {
+                vc4_idx = i;
+                break;
+             }
+@@ -1334,6 +1336,8 @@ v3dv_physical_device_device_id(struct v3dv_physical_device *dev)
+    switch (dev->devinfo.ver) {
+    case 42:
+       return 0xBE485FD3; /* Broadcom deviceID for 2711 */
++   case 71:
++      return 0x55701C33; /* Broadcom deviceID for 2712 */
+    default:
+       unreachable("Unsupported V3D version");
+    }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0076-v3dv-cmd_buffer-emit-TILE_BINNING_MODE_CFG-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0076-v3dv-cmd_buffer-emit-TILE_BINNING_MODE_CFG-for-v71.patch
new file mode 100644
index 0000000000..70419bad10
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0076-v3dv-cmd_buffer-emit-TILE_BINNING_MODE_CFG-for-v71.patch
@@ -0,0 +1,32 @@
+From c4f957af4fb0e10abf0a7ffad4f7a468633b7d99 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 20 Jul 2021 14:00:44 +0200
+Subject: [PATCH 076/142] v3dv/cmd_buffer: emit TILE_BINNING_MODE_CFG for v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index b958e634c82..17b2f46850d 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -94,7 +94,14 @@ v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
+       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("HW generation 71 not supported yet.");
++      config.log2_tile_width = log2_tile_size(tiling->tile_width);
++      config.log2_tile_height = log2_tile_size(tiling->tile_height);
++      /* FIXME: ideally we would like next assert on the packet header (as is
++       * general, so also applies to GL). We would need to expand
++       * gen_pack_header for that.
++       */
++      assert(config.log2_tile_width == config.log2_tile_height ||
++             config.log2_tile_width == config.log2_tile_height + 1);
+ #endif
+    }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0077-v3dv-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0077-v3dv-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch
new file mode 100644
index 0000000000..7a6e9ec2a1
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0077-v3dv-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch
@@ -0,0 +1,53 @@
+From 1934ac07df73cb685f6550b8b0f5b4f2ead11396 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 20 Jul 2021 14:33:00 +0200
+Subject: [PATCH 077/142] v3dv: emit TILE_RENDERING_MODE_CFG_COMMON for v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c  | 9 ++++++++-
+ src/broadcom/vulkan/v3dvx_meta_common.c | 9 ++++++++-
+ 2 files changed, 16 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 17b2f46850d..7837b460051 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -850,7 +850,14 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("HW generation 71 not supported yet.");
++      config.log2_tile_width = log2_tile_size(tiling->tile_width);
++      config.log2_tile_height = log2_tile_size(tiling->tile_height);
++      /* FIXME: ideallly we would like next assert on the packet header (as is
++       * general, so also applies to GL). We would need to expand
++       * gen_pack_header for that.
++       */
++      assert(config.log2_tile_width == config.log2_tile_height ||
++             config.log2_tile_width == config.log2_tile_height + 1);
+ #endif
+ 
+       if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
+diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
+index 2db07ea7427..e4084d851fc 100644
+--- a/src/broadcom/vulkan/v3dvx_meta_common.c
++++ b/src/broadcom/vulkan/v3dvx_meta_common.c
+@@ -62,7 +62,14 @@ emit_rcl_prologue(struct v3dv_job *job,
+       config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("Hardware generation 71 not supported yet.");
++      config.log2_tile_width = log2_tile_size(tiling->tile_width);
++      config.log2_tile_height = log2_tile_size(tiling->tile_height);
++      /* FIXME: ideallly we would like next assert on the packet header (as is
++       * general, so also applies to GL). We would need to expand
++       * gen_pack_header for that.
++       */
++      assert(config.log2_tile_width == config.log2_tile_height ||
++             config.log2_tile_width == config.log2_tile_height + 1);
+ #endif
+       config.internal_depth_type = fb->internal_depth_type;
+    }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0078-v3dv-cmd_buffer-emit-TILE_RENDERING_MODE_CFG_RENDER_.patch b/projects/RPi/devices/RPi5/patches/mesa/0078-v3dv-cmd_buffer-emit-TILE_RENDERING_MODE_CFG_RENDER_.patch
new file mode 100644
index 0000000000..9c0a0a5ced
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0078-v3dv-cmd_buffer-emit-TILE_RENDERING_MODE_CFG_RENDER_.patch
@@ -0,0 +1,315 @@
+From f0f9eea3cad83ed8824c6a7686150327407a5286 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 22 Jul 2021 14:26:13 +0200
+Subject: [PATCH 078/142] v3dv/cmd_buffer: emit
+ TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1 for v71
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: Alejandro Piñeiro <apinheiro@igalia.com>
+Signed-off-by: Iago Toral Quiroga <itoral@igalia.com>
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c  | 186 +++++++++++++++++-------
+ src/broadcom/vulkan/v3dvx_meta_common.c |  12 +-
+ src/broadcom/vulkan/v3dvx_private.h     |  11 +-
+ 3 files changed, 147 insertions(+), 62 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 7837b460051..c6307890da5 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -800,6 +800,103 @@ set_rcl_early_z_config(struct v3dv_job *job,
+    }
+ }
+ 
++/* Note that for v71, render target cfg packets has just one field that
++ * combined the internal type and clamp mode. For simplicity we keep just one
++ * helper.
++ *
++ * Note: rt_type is in fact a "enum V3DX(Internal_Type)".
++ *
++ * FIXME: for v71 we are not returning all the possible combinations for
++ * render target internal type and clamp. For example for int types we are
++ * always using clamp int, and for 16f we are using clamp none or pos (that
++ * seem the equivalent for no-clamp on 4.2), but not pq or hlg. In summary
++ * right now we are just porting what we were doing on 4.2
++ */
++uint32_t
++v3dX(clamp_for_format_and_type)(uint32_t rt_type,
++                                VkFormat vk_format)
++{
++#if V3D_VERSION == 42
++   if (vk_format_is_int(vk_format))
++      return V3D_RENDER_TARGET_CLAMP_INT;
++   else if (vk_format_is_srgb(vk_format))
++      return V3D_RENDER_TARGET_CLAMP_NORM;
++   else
++      return V3D_RENDER_TARGET_CLAMP_NONE;
++#endif
++#if V3D_VERSION >= 71
++   switch (rt_type) {
++   case V3D_INTERNAL_TYPE_8I:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
++   case V3D_INTERNAL_TYPE_8UI:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
++   case V3D_INTERNAL_TYPE_8:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_8;
++   case V3D_INTERNAL_TYPE_16I:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
++   case V3D_INTERNAL_TYPE_16UI:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
++   case V3D_INTERNAL_TYPE_16F:
++      return vk_format_is_srgb(vk_format) ?
++         V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
++         V3D_RENDER_TARGET_TYPE_CLAMP_16F;
++   case V3D_INTERNAL_TYPE_32I:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
++   case V3D_INTERNAL_TYPE_32UI:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
++   case V3D_INTERNAL_TYPE_32F:
++      return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
++   default:
++      unreachable("Unknown internal render target type");
++   }
++
++   return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
++#endif
++}
++
++static void
++cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
++                                           int rt,
++                                           uint32_t *rt_bpp,
++#if V3D_VERSION == 42
++                                           uint32_t *rt_type,
++                                           uint32_t *rt_clamp)
++#else
++                                           uint32_t *rt_type_clamp)
++#endif
++{
++   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
++
++   assert(state->subpass_idx < state->pass->subpass_count);
++   const struct v3dv_subpass *subpass =
++      &state->pass->subpasses[state->subpass_idx];
++
++   if (rt >= subpass->color_count)
++      return;
++
++   struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
++   const uint32_t attachment_idx = attachment->attachment;
++   if (attachment_idx == VK_ATTACHMENT_UNUSED)
++      return;
++
++   assert(attachment_idx < state->framebuffer->attachment_count &&
++          attachment_idx < state->attachment_alloc_count);
++   struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
++   assert(vk_format_is_color(iview->vk.format));
++
++   assert(iview->plane_count == 1);
++   *rt_bpp = iview->planes[0].internal_bpp;
++#if V3D_VERSION == 42
++   *rt_type = iview->planes[0].internal_type;
++   *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
++                                               iview->vk.format);
++#endif
++#if V3D_VERSION >= 71
++   *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
++                                                    iview->vk.format);
++#endif
++}
++
+ void
+ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+ {
+@@ -939,10 +1036,20 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+     */
+    job->early_zs_clear = do_early_zs_clear;
+ 
++#if V3D_VERSION >= 71
++   uint32_t base_addr = 0;
++#endif
+    for (uint32_t i = 0; i < subpass->color_count; i++) {
+       uint32_t attachment_idx = subpass->color_attachments[i].attachment;
+-      if (attachment_idx == VK_ATTACHMENT_UNUSED)
++      if (attachment_idx == VK_ATTACHMENT_UNUSED) {
++#if V3D_VERSION >= 71
++         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++            rt.render_target_number = i;
++            rt.stride = 1; /* Unused */
++         }
++#endif
+          continue;
++      }
+ 
+       struct v3dv_image_view *iview =
+          state->attachments[attachment_idx].image_view;
+@@ -978,9 +1085,6 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+          clear.render_target_number = i;
+       };
+ #endif
+-#if V3D_VERSION >= 71
+-         unreachable("HW generation 71 not supported yet.");
+-#endif
+ 
+       if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+ #if V3D_VERSION == 42
+@@ -1010,27 +1114,44 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+          unreachable("HW generation 71 not supported yet.");
+ #endif
+       }
++
++#if V3D_VERSION >= 71
++      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++         rt.clear_color_low_bits = clear_color[0];
++         cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
++                                                    &rt.internal_type_and_clamping);
++         rt.stride =
++            v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
++                                                   v3d_internal_bpp_words(rt.internal_bpp));
++         rt.base_address = base_addr;
++         rt.render_target_number = i;
++
++         /* base_addr in multiples of 512 bits. We divide by 8 because stride
++          * is in 128-bit units, but it is packing 2 rows worth of data, so we
++          * need to divide it by 2 so it is only 1 row, and then again by 4 so
++          * it is in 512-bit units.
++          */
++         base_addr += (tiling->tile_height * rt.stride) / 8;
++      }
++#endif
+    }
+ 
+ #if V3D_VERSION == 42
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
+-      v3dX(cmd_buffer_render_pass_setup_render_target)
++      cmd_buffer_render_pass_setup_render_target
+          (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
+           &rt.render_target_0_internal_type, &rt.render_target_0_clamp);
+-      v3dX(cmd_buffer_render_pass_setup_render_target)
++      cmd_buffer_render_pass_setup_render_target
+          (cmd_buffer, 1, &rt.render_target_1_internal_bpp,
+           &rt.render_target_1_internal_type, &rt.render_target_1_clamp);
+-      v3dX(cmd_buffer_render_pass_setup_render_target)
++      cmd_buffer_render_pass_setup_render_target
+          (cmd_buffer, 2, &rt.render_target_2_internal_bpp,
+           &rt.render_target_2_internal_type, &rt.render_target_2_clamp);
+-      v3dX(cmd_buffer_render_pass_setup_render_target)
++      cmd_buffer_render_pass_setup_render_target
+          (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
+           &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
+    }
+ #endif
+-#if V3D_VERSION >= 71
+-   unreachable("Hardware generation 71 not supported yet.");
+-#endif
+ 
+    /* Ends rendering mode config. */
+    if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
+@@ -2445,46 +2566,3 @@ v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
+                                      buffer->mem_offset + offset);
+    }
+ }
+-
+-void
+-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
+-                                                 int rt,
+-                                                 uint32_t *rt_bpp,
+-                                                 uint32_t *rt_type,
+-                                                 uint32_t *rt_clamp)
+-{
+-   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
+-
+-   assert(state->subpass_idx < state->pass->subpass_count);
+-   const struct v3dv_subpass *subpass =
+-      &state->pass->subpasses[state->subpass_idx];
+-
+-   if (rt >= subpass->color_count)
+-      return;
+-
+-   struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
+-   const uint32_t attachment_idx = attachment->attachment;
+-   if (attachment_idx == VK_ATTACHMENT_UNUSED)
+-      return;
+-
+-   assert(attachment_idx < state->framebuffer->attachment_count &&
+-          attachment_idx < state->attachment_alloc_count);
+-   struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
+-   assert(vk_format_is_color(iview->vk.format));
+-
+-   assert(iview->plane_count == 1);
+-   *rt_bpp = iview->planes[0].internal_bpp;
+-   if (vk_format_is_int(iview->vk.view_format))
+-#if V3D_VERSION == 42
+-   *rt_type = iview->planes[0].internal_type;
+-   if (vk_format_is_int(iview->vk.format))
+-      *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
+-   else if (vk_format_is_srgb(iview->vk.view_format))
+-      *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
+-   else
+-      *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+-#endif
+-#if V3D_VERSION >= 71
+-   unreachable("HW generation 71 not supported yet.");
+-#endif
+-}
+diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
+index e4084d851fc..c6391bc6d83 100644
+--- a/src/broadcom/vulkan/v3dvx_meta_common.c
++++ b/src/broadcom/vulkan/v3dvx_meta_common.c
+@@ -26,6 +26,7 @@
+ 
+ #include "broadcom/common/v3d_macros.h"
+ #include "broadcom/common/v3d_tfu.h"
++#include "broadcom/common/v3d_util.h"
+ #include "broadcom/cle/v3dx_pack.h"
+ #include "broadcom/compiler/v3d_compiler.h"
+ 
+@@ -150,7 +151,16 @@ emit_rcl_prologue(struct v3dv_job *job,
+    }
+ #endif
+ #if V3D_VERSION >= 71
+-   unreachable("Hardware generation 71 not supported yet.");
++   cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++      rt.internal_bpp = tiling->internal_bpp;
++      rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
++                                                                      fb->vk_format);
++      rt.stride =
++         v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
++                                                v3d_internal_bpp_words(rt.internal_bpp));
++      rt.base_address = 0;
++      rt.render_target_number = 0;
++   }
+ #endif
+ 
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index ad8ddfa5731..a4157d11c7c 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -125,13 +125,6 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
+                          uint32_t internal_size,
+                          uint32_t *hw_color);
+ 
+-void
+-v3dX(cmd_buffer_render_pass_setup_render_target)(struct v3dv_cmd_buffer *cmd_buffer,
+-                                                 int rt,
+-                                                 uint32_t *rt_bpp,
+-                                                 uint32_t *rt_type,
+-                                                 uint32_t *rt_clamp);
+-
+ /* Used at v3dv_device */
+ 
+ void
+@@ -325,3 +318,7 @@ uint32_t v3dX(max_descriptor_bo_size)(void);
+ uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
+ 
+ uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
++
++uint32_t
++v3dX(clamp_for_format_and_type)(uint32_t rt_type,
++                                VkFormat vk_format);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0079-v3dvx-cmd_buffer-emit-CLEAR_RENDER_TARGETS-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0079-v3dvx-cmd_buffer-emit-CLEAR_RENDER_TARGETS-for-v71.patch
new file mode 100644
index 0000000000..ee9e9d2074
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0079-v3dvx-cmd_buffer-emit-CLEAR_RENDER_TARGETS-for-v71.patch
@@ -0,0 +1,25 @@
+From 7c89d8026fd550282d54933f37ffc2773869326f Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Mon, 26 Jul 2021 15:08:11 +0200
+Subject: [PATCH 079/142] v3dvx/cmd_buffer: emit CLEAR_RENDER_TARGETS for v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index c6307890da5..ae1c21ae00b 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1219,7 +1219,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+          }
+ #endif
+ #if V3D_VERSION >= 71
+-         unreachable("HW generation 71 not supported yet.");
++         cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
+ #endif
+       }
+       cl_emit(rcl, END_OF_TILE_MARKER, end);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0080-v3dv-cmd_buffer-emit-CLIPPER_XY_SCALING-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0080-v3dv-cmd_buffer-emit-CLIPPER_XY_SCALING-for-v71.patch
new file mode 100644
index 0000000000..a6507e3a17
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0080-v3dv-cmd_buffer-emit-CLIPPER_XY_SCALING-for-v71.patch
@@ -0,0 +1,38 @@
+From 2eb29b57fde2acda76e12953b3a1050f3056b39d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Sun, 19 Sep 2021 23:37:32 +0200
+Subject: [PATCH 080/142] v3dv/cmd_buffer: emit CLIPPER_XY_SCALING for v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index ae1c21ae00b..2e525a11619 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1246,9 +1246,7 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
+     * now, would need to change if we allow multiple viewports
+     */
+    float *vptranslate = dynamic->viewport.translate[0];
+-#if V3D_VERSION == 42
+    float *vpscale = dynamic->viewport.scale[0];
+-#endif
+ 
+    struct v3dv_job *job = cmd_buffer->state.job;
+    assert(job);
+@@ -1268,7 +1266,10 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
+    }
+ #endif
+ #if V3D_VERSION >= 71
+-   unreachable("HW generation 71 not supported yet.");
++   cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
++      clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
++      clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
++   }
+ #endif
+ 
+    float translate_z, scale_z;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0081-v3dv-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for.patch b/projects/RPi/devices/RPi5/patches/mesa/0081-v3dv-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for.patch
new file mode 100644
index 0000000000..cb0d7512d3
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0081-v3dv-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for.patch
@@ -0,0 +1,97 @@
+From 611bf6a7445837c7e20416ff9f11a6dad9c543d7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 14 Sep 2021 10:08:19 +0200
+Subject: [PATCH 081/142] v3dv/uniforms: update VIEWPORT_X/Y_SCALE uniforms for
+ v71
+
+As with the CLIPPER_XY_SCALING packet, this needs to be computed in
+1/64ths of a pixel, instead of 1/256ths of a pixel.
+
+As these are the usual values that we get from macros, we manually add
+a v42 and a v71 macro, and define a new helper (V3DV_X) to get the
+value for the current hw version.
+---
+ src/broadcom/vulkan/v3dv_private.h  | 17 +++++++++++++++++
+ src/broadcom/vulkan/v3dv_uniforms.c |  7 ++++---
+ src/broadcom/vulkan/v3dvx_private.h |  9 +++++++++
+ 3 files changed, 30 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index 6bdf338c67b..cd6811b19c2 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -2617,6 +2617,23 @@ u64_compare(const void *key1, const void *key2)
+    v3d_X_thing;                                       \
+ })
+ 
++/* Helper to get hw-specific macro values */
++#define V3DV_X(device, thing) ({                                \
++   __typeof(V3D42_##thing) V3D_X_THING;                         \
++   switch (device->devinfo.ver) {                               \
++   case 42:                                                     \
++      V3D_X_THING = V3D42_##thing;                              \
++      break;                                                    \
++   case 71:                                                     \
++      V3D_X_THING = V3D71_##thing;                              \
++      break;                                                    \
++   default:                                                     \
++      unreachable("Unsupported hardware generation");           \
++   }                                                            \
++   V3D_X_THING;                                                 \
++})
++
++
+ 
+ /* v3d_macros from common requires v3dX and V3DX definitions. Below we need to
+  * define v3dX for each version supported, because when we compile code that
+diff --git a/src/broadcom/vulkan/v3dv_uniforms.c b/src/broadcom/vulkan/v3dv_uniforms.c
+index 72fa9a1b39c..0e681cc4ee2 100644
+--- a/src/broadcom/vulkan/v3dv_uniforms.c
++++ b/src/broadcom/vulkan/v3dv_uniforms.c
+@@ -497,7 +497,8 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
+    struct v3dv_cl_reloc uniform_stream = v3dv_cl_get_address(&job->indirect);
+ 
+    struct v3dv_cl_out *uniforms = cl_start(&job->indirect);
+-
++   float clipper_xy_granularity =
++      V3DV_X(cmd_buffer->device, CLIPPER_XY_GRANULARITY);
+    for (int i = 0; i < uinfo->count; i++) {
+       uint32_t data = uinfo->data[i];
+ 
+@@ -520,11 +521,11 @@ v3dv_write_uniforms_wg_offsets(struct v3dv_cmd_buffer *cmd_buffer,
+          break;
+ 
+       case QUNIFORM_VIEWPORT_X_SCALE:
+-         cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * 256.0f);
++         cl_aligned_f(&uniforms, dynamic->viewport.scale[0][0] * clipper_xy_granularity);
+          break;
+ 
+       case QUNIFORM_VIEWPORT_Y_SCALE:
+-         cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * 256.0f);
++         cl_aligned_f(&uniforms, dynamic->viewport.scale[0][1] * clipper_xy_granularity);
+          break;
+ 
+       case QUNIFORM_VIEWPORT_Z_OFFSET: {
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index a4157d11c7c..ff9ba75cf93 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -319,6 +319,15 @@ uint32_t v3dX(combined_image_sampler_texture_state_offset)(uint8_t plane);
+ 
+ uint32_t v3dX(combined_image_sampler_sampler_state_offset)(uint8_t plane);
+ 
++/* General utils */
++
++uint32_t
++v3dX(clamp_for_format_and_type)(uint32_t rt_type,
++                                VkFormat vk_format);
++
++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
++
+ uint32_t
+ v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+                                 VkFormat vk_format);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0082-v3dv-cmd_buffer-just-don-t-fill-up-early-z-fields-fo.patch b/projects/RPi/devices/RPi5/patches/mesa/0082-v3dv-cmd_buffer-just-don-t-fill-up-early-z-fields-fo.patch
new file mode 100644
index 0000000000..8a77ae3708
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0082-v3dv-cmd_buffer-just-don-t-fill-up-early-z-fields-fo.patch
@@ -0,0 +1,40 @@
+From 3819efaf2bb6fd8bd9cd45d54fb7254377b2296a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 27 Jul 2021 14:02:30 +0200
+Subject: [PATCH 082/142] v3dv/cmd_buffer: just don't fill up early-z fields
+ for CFG_BITS for v71
+
+For v71 early_z_enable/early_z_updates_enable is configured with
+packet 121.
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 9 +++------
+ 1 file changed, 3 insertions(+), 6 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 2e525a11619..fe9f7e43596 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1783,17 +1783,14 @@ v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
+    v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
+    v3dv_return_if_oom(cmd_buffer, NULL);
+ 
+-#if V3D_VERSION == 42
+-   bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
+    cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
++#if V3D_VERSION == 42
++      bool enable_ez = job_update_ez_state(job, pipeline, cmd_buffer);
+       config.early_z_enable = enable_ez;
+       config.early_z_updates_enable = config.early_z_enable &&
+          pipeline->z_updates_enable;
+-   }
+-#endif
+-#if V3D_VERSION >= 71
+-   unreachable("HW generation 71 not supported yet.");
+ #endif
++   }
+ }
+ 
+ void
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0083-v3dv-default-vertex-attribute-values-are-gen-dependa.patch b/projects/RPi/devices/RPi5/patches/mesa/0083-v3dv-default-vertex-attribute-values-are-gen-dependa.patch
new file mode 100644
index 0000000000..b37e2be950
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0083-v3dv-default-vertex-attribute-values-are-gen-dependa.patch
@@ -0,0 +1,219 @@
+From e3b1a578f45ea830d790970115b6de978d56edb8 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 28 Jul 2021 12:01:38 +0200
+Subject: [PATCH 083/142] v3dv: default vertex attribute values are gen
+ dependant
+
+Content, structure and size depend on the generation, as does whether it
+is needed at all.
+
+So let's move it to the v3dvx files.
+---
+ src/broadcom/vulkan/v3dv_device.c    |  2 +-
+ src/broadcom/vulkan/v3dv_pipeline.c  | 61 ++-------------------------
+ src/broadcom/vulkan/v3dv_private.h   |  4 --
+ src/broadcom/vulkan/v3dvx_pipeline.c | 63 ++++++++++++++++++++++++++++
+ src/broadcom/vulkan/v3dvx_private.h  |  8 ++++
+ 5 files changed, 75 insertions(+), 63 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index c8719d33f15..01e2dd7ac2d 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -2043,7 +2043,7 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice,
+    v3dv_pipeline_cache_init(&device->default_pipeline_cache, device, 0,
+                             device->instance->default_pipeline_cache_enabled);
+    device->default_attribute_float =
+-      v3dv_pipeline_create_default_attribute_values(device, NULL);
++      v3dv_X(device, create_default_attribute_values)(device, NULL);
+ 
+    device->device_address_mem_ctx = ralloc_context(NULL);
+    util_dynarray_init(&device->device_address_bo_list,
+diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
+index 22f01bdf64b..d012ff8f948 100644
+--- a/src/broadcom/vulkan/v3dv_pipeline.c
++++ b/src/broadcom/vulkan/v3dv_pipeline.c
+@@ -2802,62 +2802,6 @@ pipeline_set_ez_state(struct v3dv_pipeline *pipeline,
+    }
+ }
+ 
+-static bool
+-pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+-{
+-   for (uint8_t i = 0; i < pipeline->va_count; i++) {
+-      if (vk_format_is_int(pipeline->va[i].vk_format))
+-         return true;
+-   }
+-   return false;
+-}
+-
+-/* @pipeline can be NULL. We assume in that case that all the attributes have
+- * a float format (we only create an all-float BO once and we reuse it with
+- * all float pipelines), otherwise we look at the actual type of each
+- * attribute used with the specific pipeline passed in.
+- */
+-struct v3dv_bo *
+-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
+-                                              struct v3dv_pipeline *pipeline)
+-{
+-   uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+-   struct v3dv_bo *bo;
+-
+-   bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
+-
+-   if (!bo) {
+-      fprintf(stderr, "failed to allocate memory for the default "
+-              "attribute values\n");
+-      return NULL;
+-   }
+-
+-   bool ok = v3dv_bo_map(device, bo, size);
+-   if (!ok) {
+-      fprintf(stderr, "failed to map default attribute values buffer\n");
+-      return false;
+-   }
+-
+-   uint32_t *attrs = bo->map;
+-   uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
+-   for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
+-      attrs[i * 4 + 0] = 0;
+-      attrs[i * 4 + 1] = 0;
+-      attrs[i * 4 + 2] = 0;
+-      VkFormat attr_format =
+-         pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
+-      if (i < va_count && vk_format_is_int(attr_format)) {
+-         attrs[i * 4 + 3] = 1;
+-      } else {
+-         attrs[i * 4 + 3] = fui(1.0);
+-      }
+-   }
+-
+-   v3dv_bo_unmap(device, bo);
+-
+-   return bo;
+-}
+-
+ static void
+ pipeline_set_sample_mask(struct v3dv_pipeline *pipeline,
+                          const VkPipelineMultisampleStateCreateInfo *ms_info)
+@@ -2992,9 +2936,10 @@ pipeline_init(struct v3dv_pipeline *pipeline,
+ 
+    v3dv_X(device, pipeline_pack_compile_state)(pipeline, vi_info, vd_info);
+ 
+-   if (pipeline_has_integer_vertex_attrib(pipeline)) {
++   if (v3dv_X(device, pipeline_needs_default_attribute_values)(pipeline)) {
+       pipeline->default_attribute_values =
+-         v3dv_pipeline_create_default_attribute_values(pipeline->device, pipeline);
++         v3dv_X(pipeline->device, create_default_attribute_values)(pipeline->device, pipeline);
++
+       if (!pipeline->default_attribute_values)
+          return VK_ERROR_OUT_OF_DEVICE_MEMORY;
+    } else {
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index cd6811b19c2..a9fab24d19e 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -2500,10 +2500,6 @@ void
+ v3dv_pipeline_cache_upload_pipeline(struct v3dv_pipeline *pipeline,
+                                     struct v3dv_pipeline_cache *cache);
+ 
+-struct v3dv_bo *
+-v3dv_pipeline_create_default_attribute_values(struct v3dv_device *device,
+-                                              struct v3dv_pipeline *pipeline);
+-
+ VkResult
+ v3dv_create_compute_pipeline_from_nir(struct v3dv_device *device,
+                                       nir_shader *nir,
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index 922698b08a2..e235220cb14 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -664,3 +664,66 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
+       }
+    }
+ }
++
++static bool
++pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
++{
++   for (uint8_t i = 0; i < pipeline->va_count; i++) {
++      if (vk_format_is_int(pipeline->va[i].vk_format))
++         return true;
++   }
++   return false;
++}
++
++bool
++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
++{
++   return pipeline_has_integer_vertex_attrib(pipeline);
++}
++
++/* @pipeline can be NULL. In that case we assume the most common case. For
++ * example, for v42 we assume in that case that all the attributes have a
++ * float format (we only create an all-float BO once and we reuse it with all
++ * float pipelines), otherwise we look at the actual type of each attribute
++ * used with the specific pipeline passed in.
++ */
++struct v3dv_bo *
++v3dX(create_default_attribute_values)(struct v3dv_device *device,
++                                      struct v3dv_pipeline *pipeline)
++{
++   uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
++   struct v3dv_bo *bo;
++
++   bo = v3dv_bo_alloc(device, size, "default_vi_attributes", true);
++
++   if (!bo) {
++      fprintf(stderr, "failed to allocate memory for the default "
++              "attribute values\n");
++      return NULL;
++   }
++
++   bool ok = v3dv_bo_map(device, bo, size);
++   if (!ok) {
++      fprintf(stderr, "failed to map default attribute values buffer\n");
++      return NULL;
++   }
++
++   uint32_t *attrs = bo->map;
++   uint8_t va_count = pipeline != NULL ? pipeline->va_count : 0;
++   for (int i = 0; i < MAX_VERTEX_ATTRIBS; i++) {
++      attrs[i * 4 + 0] = 0;
++      attrs[i * 4 + 1] = 0;
++      attrs[i * 4 + 2] = 0;
++      VkFormat attr_format =
++         pipeline != NULL ? pipeline->va[i].vk_format : VK_FORMAT_UNDEFINED;
++      if (i < va_count && vk_format_is_int(attr_format)) {
++         attrs[i * 4 + 3] = 1;
++      } else {
++         attrs[i * 4 + 3] = fui(1.0);
++      }
++   }
++
++   v3dv_bo_unmap(device, bo);
++
++   return bo;
++}
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index ff9ba75cf93..036ce11b455 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -306,6 +306,14 @@ void
+ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
+                                   const VkPipelineVertexInputStateCreateInfo *vi_info,
+                                   const VkPipelineVertexInputDivisorStateCreateInfoEXT *vd_info);
++
++bool
++v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline);
++
++struct v3dv_bo *
++v3dX(create_default_attribute_values)(struct v3dv_device *device,
++                                      struct v3dv_pipeline *pipeline);
++
+ /* Used at v3dv_queue */
+ void
+ v3dX(job_emit_noop)(struct v3dv_job *job);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0084-v3dv-pipeline-default-vertex-attributes-values-are-n.patch b/projects/RPi/devices/RPi5/patches/mesa/0084-v3dv-pipeline-default-vertex-attributes-values-are-n.patch
new file mode 100644
index 0000000000..f33f20827d
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0084-v3dv-pipeline-default-vertex-attributes-values-are-n.patch
@@ -0,0 +1,87 @@
+From 8464dc8869f3d2eccfecac7b4358cc0ffe05f081 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 28 Jul 2021 12:05:26 +0200
+Subject: [PATCH 084/142] v3dv/pipeline: default vertex attributes values are
+ not needed for v71
+
+They are not part of the shader state record.
+---
+ src/broadcom/vulkan/v3dv_private.h   | 10 +++++++++-
+ src/broadcom/vulkan/v3dvx_pipeline.c | 10 ++++++++++
+ 2 files changed, 19 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index a9fab24d19e..300a1ec8ae1 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -581,6 +581,10 @@ struct v3dv_device {
+     * being float being float, allowing us to reuse the same BO for all
+     * pipelines matching this requirement. Pipelines that need integer
+     * attributes will create their own BO.
++    *
++    * Note that since v71 the default attribute values are not needed, so this
++    * can be NULL.
++    *
+     */
+    struct v3dv_bo *default_attribute_float;
+ 
+@@ -2289,11 +2293,15 @@ struct v3dv_pipeline {
+    unsigned char sha1[20];
+ 
+    /* In general we can reuse v3dv_device->default_attribute_float, so note
+-    * that the following can be NULL.
++    * that the following can be NULL. In 7.x this is not used, so it will be
++    * NULL.
+     *
+     * FIXME: the content of this BO will be small, so it could be improved to
+     * be uploaded to a common BO. But as in most cases it will be NULL, it is
+     * not a priority.
++    *
++    * Note that since v71 the default attribute values are not needed, so this
++    * can be NULL.
+     */
+    struct v3dv_bo *default_attribute_values;
+ 
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index e235220cb14..4dc6d70efe1 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -665,6 +665,7 @@ v3dX(pipeline_pack_compile_state)(struct v3dv_pipeline *pipeline,
+    }
+ }
+ 
++#if V3D_VERSION == 42
+ static bool
+ pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+ {
+@@ -674,11 +675,16 @@ pipeline_has_integer_vertex_attrib(struct v3dv_pipeline *pipeline)
+    }
+    return false;
+ }
++#endif
+ 
+ bool
+ v3dX(pipeline_needs_default_attribute_values)(struct v3dv_pipeline *pipeline)
+ {
++#if V3D_VERSION == 42
+    return pipeline_has_integer_vertex_attrib(pipeline);
++#endif
++
++   return false;
+ }
+ 
+ /* @pipeline can be NULL. In that case we assume the most common case. For
+@@ -691,6 +697,10 @@ struct v3dv_bo *
+ v3dX(create_default_attribute_values)(struct v3dv_device *device,
+                                       struct v3dv_pipeline *pipeline)
+ {
++#if V3D_VERSION >= 71
++   return NULL;
++#endif
++
+    uint32_t size = MAX_VERTEX_ATTRIBS * sizeof(float) * 4;
+    struct v3dv_bo *bo;
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0085-v3dv-pipeline-handle-GL_SHADER_STATE_RECORD-changed-.patch b/projects/RPi/devices/RPi5/patches/mesa/0085-v3dv-pipeline-handle-GL_SHADER_STATE_RECORD-changed-.patch
new file mode 100644
index 0000000000..0d8acd9826
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0085-v3dv-pipeline-handle-GL_SHADER_STATE_RECORD-changed-.patch
@@ -0,0 +1,39 @@
+From 339096598660ec34be8087007dd4d66581de1c4e Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 28 Jul 2021 13:45:52 +0200
+Subject: [PATCH 085/142] v3dv/pipeline: handle GL_SHADER_STATE_RECORD changed
+ size on v71
+
+It is likely that we would need more changes, as this packet changed,
+but this is enough to get basic tests running. Any additional support
+will be handled with new commits.
+---
+ src/broadcom/vulkan/v3dvx_pipeline.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index 4dc6d70efe1..a640c1d084a 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -360,7 +360,7 @@ v3dX(pipeline_pack_state)(struct v3dv_pipeline *pipeline,
+ static void
+ pack_shader_state_record(struct v3dv_pipeline *pipeline)
+ {
+-   assert(sizeof(pipeline->shader_state_record) ==
++   assert(sizeof(pipeline->shader_state_record) >=
+           cl_packet_length(GL_SHADER_STATE_RECORD));
+ 
+    struct v3d_fs_prog_data *prog_data_fs =
+@@ -453,9 +453,6 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
+       shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+          prog_data_vs->separate_segments;
+ #endif
+-#if V3D_VERSION >= 71
+-      unreachable("HW generation 71 not supported yet.");
+-#endif
+ 
+       shader.coordinate_shader_input_vpm_segment_size =
+          prog_data_vs_bin->separate_segments ?
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0086-v3dv-setup-render-pass-color-clears-for-any-format-b.patch b/projects/RPi/devices/RPi5/patches/mesa/0086-v3dv-setup-render-pass-color-clears-for-any-format-b.patch
new file mode 100644
index 0000000000..b1d310f166
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0086-v3dv-setup-render-pass-color-clears-for-any-format-b.patch
@@ -0,0 +1,89 @@
+From 5b1342eb1e255d17619b1a7b33eaf7b31f5e50a5 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 22 Sep 2021 12:03:58 +0200
+Subject: [PATCH 086/142] v3dv: setup render pass color clears for any format
+ bpp in v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 33 ++++++++++++++++----------
+ 1 file changed, 20 insertions(+), 13 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index fe9f7e43596..1b39e230580 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1064,7 +1064,7 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+       UNUSED const uint32_t *clear_color =
+          &state->attachments[attachment_idx].clear_value.color[0];
+ 
+-      uint32_t clear_pad = 0;
++      UNUSED uint32_t clear_pad = 0;
+       if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
+           slice->tiling == V3D_TILING_UIF_XOR) {
+          int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
+@@ -1084,10 +1084,8 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+          clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
+          clear.render_target_number = i;
+       };
+-#endif
+ 
+       if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
+-#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
+             clear.clear_color_mid_low_32_bits =
+                ((clear_color[1] >> 24) | (clear_color[2] << 8));
+@@ -1095,25 +1093,16 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+                ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
+             clear.render_target_number = i;
+          };
+-#endif
+-#if V3D_VERSION >= 71
+-         unreachable("HW generation 71 not supported yet.");
+-#endif
+-
+       }
+ 
+       if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
+-#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
+             clear.uif_padded_height_in_uif_blocks = clear_pad;
+             clear.clear_color_high_16_bits = clear_color[3] >> 16;
+             clear.render_target_number = i;
+          };
+-#endif
+-#if V3D_VERSION >= 71
+-         unreachable("HW generation 71 not supported yet.");
+-#endif
+       }
++#endif
+ 
+ #if V3D_VERSION >= 71
+       cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
+@@ -1133,6 +1122,24 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+           */
+          base_addr += (tiling->tile_height * rt.stride) / 8;
+       }
++
++      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
++         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
++            rt.clear_color_mid_bits = /* 40 bits (32 + 8)  */
++               ((uint64_t) clear_color[1]) |
++               (((uint64_t) (clear_color[2] & 0xff)) << 32);
++            rt.render_target_number = i;
++         }
++      }
++
++      if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
++         cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
++            rt.clear_color_top_bits = /* 56 bits (24 + 32) */
++               (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
++               (((uint64_t) (clear_color[3])) << 24);
++            rt.render_target_number = i;
++         }
++      }
+ #endif
+    }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0087-v3dv-setup-TLB-clear-color-for-meta-operations-in-v7.patch b/projects/RPi/devices/RPi5/patches/mesa/0087-v3dv-setup-TLB-clear-color-for-meta-operations-in-v7.patch
new file mode 100644
index 0000000000..26e8475540
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0087-v3dv-setup-TLB-clear-color-for-meta-operations-in-v7.patch
@@ -0,0 +1,126 @@
+From ff5b5d4405b1d5600d7f1c4355202fd303f56700 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 22 Sep 2021 12:04:21 +0200
+Subject: [PATCH 087/142] v3dv: setup TLB clear color for meta operations in
+ v71
+
+---
+ src/broadcom/vulkan/v3dvx_meta_common.c | 46 +++++++++++++++----------
+ 1 file changed, 27 insertions(+), 19 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
+index c6391bc6d83..09ebcfa97c1 100644
+--- a/src/broadcom/vulkan/v3dvx_meta_common.c
++++ b/src/broadcom/vulkan/v3dvx_meta_common.c
+@@ -75,8 +75,9 @@ emit_rcl_prologue(struct v3dv_job *job,
+       config.internal_depth_type = fb->internal_depth_type;
+    }
+ 
++   const uint32_t *color = NULL;
+    if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
+-      uint32_t clear_pad = 0;
++      UNUSED uint32_t clear_pad = 0;
+       if (clear_info->image) {
+          const struct v3dv_image *image = clear_info->image;
+ 
+@@ -101,20 +102,16 @@ emit_rcl_prologue(struct v3dv_job *job,
+          }
+       }
+ 
++      color = &clear_info->clear_value->color[0];
++
+ #if V3D_VERSION == 42
+-      const uint32_t *color = &clear_info->clear_value->color[0];
+       cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
+          clear.clear_color_low_32_bits = color[0];
+          clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
+          clear.render_target_number = 0;
+       };
+-#endif
+-#if V3D_VERSION >= 71
+-   unreachable("Hardware generation 71 not supported yet.");
+-#endif
+ 
+       if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
+-#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
+             clear.clear_color_mid_low_32_bits =
+               ((color[1] >> 24) | (color[2] << 8));
+@@ -122,25 +119,16 @@ emit_rcl_prologue(struct v3dv_job *job,
+               ((color[2] >> 24) | ((color[3] & 0xffff) << 8));
+             clear.render_target_number = 0;
+          };
+-#endif
+-#if V3D_VERSION >= 71
+-   unreachable("Hardware generation 71 not supported yet.");
+-#endif
+-
+       }
+ 
+       if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
+-#if V3D_VERSION == 42
+          cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
+             clear.uif_padded_height_in_uif_blocks = clear_pad;
+             clear.clear_color_high_16_bits = color[3] >> 16;
+             clear.render_target_number = 0;
+          };
+-#endif
+-#if V3D_VERSION >= 71
+-   unreachable("Hardware generation 71 not supported yet.");
+-#endif
+       }
++#endif
+    }
+ 
+ #if V3D_VERSION == 42
+@@ -150,8 +138,11 @@ emit_rcl_prologue(struct v3dv_job *job,
+       rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+    }
+ #endif
++
+ #if V3D_VERSION >= 71
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++      if (color)
++         rt.clear_color_low_bits = color[0];
+       rt.internal_bpp = tiling->internal_bpp;
+       rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
+                                                                       fb->vk_format);
+@@ -161,6 +152,24 @@ emit_rcl_prologue(struct v3dv_job *job,
+       rt.base_address = 0;
+       rt.render_target_number = 0;
+    }
++
++   if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
++      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
++         rt.clear_color_mid_bits = /* 40 bits (32 + 8)  */
++            ((uint64_t) color[1]) |
++            (((uint64_t) (color[2] & 0xff)) << 32);
++         rt.render_target_number = 0;
++      }
++   }
++
++   if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) {
++      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
++         rt.clear_color_top_bits = /* 56 bits (24 + 32) */
++            (((uint64_t) (color[2] & 0xffffff00)) >> 8) |
++            (((uint64_t) (color[3])) << 24);
++         rt.render_target_number = 0;
++      }
++   }
+ #endif
+ 
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
+@@ -229,9 +238,8 @@ emit_frame_setup(struct v3dv_job *job,
+          }
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("Hardware generation 71 not supported yet.");
++         cl_emit(rcl, CLEAR_RENDER_TARGETS, clear);
+ #endif
+-
+       }
+       cl_emit(rcl, END_OF_TILE_MARKER, end);
+    }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0088-v3dv-fix-up-texture-shader-state-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0088-v3dv-fix-up-texture-shader-state-for-v71.patch
new file mode 100644
index 0000000000..2bf2de50b7
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0088-v3dv-fix-up-texture-shader-state-for-v71.patch
@@ -0,0 +1,49 @@
+From 1e9d7d69849fa646b331f7661c74ee138badc4bb Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 25 Oct 2021 01:37:12 +0200
+Subject: [PATCH 088/142] v3dv: fix up texture shader state for v71
+
+There are some new fields for YCbCr with pointers for the various
+planes in multi-planar formats. These need to match the base address
+pointer in the texture state, or the hardware will assume this is a
+multi-planar texture.
+---
+ src/broadcom/vulkan/v3dvx_image.c | 16 ++++++++++++++++
+ 1 file changed, 16 insertions(+)
+
+diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
+index dac6ff2741f..848290c2a47 100644
+--- a/src/broadcom/vulkan/v3dvx_image.c
++++ b/src/broadcom/vulkan/v3dvx_image.c
+@@ -129,6 +129,14 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
+             v3dv_layer_offset(image, 0, image_view->vk.base_array_layer,
+                               iplane);
+          tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
++
++#if V3D_VERSION >= 71
++         tex.chroma_offset_x = 1;
++         tex.chroma_offset_y = 1;
++         /* See comment in XML field definition for rationale of the shifts */
++         tex.texture_base_pointer_cb = base_offset >> 6;
++         tex.texture_base_pointer_cr = base_offset >> 6;
++#endif
+       }
+    }
+ }
+@@ -191,5 +199,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
+          buffer_view->offset;
+ 
+       tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
++
++#if V3D_VERSION >= 71
++      tex.chroma_offset_x = 1;
++      tex.chroma_offset_y = 1;
++      /* See comment in XML field definition for rationale of the shifts */
++      tex.texture_base_pointer_cb = base_offset >> 6;
++      tex.texture_base_pointer_cr = base_offset >> 6;
++#endif
+    }
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0089-v3dv-handle-new-texture-state-transfer-functions-in-.patch b/projects/RPi/devices/RPi5/patches/mesa/0089-v3dv-handle-new-texture-state-transfer-functions-in-.patch
new file mode 100644
index 0000000000..7647e30707
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0089-v3dv-handle-new-texture-state-transfer-functions-in-.patch
@@ -0,0 +1,52 @@
+From 1f150a3a92741f7654a13626bd5b27b5575f2b76 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Mon, 25 Oct 2021 01:38:31 +0200
+Subject: [PATCH 089/142] v3dv: handle new texture state transfer functions in
+ v71
+
+---
+ src/broadcom/vulkan/v3dvx_image.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
+index 848290c2a47..437d4588c7e 100644
+--- a/src/broadcom/vulkan/v3dvx_image.c
++++ b/src/broadcom/vulkan/v3dvx_image.c
+@@ -108,15 +108,16 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
+ 
+          tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+ 
++         bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+ #if V3D_VERSION == 42
+          tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
+ #endif
+ 
+ #if V3D_VERSION == 42
+-         tex.srgb = vk_format_is_srgb(image_view->vk.view_format);
++         tex.srgb = is_srgb;
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("Hardware generation 71 not supported yet.");
++         tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+ #endif
+ 
+          /* At this point we don't have the job. That's the reason the first
+@@ -181,11 +182,13 @@ v3dX(pack_texture_shader_state_from_buffer_view)(struct v3dv_device *device,
+ 
+       assert(buffer_view->format->plane_count == 1);
+       tex.texture_type = buffer_view->format->planes[0].tex_type;
++
++      bool is_srgb = vk_format_is_srgb(buffer_view->vk_format);
+ #if V3D_VERSION == 42
+-      tex.srgb = vk_format_is_srgb(buffer_view->vk_format);
++      tex.srgb = is_srgb;
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("Hardware generation 71 not supported yet.");
++      tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+ #endif
+ 
+       /* At this point we don't have the job. That's the reason the first
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0090-v3dv-implement-noop-job-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0090-v3dv-implement-noop-job-for-v71.patch
new file mode 100644
index 0000000000..69401c2100
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0090-v3dv-implement-noop-job-for-v71.patch
@@ -0,0 +1,42 @@
+From 45de9f019ee92635de9a505db58439f0f4561281 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 28 Sep 2021 08:14:11 +0200
+Subject: [PATCH 090/142] v3dv: implement noop job for v71
+
+---
+ src/broadcom/vulkan/v3dvx_queue.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
+index 1a26d04aef7..f8cee36e3bf 100644
+--- a/src/broadcom/vulkan/v3dvx_queue.c
++++ b/src/broadcom/vulkan/v3dvx_queue.c
+@@ -46,7 +46,8 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
+       config.maximum_bpp_of_all_render_targets = V3D_INTERNAL_BPP_32;
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("HW generation 71 not supported yet.");
++      config.log2_tile_width = 3; /* Tile size 64 */
++      config.log2_tile_height = 3; /* Tile size 64 */
+ #endif
+    }
+ 
+@@ -58,10 +59,13 @@ v3dX(job_emit_noop)(struct v3dv_job *job)
+    }
+ #endif
+ #if V3D_VERSION >= 71
+-   unreachable("Hardware generation 71 not supported yet.");
++   cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++      rt.internal_bpp = V3D_INTERNAL_BPP_32;
++      rt.internal_type_and_clamping = V3D_RENDER_TARGET_TYPE_CLAMP_8;
++      rt.stride = 1; /* Unused RT */
++   }
+ #endif
+ 
+-
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
+       clear.z_clear_value = 1.0f;
+       clear.stencil_clear_value = 0;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0091-v3dv-handle-render-pass-global-clear-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0091-v3dv-handle-render-pass-global-clear-for-v71.patch
new file mode 100644
index 0000000000..066e45d424
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0091-v3dv-handle-render-pass-global-clear-for-v71.patch
@@ -0,0 +1,117 @@
+From 3e607bb28056bb52242be6878281efae84026813 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 28 Sep 2021 08:23:48 +0200
+Subject: [PATCH 091/142] v3dv: handle render pass global clear for v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 66 ++++++++++++++++----------
+ 1 file changed, 41 insertions(+), 25 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 1b39e230580..48b2e319e51 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -362,6 +362,11 @@ cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
+                                              iview->vk.base_array_layer + layer,
+                                              image_plane);
+ 
++   /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
++    * is broken in earlier V3D versions.
++    */
++   assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
++
+    cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
+       store.buffer_to_store = buffer;
+       store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
+@@ -484,6 +489,30 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
+       const VkImageAspectFlags aspects =
+          vk_format_aspects(ds_attachment->desc.format);
+ 
++#if V3D_VERSION <= 42
++      /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
++       * for depth/stencil.
++       *
++       * There used to be some confusion regarding the Clear Tile Buffers
++       * Z/S bit also being broken, but we confirmed with Broadcom that this
++       * is not the case, it was just that some other hardware bugs (that we
++       * need to work around, such as GFXH-1461) could cause this bit to behave
++       * incorrectly.
++       *
++       * There used to be another issue where the RTs bit in the Clear Tile
++       * Buffers packet also cleared Z/S, but Broadcom confirmed this is
++       * fixed since V3D 4.1.
++       *
++       * So if we have to emit a clear of depth or stencil we don't use
++       * the per-buffer store clear bit, even if we need to store the buffers,
++       * instead we always have to use the Clear Tile Buffers Z/S bit.
++       * If we have configured the job to do early Z/S clearing, then we
++       * don't want to emit any Clear Tile Buffers command at all here.
++       *
++       * Note that GFXH-1689 is not reproduced in the simulator, where
++       * using the clear buffer bit in depth/stencil stores works fine.
++       */
++
+       /* Only clear once on the first subpass that uses the attachment */
+       uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
+          ds_attachment->first_subpass :
+@@ -503,6 +532,17 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
+                            ds_attachment->desc.stencilLoadOp,
+                            subpass->do_stencil_clear_with_draw);
+ 
++      use_global_zs_clear = !state->job->early_zs_clear &&
++         (needs_depth_clear || needs_stencil_clear);
++#endif
++#if V3D_VERSION >= 71
++      /* The store command's clear buffer bit cannot be used for Z/S stencil:
++       * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles,
++       * so we don't want to emit redundant clears here.
++       */
++      use_global_zs_clear = false;
++#endif
++
+       /* Skip the last store if it is not required */
+       uint32_t ds_last_subpass = !pass->multiview_enabled ?
+          ds_attachment->last_subpass :
+@@ -545,30 +585,6 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
+          needs_stencil_store = subpass->resolve_stencil;
+       }
+ 
+-      /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
+-       * for depth/stencil.
+-       *
+-       * There used to be some confusion regarding the Clear Tile Buffers
+-       * Z/S bit also being broken, but we confirmed with Broadcom that this
+-       * is not the case, it was just that some other hardware bugs (that we
+-       * need to work around, such as GFXH-1461) could cause this bit to behave
+-       * incorrectly.
+-       *
+-       * There used to be another issue where the RTs bit in the Clear Tile
+-       * Buffers packet also cleared Z/S, but Broadcom confirmed this is
+-       * fixed since V3D 4.1.
+-       *
+-       * So if we have to emit a clear of depth or stencil we don't use
+-       * the per-buffer store clear bit, even if we need to store the buffers,
+-       * instead we always have to use the Clear Tile Buffers Z/S bit.
+-       * If we have configured the job to do early Z/S clearing, then we
+-       * don't want to emit any Clear Tile Buffers command at all here.
+-       *
+-       * Note that GFXH-1689 is not reproduced in the simulator, where
+-       * using the clear buffer bit in depth/stencil stores works fine.
+-       */
+-      use_global_zs_clear = !state->job->early_zs_clear &&
+-         (needs_depth_clear || needs_stencil_clear);
+       if (needs_depth_store || needs_stencil_store) {
+          const uint32_t zs_buffer =
+             v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
+@@ -673,7 +689,7 @@ cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
+       }
+ #endif
+ #if V3D_VERSION >= 71
+-      unreachable("Hardware generation 71 not supported yet.");
++      cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+ #endif
+    }
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0092-v3dv-GFX-1461-does-not-affect-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0092-v3dv-GFX-1461-does-not-affect-V3D-7.x.patch
new file mode 100644
index 0000000000..0251f31b56
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0092-v3dv-GFX-1461-does-not-affect-V3D-7.x.patch
@@ -0,0 +1,32 @@
+From 3794f6f08c559c4e442b57e992d501fb7d515b9b Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 28 Sep 2021 08:31:04 +0200
+Subject: [PATCH 092/142] v3dv: GFX-1461 does not affect V3D 7.x
+
+---
+ src/broadcom/vulkan/v3dv_pass.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
+index 20f5014268d..3e82c15df88 100644
+--- a/src/broadcom/vulkan/v3dv_pass.c
++++ b/src/broadcom/vulkan/v3dv_pass.c
+@@ -236,11 +236,13 @@ v3dv_CreateRenderPass2(VkDevice _device,
+ 
+          /* GFXH-1461: if depth is cleared but stencil is loaded (or vice versa),
+           * the clear might get lost. If a subpass has this then we can't emit
+-          * the clear using the TLB and we have to do it as a draw call.
++          * the clear using the TLB and we have to do it as a draw call. This
++          * issue is fixed since V3D 4.3.18.
+           *
+           * FIXME: separate stencil.
+           */
+-         if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
++         if (device->devinfo.ver == 42 &&
++             subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
+             struct v3dv_render_pass_attachment *att =
+                &pass->attachments[subpass->ds_attachment.attachment];
+             if (att->desc.format == VK_FORMAT_D24_UNORM_S8_UINT) {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0093-v3dv-update-thread-end-restrictions-validation-for-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0093-v3dv-update-thread-end-restrictions-validation-for-v.patch
new file mode 100644
index 0000000000..2b9aa1538c
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0093-v3dv-update-thread-end-restrictions-validation-for-v.patch
@@ -0,0 +1,69 @@
+From 5be7f484210103e40b77fa3135042da4a8406659 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 28 Sep 2021 08:59:08 +0200
+Subject: [PATCH 093/142] v3dv: update thread end restrictions validation for
+ v71
+
+---
+ src/broadcom/compiler/qpu_validate.c | 37 +++++++++++++++++++++++++---
+ 1 file changed, 34 insertions(+), 3 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_validate.c b/src/broadcom/compiler/qpu_validate.c
+index 1082fb7d50a..0466ee5d0b6 100644
+--- a/src/broadcom/compiler/qpu_validate.c
++++ b/src/broadcom/compiler/qpu_validate.c
+@@ -316,17 +316,48 @@ qpu_validate_inst(struct v3d_qpu_validate_state *state, struct qinst *qinst)
+             inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                 if ((inst->alu.add.op != V3D_QPU_A_NOP &&
+                      !inst->alu.add.magic_write)) {
+-                        fail_instr(state, "RF write after THREND");
++                        if (devinfo->ver <= 42) {
++                                fail_instr(state, "RF write after THREND");
++                        } else if (devinfo->ver >= 71) {
++                                if (state->last_thrsw_ip - state->ip == 0) {
++                                        fail_instr(state,
++                                                   "ADD RF write at THREND");
++                                }
++                                if (inst->alu.add.waddr == 2 ||
++                                    inst->alu.add.waddr == 3) {
++                                        fail_instr(state,
++                                                   "RF2-3 write after THREND");
++                                }
++                        }
+                 }
+ 
+                 if ((inst->alu.mul.op != V3D_QPU_M_NOP &&
+                      !inst->alu.mul.magic_write)) {
+-                        fail_instr(state, "RF write after THREND");
++                        if (devinfo->ver <= 42) {
++                                fail_instr(state, "RF write after THREND");
++                        } else if (devinfo->ver >= 71) {
++                                if (state->last_thrsw_ip - state->ip == 0) {
++                                        fail_instr(state,
++                                                   "MUL RF write at THREND");
++                                }
++
++                                if (inst->alu.mul.waddr == 2 ||
++                                    inst->alu.mul.waddr == 3) {
++                                        fail_instr(state,
++                                                   "RF2-3 write after THREND");
++                                }
++                        }
+                 }
+ 
+                 if (v3d_qpu_sig_writes_address(devinfo, &inst->sig) &&
+                     !inst->sig_magic) {
+-                        fail_instr(state, "RF write after THREND");
++                        if (devinfo->ver <= 42) {
++                                fail_instr(state, "RF write after THREND");
++                        } else if (devinfo->ver >= 71 &&
++                                   (inst->sig_addr == 2 ||
++                                    inst->sig_addr == 3)) {
++                                fail_instr(state, "RF2-3 write after THREND");
++                        }
+                 }
+ 
+                 /* GFXH-1625: No TMUWT in the last instruction */
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0094-v3dv-handle-early-Z-S-clears-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0094-v3dv-handle-early-Z-S-clears-for-v71.patch
new file mode 100644
index 0000000000..50989e8ea6
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0094-v3dv-handle-early-Z-S-clears-for-v71.patch
@@ -0,0 +1,68 @@
+From a751dff57b6d769f5b031054cc65415cc3b44c08 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 29 Sep 2021 08:22:59 +0200
+Subject: [PATCH 094/142] v3dv: handle early Z/S clears for v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 30 ++++++++++++++++++++------
+ 1 file changed, 23 insertions(+), 7 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 48b2e319e51..4580e2a4650 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -998,6 +998,10 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+           * Early-Z/S clearing is independent of Early Z/S testing, so it is
+           * possible to enable one but not the other so long as their
+           * respective requirements are met.
++          *
++          * From V3D 4.5.6, Z/S buffers are always cleared automatically
++          * between tiles, but we still want to enable early ZS clears
++          * when Z/S are not loaded or stored.
+           */
+          struct v3dv_render_pass_attachment *ds_attachment =
+             &pass->attachments[ds_attachment_idx];
+@@ -1005,21 +1009,33 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+          const VkImageAspectFlags ds_aspects =
+             vk_format_aspects(ds_attachment->desc.format);
+ 
+-         bool needs_depth_clear =
+-            check_needs_clear(state,
+-                              ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+-                              ds_attachment->first_subpass,
+-                              ds_attachment->desc.loadOp,
+-                              subpass->do_depth_clear_with_draw);
+-
+          bool needs_depth_store =
+             v3dv_cmd_buffer_check_needs_store(state,
+                                               ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
+                                               ds_attachment->last_subpass,
+                                               ds_attachment->desc.storeOp) ||
+                                               subpass->resolve_depth;
++#if V3D_VERSION <= 42
++         bool needs_depth_clear =
++            check_needs_clear(state,
++                              ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
++                              ds_attachment->first_subpass,
++                              ds_attachment->desc.loadOp,
++                              subpass->do_depth_clear_with_draw);
+ 
+          do_early_zs_clear = needs_depth_clear && !needs_depth_store;
++#endif
++#if V3D_VERSION >= 71
++         bool needs_depth_load =
++            v3dv_cmd_buffer_check_needs_load(state,
++                                             ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
++                                             ds_attachment->first_subpass,
++                                             ds_attachment->desc.loadOp,
++                                             ds_attachment->last_subpass,
++                                             ds_attachment->desc.storeOp);
++         do_early_zs_clear = !needs_depth_load && !needs_depth_store;
++#endif
++
+          if (do_early_zs_clear &&
+              vk_format_has_stencil(ds_attachment->desc.format)) {
+             bool needs_stencil_load =
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0095-v3dv-handle-RTs-with-no-color-targets-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0095-v3dv-handle-RTs-with-no-color-targets-in-v71.patch
new file mode 100644
index 0000000000..11ab68bfb4
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0095-v3dv-handle-RTs-with-no-color-targets-in-v71.patch
@@ -0,0 +1,34 @@
+From 2add46ebce4760bf8349606201324ee0e6b1f9da Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 29 Sep 2021 09:07:28 +0200
+Subject: [PATCH 095/142] v3dv: handle RTs with no color targets in v71
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 4580e2a4650..750486a6ccf 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1175,6 +1175,17 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+ #endif
+    }
+ 
++#if V3D_VERSION >= 71
++   /* If we don't have any color RTs, we still need to emit one and flag
++    * it as not used using stride = 1.
++    */
++   if (subpass->color_count == 0) {
++      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++         rt.stride = 1;
++      }
++   }
++#endif
++
+ #if V3D_VERSION == 42
+    cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
+       cmd_buffer_render_pass_setup_render_target
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0096-v3dv-no-specific-separate_segments-flag-for-V3D-7.1.patch b/projects/RPi/devices/RPi5/patches/mesa/0096-v3dv-no-specific-separate_segments-flag-for-V3D-7.1.patch
new file mode 100644
index 0000000000..10f1c52764
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0096-v3dv-no-specific-separate_segments-flag-for-V3D-7.1.patch
@@ -0,0 +1,85 @@
+From 019abbd34d2d904d6bb33f9fa4433cb53ca7899c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 1 Oct 2021 15:18:38 +0200
+Subject: [PATCH 096/142] v3dv: no specific separate_segments flag for V3D 7.1
+
+On V3D 7.1 there is not a flag on the Shader State Record to specify
+if we are using shared or separate segments. This is done by setting
+the vpm input size to 0 (so we need to ensure that the output would be
+the max needed for input/output).
+
+We were already doing the latter on the prog_data_vs, so we just need
+to use those values, instead of assigning default values.
+
+As we are here, we also add some comments on the compiler part.
+---
+ src/broadcom/compiler/qpu_schedule.c |  4 ++++
+ src/broadcom/compiler/vir.c          |  4 ++++
+ src/broadcom/vulkan/v3dvx_pipeline.c | 15 +++++++++++++--
+ 3 files changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
+index 77fb6a794e6..4f767296860 100644
+--- a/src/broadcom/compiler/qpu_schedule.c
++++ b/src/broadcom/compiler/qpu_schedule.c
+@@ -297,6 +297,10 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
+         /* If the input and output segments are shared, then all VPM reads to
+          * a location need to happen before all writes.  We handle this by
+          * serializing all VPM operations for now.
++         *
++         * FIXME: we are assuming that the segments are shared. That is
++         * correct right now as we are only using shared, but technically you
++         * can choose.
+          */
+         bool separate_vpm_segment = false;
+ 
+diff --git a/src/broadcom/compiler/vir.c b/src/broadcom/compiler/vir.c
+index 7612eed7130..dd0aa761c43 100644
+--- a/src/broadcom/compiler/vir.c
++++ b/src/broadcom/compiler/vir.c
+@@ -745,6 +745,10 @@ v3d_vs_set_prog_data(struct v3d_compile *c,
+ 
+         /* Set us up for shared input/output segments.  This is apparently
+          * necessary for our VCM setup to avoid varying corruption.
++         *
++         * FIXME: initially testing on V3D 7.1 seems to work fine when using
++         * separate segments. So we could try to reevaluate in the future, if
++         * there is any advantage of using separate segments.
+          */
+         prog_data->separate_segments = false;
+         prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index a640c1d084a..a72ca3c241b 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -452,14 +452,25 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
+          prog_data_vs_bin->separate_segments;
+       shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+          prog_data_vs->separate_segments;
+-#endif
+-
+       shader.coordinate_shader_input_vpm_segment_size =
+          prog_data_vs_bin->separate_segments ?
+          prog_data_vs_bin->vpm_input_size : 1;
+       shader.vertex_shader_input_vpm_segment_size =
+          prog_data_vs->separate_segments ?
+          prog_data_vs->vpm_input_size : 1;
++#endif
++
++      /* On V3D 7.1 there isn't a specific flag to set if we are using
++       * shared/separate segments or not. We just set the value of
++       * vpm_input_size to 0, and set output to the max needed. That should be
++       * already properly set on prog_data_vs_bin
++       */
++#if V3D_VERSION == 71
++      shader.coordinate_shader_input_vpm_segment_size =
++         prog_data_vs_bin->vpm_input_size;
++      shader.vertex_shader_input_vpm_segment_size =
++         prog_data_vs->vpm_input_size;
++#endif
+ 
+       shader.coordinate_shader_output_vpm_segment_size =
+          prog_data_vs_bin->vpm_output_size;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0097-v3dv-don-t-convert-floating-point-border-colors-in-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0097-v3dv-don-t-convert-floating-point-border-colors-in-v.patch
new file mode 100644
index 0000000000..d0018b9f0e
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0097-v3dv-don-t-convert-floating-point-border-colors-in-v.patch
@@ -0,0 +1,39 @@
+From 4f6b4f91577ec04aab907d59d836d0c17731a9d0 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 7 Oct 2021 12:43:49 +0200
+Subject: [PATCH 097/142] v3dv: don't convert floating point border colors in
+ v71
+
+The TMU does this for us now.
+---
+ src/broadcom/vulkan/v3dvx_device.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
+index e235983864c..72daefadb08 100644
+--- a/src/broadcom/vulkan/v3dvx_device.c
++++ b/src/broadcom/vulkan/v3dvx_device.c
+@@ -118,7 +118,11 @@ static union pipe_color_union encode_border_color(
+                              (1 << (desc->channel[i].size - 1)) - 1);
+    }
+ 
+-   /* convert from float to expected format */
++#if V3D_VERSION <= 42
++   /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
++    * for us. In V3D 4.x we need to manually convert floating point color
++    * values to the expected format.
++    */
+    if (vk_format_is_srgb(bc_info->format) ||
+        vk_format_is_compressed(bc_info->format)) {
+       for (int i = 0; i < 4; i++)
+@@ -170,6 +174,7 @@ static union pipe_color_union encode_border_color(
+          }
+       }
+    }
++#endif
+ 
+    return border;
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0098-v3dv-handle-Z-clipping-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0098-v3dv-handle-Z-clipping-in-v71.patch
new file mode 100644
index 0000000000..aec7084bd4
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0098-v3dv-handle-Z-clipping-in-v71.patch
@@ -0,0 +1,60 @@
+From d8083cb8f104e0f035f5b812e000a500fa52d66f Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Fri, 15 Oct 2021 13:06:31 +0200
+Subject: [PATCH 098/142] v3dv: handle Z clipping in v71
+
+Fixes the following tests:
+
+dEQP-VK.clipping.clip_volume.*
+dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_* (except deltazero)
+---
+ src/broadcom/vulkan/v3dvx_pipeline.c | 33 ++++++++++++++++++++++++++++
+ 1 file changed, 33 insertions(+)
+
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index a72ca3c241b..7b1133f8173 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -227,6 +227,39 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
+          ds_info ? ds_info->stencilTestEnable && has_ds_attachment: false;
+ 
+       pipeline->z_updates_enable = config.z_updates_enable;
++
++#if V3D_VERSION >= 71
++      /* From the Vulkan spec:
++       *
++       *    "depthClampEnable controls whether to clamp the fragment’s depth
++       *     values as described in Depth Test. If the pipeline is not created
++       *     with VkPipelineRasterizationDepthClipStateCreateInfoEXT present
++       *     then enabling depth clamp will also disable clipping primitives to
++       *     the z planes of the frustrum as described in Primitive Clipping.
++       *     Otherwise depth clipping is controlled by the state set in
++       *     VkPipelineRasterizationDepthClipStateCreateInfoEXT."
++       *
++       * Note: neither depth clamping nor VK_EXT_depth_clip_enable are actually
++       * supported in the driver yet, so in practice we are always enabling Z
++       * clipping for now.
++       */
++      bool z_clip_enable = false;
++      const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
++         ds_info ? vk_find_struct_const(ds_info->pNext,
++                                        PIPELINE_RASTERIZATION_DEPTH_CLIP_STATE_CREATE_INFO_EXT) :
++                   NULL;
++      if (clip_info)
++         z_clip_enable = clip_info->depthClipEnable;
++      else if (!(rs_info && rs_info->depthClampEnable))
++         z_clip_enable = true;
++
++      if (z_clip_enable) {
++         config.z_clipping_mode = pipeline->negative_one_to_one ?
++	    V3D_Z_CLIP_MODE_MIN_ONE_TO_ONE : V3D_Z_CLIP_MODE_ZERO_TO_ONE;
++      } else {
++         config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
++      }
++#endif
+    };
+ }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0099-broadcom-common-add-TFU-register-definitions-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0099-broadcom-common-add-TFU-register-definitions-for-v71.patch
new file mode 100644
index 0000000000..d69b668ccf
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0099-broadcom-common-add-TFU-register-definitions-for-v71.patch
@@ -0,0 +1,44 @@
+From 2925fa6dc936d9268a59d8d7d4a775e89fd3fbdb Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 17 Nov 2021 11:33:59 +0100
+Subject: [PATCH 099/142] broadcom/common: add TFU register definitions for v71
+
+---
+ src/broadcom/common/v3d_tfu.h | 23 +++++++++++++++++++++++
+ 1 file changed, 23 insertions(+)
+
+diff --git a/src/broadcom/common/v3d_tfu.h b/src/broadcom/common/v3d_tfu.h
+index 80da224ca2d..572d0074794 100644
+--- a/src/broadcom/common/v3d_tfu.h
++++ b/src/broadcom/common/v3d_tfu.h
+@@ -48,4 +48,27 @@
+ #define V3D33_TFU_ICFG_FORMAT_UIF_NO_XOR 14
+ #define V3D33_TFU_ICFG_FORMAT_UIF_XOR 15
+ 
++/* Disable level 0 write, just write following mipmaps */
++#define V3D71_TFU_IOC_DIMTW (1 << 0)
++#define V3D71_TFU_IOC_FORMAT_SHIFT              12
++#define V3D71_TFU_IOC_FORMAT_LINEARTILE          3
++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_1_COLUMN   4
++#define V3D71_TFU_IOA_FORMAT_UBLINEAR_2_COLUMN   5
++#define V3D71_TFU_IOA_FORMAT_UIF_NO_XOR          6
++#define V3D71_TFU_IOA_FORMAT_UIF_XOR             7
++
++#define V3D71_TFU_IOC_STRIDE_SHIFT              16
++#define V3D71_TFU_IOC_NUMMM_SHIFT                4
++
++#define V3D71_TFU_ICFG_OTYPE_SHIFT              16
++#define V3D71_TFU_ICFG_IFORMAT_SHIFT            23
++#define V3D71_TFU_ICFG_FORMAT_RASTER             0
++#define V3D71_TFU_ICFG_FORMAT_SAND_128           1
++#define V3D71_TFU_ICFG_FORMAT_SAND_256           2
++#define V3D71_TFU_ICFG_FORMAT_LINEARTILE        11
++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_1_COLUMN 12
++#define V3D71_TFU_ICFG_FORMAT_UBLINEAR_2_COLUMN 13
++#define V3D71_TFU_ICFG_FORMAT_UIF_NO_XOR        14
++#define V3D71_TFU_ICFG_FORMAT_UIF_XOR           15
++
+ #endif
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0100-broadcom-simulator-TFU-register-names-changed-for-v7.patch b/projects/RPi/devices/RPi5/patches/mesa/0100-broadcom-simulator-TFU-register-names-changed-for-v7.patch
new file mode 100644
index 0000000000..8f275d0f02
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0100-broadcom-simulator-TFU-register-names-changed-for-v7.patch
@@ -0,0 +1,67 @@
+From 6d10aa8a64e009d4d1f4f05885621bd2d9a72465 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 23 Sep 2021 13:09:41 +0200
+Subject: [PATCH 100/142] broadcom/simulator: TFU register names changed for
+ v71
+
+---
+ src/broadcom/simulator/v3dx_simulator.c | 39 +++++++++++++++----------
+ 1 file changed, 23 insertions(+), 16 deletions(-)
+
+diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
+index f23b0538de3..494f44a6b5d 100644
+--- a/src/broadcom/simulator/v3dx_simulator.c
++++ b/src/broadcom/simulator/v3dx_simulator.c
+@@ -182,26 +182,33 @@ v3d_flush_caches(struct v3d_hw *v3d)
+         v3d_flush_l2t(v3d);
+ }
+ 
++#if V3D_VERSION < 71
++#define TFU_REG(NAME) V3D_TFU_ ## NAME
++#else
++#define TFU_REG(NAME) V3D_IFC_ ## NAME
++#endif
++
++
+ int
+ v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
+                                  struct drm_v3d_submit_tfu *args)
+ {
+-        int last_vtct = V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET;
+-
+-        V3D_WRITE(V3D_TFU_IIA, args->iia);
+-        V3D_WRITE(V3D_TFU_IIS, args->iis);
+-        V3D_WRITE(V3D_TFU_ICA, args->ica);
+-        V3D_WRITE(V3D_TFU_IUA, args->iua);
+-        V3D_WRITE(V3D_TFU_IOA, args->ioa);
+-        V3D_WRITE(V3D_TFU_IOS, args->ios);
+-        V3D_WRITE(V3D_TFU_COEF0, args->coef[0]);
+-        V3D_WRITE(V3D_TFU_COEF1, args->coef[1]);
+-        V3D_WRITE(V3D_TFU_COEF2, args->coef[2]);
+-        V3D_WRITE(V3D_TFU_COEF3, args->coef[3]);
+-
+-        V3D_WRITE(V3D_TFU_ICFG, args->icfg);
+-
+-        while ((V3D_READ(V3D_TFU_CS) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
++        int last_vtct = V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET;
++
++        V3D_WRITE(TFU_REG(IIA), args->iia);
++        V3D_WRITE(TFU_REG(IIS), args->iis);
++        V3D_WRITE(TFU_REG(ICA), args->ica);
++        V3D_WRITE(TFU_REG(IUA), args->iua);
++        V3D_WRITE(TFU_REG(IOA), args->ioa);
++        V3D_WRITE(TFU_REG(IOS), args->ios);
++        V3D_WRITE(TFU_REG(COEF0), args->coef[0]);
++        V3D_WRITE(TFU_REG(COEF1), args->coef[1]);
++        V3D_WRITE(TFU_REG(COEF2), args->coef[2]);
++        V3D_WRITE(TFU_REG(COEF3), args->coef[3]);
++
++        V3D_WRITE(TFU_REG(ICFG), args->icfg);
++
++        while ((V3D_READ(TFU_REG(CS)) & V3D_TFU_CS_CVTCT_SET) == last_vtct) {
+                 v3d_hw_tick(v3d);
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0101-v3dv-add-support-for-TFU-jobs-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0101-v3dv-add-support-for-TFU-jobs-in-v71.patch
new file mode 100644
index 0000000000..bf9e2ccdcd
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0101-v3dv-add-support-for-TFU-jobs-in-v71.patch
@@ -0,0 +1,119 @@
+From 780f012747f2cc6e816b1955081dbeca9a0abe5c Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 23 Sep 2021 12:12:18 +0200
+Subject: [PATCH 101/142] v3dv: add support for TFU jobs in v71
+
+---
+ include/drm-uapi/v3d_drm.h              |  5 ++++
+ src/broadcom/simulator/v3dx_simulator.c |  3 ++
+ src/broadcom/vulkan/v3dvx_meta_common.c | 37 +++++++++++++++++++++++++
+ 3 files changed, 45 insertions(+)
+
+diff --git a/include/drm-uapi/v3d_drm.h b/include/drm-uapi/v3d_drm.h
+index 3dfc0af8756..1a7d7a689de 100644
+--- a/include/drm-uapi/v3d_drm.h
++++ b/include/drm-uapi/v3d_drm.h
+@@ -319,6 +319,11 @@ struct drm_v3d_submit_tfu {
+ 
+ 	/* Pointer to an array of ioctl extensions*/
+ 	__u64 extensions;
++
++	struct {
++		__u32 ioc;
++		__u32 pad;
++	} v71;
+ };
+ 
+ /* Submits a compute shader for dispatch.  This job will block on any
+diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
+index 494f44a6b5d..4ea177c9bb7 100644
+--- a/src/broadcom/simulator/v3dx_simulator.c
++++ b/src/broadcom/simulator/v3dx_simulator.c
+@@ -200,6 +200,9 @@ v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
+         V3D_WRITE(TFU_REG(ICA), args->ica);
+         V3D_WRITE(TFU_REG(IUA), args->iua);
+         V3D_WRITE(TFU_REG(IOA), args->ioa);
++#if V3D_VERSION >= 71
++        V3D_WRITE(TFU_REG(IOC), args->v71.ioc);
++#endif
+         V3D_WRITE(TFU_REG(IOS), args->ios);
+         V3D_WRITE(TFU_REG(COEF0), args->coef[0]);
+         V3D_WRITE(TFU_REG(COEF1), args->coef[1]);
+diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
+index 09ebcfa97c1..b8f3297bc94 100644
+--- a/src/broadcom/vulkan/v3dvx_meta_common.c
++++ b/src/broadcom/vulkan/v3dvx_meta_common.c
+@@ -950,6 +950,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
+ 
+    tfu.iia |= src_offset;
+ 
++#if V3D_VERSION <= 42
+    if (src_tiling == V3D_TILING_RASTER) {
+       tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT;
+    } else {
+@@ -958,12 +959,46 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
+                    V3D33_TFU_ICFG_FORMAT_SHIFT;
+    }
+    tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT;
++#endif
++#if V3D_VERSION >= 71
++   if (src_tiling == V3D_TILING_RASTER) {
++      tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
++   } else {
++      tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
++                  (src_tiling - V3D_TILING_LINEARTILE)) <<
++                   V3D71_TFU_ICFG_IFORMAT_SHIFT;
++   }
++   tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT;
++#endif
+ 
+    tfu.ioa = dst_offset;
+ 
++#if V3D_VERSION <= 42
+    tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE +
+                (dst_tiling - V3D_TILING_LINEARTILE)) <<
+                 V3D33_TFU_IOA_FORMAT_SHIFT;
++#endif
++
++#if V3D_VERSION >= 71
++   tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
++                  (dst_tiling - V3D_TILING_LINEARTILE)) <<
++                   V3D71_TFU_IOC_FORMAT_SHIFT;
++
++   switch (dst_tiling) {
++   case V3D_TILING_UIF_NO_XOR:
++   case V3D_TILING_UIF_XOR:
++      tfu.v71.ioc |=
++         (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) <<
++         V3D71_TFU_IOC_STRIDE_SHIFT;
++      break;
++   case V3D_TILING_RASTER:
++      tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) <<
++                      V3D71_TFU_IOC_STRIDE_SHIFT;
++      break;
++   default:
++      break;
++   }
++#endif
+ 
+    switch (src_tiling) {
+    case V3D_TILING_UIF_NO_XOR:
+@@ -980,6 +1015,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
+    /* The TFU can handle raster sources but always produces UIF results */
+    assert(dst_tiling != V3D_TILING_RASTER);
+ 
++#if V3D_VERSION <= 42
+    /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
+     * OPAD field for the destination (how many extra UIF blocks beyond
+     * those necessary to cover the height).
+@@ -991,6 +1027,7 @@ v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
+                       uif_block_h;
+       tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
+    }
++#endif
+ 
+    v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0102-v3dv-make-v3dv_viewport_compute_xform-depend-on-the-.patch b/projects/RPi/devices/RPi5/patches/mesa/0102-v3dv-make-v3dv_viewport_compute_xform-depend-on-the-.patch
new file mode 100644
index 0000000000..946565c402
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0102-v3dv-make-v3dv_viewport_compute_xform-depend-on-the-.patch
@@ -0,0 +1,155 @@
+From 07cba940af2fe0c40641816bee280b57a40973fb Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 20 Oct 2021 11:22:11 +0200
+Subject: [PATCH 102/142] v3dv: make v3dv_viewport_compute_xform depend on the
+ V3D version
+
+For 4.x we have a workaround for too small Z scale values that is
+not required for V3D 7.x.
+---
+ src/broadcom/vulkan/v3dv_cmd_buffer.c  | 40 +++-----------------------
+ src/broadcom/vulkan/v3dv_pipeline.c    |  7 +++--
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 37 ++++++++++++++++++++++++
+ src/broadcom/vulkan/v3dvx_private.h    |  5 ++++
+ 4 files changed, 50 insertions(+), 39 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+index 96360a96b44..bda0a614523 100644
+--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+@@ -2131,39 +2131,6 @@ v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
+    }
+ }
+ 
+-/* FIXME: C&P from radv. tu has similar code. Perhaps common place? */
+-void
+-v3dv_viewport_compute_xform(const VkViewport *viewport,
+-                            float scale[3],
+-                            float translate[3])
+-{
+-   float x = viewport->x;
+-   float y = viewport->y;
+-   float half_width = 0.5f * viewport->width;
+-   float half_height = 0.5f * viewport->height;
+-   double n = viewport->minDepth;
+-   double f = viewport->maxDepth;
+-
+-   scale[0] = half_width;
+-   translate[0] = half_width + x;
+-   scale[1] = half_height;
+-   translate[1] = half_height + y;
+-
+-   scale[2] = (f - n);
+-   translate[2] = n;
+-
+-   /* It seems that if the scale is small enough the hardware won't clip
+-    * correctly so we work around this my choosing the smallest scale that
+-    * seems to work.
+-    *
+-    * This case is exercised by CTS:
+-    * dEQP-VK.draw.inverted_depth_ranges.nodepthclamp_deltazero
+-    */
+-   const float min_abs_scale = 0.000009f;
+-   if (fabs(scale[2]) < min_abs_scale)
+-      scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
+-}
+-
+ /* Considers the pipeline's negative_one_to_one state and applies it to the
+  * current viewport transform if needed to produce the resulting Z translate
+  * and scale parameters.
+@@ -2216,9 +2183,10 @@ v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
+           viewportCount * sizeof(*pViewports));
+ 
+    for (uint32_t i = firstViewport; i < total_count; i++) {
+-      v3dv_viewport_compute_xform(&state->dynamic.viewport.viewports[i],
+-                                  state->dynamic.viewport.scale[i],
+-                                  state->dynamic.viewport.translate[i]);
++      v3dv_X(cmd_buffer->device, viewport_compute_xform)
++         (&state->dynamic.viewport.viewports[i],
++          state->dynamic.viewport.scale[i],
++          state->dynamic.viewport.translate[i]);
+    }
+ 
+    cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEWPORT;
+diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
+index d012ff8f948..2156176d4cc 100644
+--- a/src/broadcom/vulkan/v3dv_pipeline.c
++++ b/src/broadcom/vulkan/v3dv_pipeline.c
+@@ -2661,9 +2661,10 @@ pipeline_init_dynamic_state(
+                       pViewportState->viewportCount);
+ 
+          for (uint32_t i = 0; i < dynamic->viewport.count; i++) {
+-            v3dv_viewport_compute_xform(&dynamic->viewport.viewports[i],
+-                                        dynamic->viewport.scale[i],
+-                                        dynamic->viewport.translate[i]);
++            v3dv_X(pipeline->device, viewport_compute_xform)
++               (&dynamic->viewport.viewports[i],
++                dynamic->viewport.scale[i],
++                dynamic->viewport.translate[i]);
+          }
+       }
+ 
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 750486a6ccf..f7c13a22423 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1285,6 +1285,43 @@ v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
+    cl_emit(rcl, END_OF_RENDERING, end);
+ }
+ 
++void
++v3dX(viewport_compute_xform)(const VkViewport *viewport,
++                            float scale[3],
++                            float translate[3])
++{
++   float x = viewport->x;
++   float y = viewport->y;
++   float half_width = 0.5f * viewport->width;
++   float half_height = 0.5f * viewport->height;
++   double n = viewport->minDepth;
++   double f = viewport->maxDepth;
++
++   scale[0] = half_width;
++   translate[0] = half_width + x;
++   scale[1] = half_height;
++   translate[1] = half_height + y;
++
++   scale[2] = (f - n);
++   translate[2] = n;
++
++   /* It seems that if the scale is small enough the hardware won't clip
++    * correctly so we work around this by choosing the smallest scale that
++    * seems to work.
++    *
++    * This case is exercised by CTS:
++    * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
++    *
++    * V3D 7.x fixes this by using the new
++    * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
++    */
++#if V3D_VERSION <= 42
++   const float min_abs_scale = 0.0005f;
++   if (fabs(scale[2]) < min_abs_scale)
++      scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
++#endif
++}
++
+ void
+ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
+ {
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index 036ce11b455..81715520913 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -339,3 +339,8 @@ v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+ uint32_t
+ v3dX(clamp_for_format_and_type)(uint32_t rt_type,
+                                 VkFormat vk_format);
++
++void
++v3dX(viewport_compute_xform)(const VkViewport *viewport,
++                             float scale[3],
++                             float translate[3]);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0103-v3dv-fix-depth-clipping-then-Z-scale-is-too-small-in.patch b/projects/RPi/devices/RPi5/patches/mesa/0103-v3dv-fix-depth-clipping-then-Z-scale-is-too-small-in.patch
new file mode 100644
index 0000000000..82f934720c
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0103-v3dv-fix-depth-clipping-then-Z-scale-is-too-small-in.patch
@@ -0,0 +1,51 @@
+From c6b60ee47c50474030f8a0a92bd4c6a071f926dc Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 14 Feb 2023 10:09:53 +0100
+Subject: [PATCH 103/142] v3dv: fix depth clipping then Z scale is too small in
+ V3D 7.x
+
+When the Z scale is too small guardband clipping may not clip
+correctly, so disable it, which is a new option in V3D 7.x.
+
+This fixes this test in V3D 7.x without needing any workarounds:
+dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index f7c13a22423..3566649aafd 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1363,10 +1363,28 @@ v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
+    v3dv_cmd_buffer_state_get_viewport_z_xform(&cmd_buffer->state, 0,
+                                               &translate_z, &scale_z);
+ 
++#if V3D_VERSION == 42
+    cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+       clip.viewport_z_offset_zc_to_zs = translate_z;
+       clip.viewport_z_scale_zc_to_zs = scale_z;
+    }
++#endif
++
++#if V3D_VERSION >= 71
++   /* If the Z scale is too small guardband clipping may not clip correctly */
++   if (fabsf(scale_z) < 0.01f) {
++      cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
++         clip.viewport_z_offset_zc_to_zs = translate_z;
++         clip.viewport_z_scale_zc_to_zs = scale_z;
++      }
++   } else {
++      cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
++         clip.viewport_z_offset_zc_to_zs = translate_z;
++         clip.viewport_z_scale_zc_to_zs = scale_z;
++      }
++   }
++#endif
++
+    cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
+       /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
+        * we are using OpenGL's [-1, 1] instead.
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0104-v3d-add-a-non-conformant-warning-for-not-fully-suppo.patch b/projects/RPi/devices/RPi5/patches/mesa/0104-v3d-add-a-non-conformant-warning-for-not-fully-suppo.patch
new file mode 100644
index 0000000000..83c6351641
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0104-v3d-add-a-non-conformant-warning-for-not-fully-suppo.patch
@@ -0,0 +1,30 @@
+From 46e2b22f43290e6fe92f5435af174c4b18bb6ef5 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 22:52:47 +0200
+Subject: [PATCH 104/142] v3d: add a non-conformant warning for not fully
+ supported hw
+
+---
+ src/gallium/drivers/v3d/v3d_screen.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
+index 98ca9bb69e6..efdb7d615ae 100644
+--- a/src/gallium/drivers/v3d/v3d_screen.c
++++ b/src/gallium/drivers/v3d/v3d_screen.c
+@@ -922,6 +922,12 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
+         if (!v3d_get_device_info(screen->fd, &screen->devinfo, &v3d_ioctl))
+                 goto fail;
+ 
++        if (screen->devinfo.ver >= 71) {
++                fprintf(stderr, "WARNING: v3d support for hw version %i is neither "
++                        "a complete nor a conformant OpenGL implementation. Testing "
++                        "use only.\n", screen->devinfo.ver);
++        }
++
+         driParseConfigFiles(config->options, config->options_info, 0, "v3d",
+                             NULL, NULL, NULL, 0, NULL, 0);
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0105-v3d-add-v71-hw-generation.patch b/projects/RPi/devices/RPi5/patches/mesa/0105-v3d-add-v71-hw-generation.patch
new file mode 100644
index 0000000000..07bed87a0c
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0105-v3d-add-v71-hw-generation.patch
@@ -0,0 +1,336 @@
+From 46ffdc57ac7fbe71e92b22e1fe93185f3d33a3ac Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 23 May 2023 23:32:37 +0200
+Subject: [PATCH 105/142] v3d: add v71 hw generation
+
+Starting point for v71 version inclusion:
+ * Adds as one of the versions to be compiled on meson
+ * Updated the v3d_X and v3dX macros to include version 71
+ * Update the code enough to get it building when using v71.
+
+Any real v71 support will be implemented on following commits.
+---
+ src/gallium/drivers/v3d/meson.build   |  2 +-
+ src/gallium/drivers/v3d/v3d_context.h | 22 +++++++++++++----
+ src/gallium/drivers/v3d/v3dx_draw.c   | 21 +++++++++++++---
+ src/gallium/drivers/v3d/v3dx_emit.c   | 11 +++++++++
+ src/gallium/drivers/v3d/v3dx_rcl.c    | 35 ++++++++++++++++++++++-----
+ src/gallium/drivers/v3d/v3dx_state.c  | 12 +++++++++
+ 6 files changed, 88 insertions(+), 15 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
+index dfa1e88097b..526a131ae9b 100644
+--- a/src/gallium/drivers/v3d/meson.build
++++ b/src/gallium/drivers/v3d/meson.build
+@@ -58,7 +58,7 @@ if dep_v3dv3.found()
+   v3d_args += '-DUSE_V3D_SIMULATOR'
+ endif
+ 
+-v3d_versions = ['33', '42']
++v3d_versions = ['33', '42', '71']
+ 
+ per_version_libs = []
+ foreach ver : v3d_versions
+diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
+index 97850b0363e..ad267d5033c 100644
+--- a/src/gallium/drivers/v3d/v3d_context.h
++++ b/src/gallium/drivers/v3d/v3d_context.h
+@@ -818,13 +818,21 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
+ 
+ /* Helper to call hw ver specific functions */
+ #define v3d_X(devinfo, thing) ({                                \
+-        __typeof(&v3d42_##thing) v3d_X_thing;                   \
+-        if ((devinfo)->ver >= 42)                               \
+-                v3d_X_thing = &v3d42_##thing;                   \
+-        else if ((devinfo)->ver >= 33)                          \
++        __typeof(&v3d33_##thing) v3d_X_thing;                   \
++        switch ((devinfo)->ver) {                               \
++        case 33:                                                \
++        case 40:                                                \
+                 v3d_X_thing = &v3d33_##thing;                   \
+-        else                                                    \
++                break;                                          \
++        case 42:                                                \
++                v3d_X_thing = &v3d42_##thing;                   \
++                break;                                          \
++        case 71:                                                \
++                v3d_X_thing = &v3d71_##thing;                   \
++                break;                                          \
++        default:                                                \
+                 unreachable("Unsupported hardware generation"); \
++        }                                                       \
+         v3d_X_thing;                                            \
+ })
+ 
+@@ -838,6 +846,10 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
+ #  define v3dX(x) v3d42_##x
+ #  include "v3dx_context.h"
+ #  undef v3dX
++
++#  define v3dX(x) v3d71_##x
++#  include "v3dx_context.h"
++#  undef v3dX
+ #endif
+ 
+ #endif /* V3D_CONTEXT_H */
+diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
+index 17442500ea9..2c74c5973c9 100644
+--- a/src/gallium/drivers/v3d/v3dx_draw.c
++++ b/src/gallium/drivers/v3d/v3dx_draw.c
+@@ -95,7 +95,11 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
+ #endif
+ 
+         assert(!job->msaa || !job->double_buffer);
+-#if V3D_VERSION >= 40
++#if V3D_VERSION >= 71
++        unreachable("HW generation 71 not supported yet.");
++#endif
++
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
+         cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
+                 config.width_in_pixels = job->draw_width;
+                 config.height_in_pixels = job->draw_height;
+@@ -107,7 +111,8 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
+ 
+                 config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+         }
+-#else /* V3D_VERSION < 40 */
++#endif
++#if V3D_VERSION < 40
+         /* "Binning mode lists start with a Tile Binning Mode Configuration
+          * item (120)"
+          *
+@@ -134,7 +139,7 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
+ 
+                 config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+         }
+-#endif /* V3D_VERSION < 40 */
++#endif
+ 
+         /* There's definitely nothing in the VCD cache we want. */
+         cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
+@@ -655,10 +660,15 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
+                 /* XXX: Use combined input/output size flag in the common
+                  * case.
+                  */
++#if V3D_VERSION <= 42
+                 shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
+                         v3d->prog.cs->prog_data.vs->separate_segments;
+                 shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+                         v3d->prog.vs->prog_data.vs->separate_segments;
++#endif
++#if V3D_VERSION >= 71
++                unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+                 shader.coordinate_shader_input_vpm_segment_size =
+                         v3d->prog.cs->prog_data.vs->separate_segments ?
+@@ -724,9 +734,14 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
+                 shader.instance_id_read_by_vertex_shader =
+                         v3d->prog.vs->prog_data.vs->uses_iid;
+ 
++#if V3D_VERSION <= 42
+                 shader.address_of_default_attribute_values =
+                         cl_address(v3d_resource(vtx->defaults)->bo,
+                                    vtx->defaults_offset);
++#endif
++#if V3D_VERSION >= 71
++                unreachable("HW generation 71 not supported yet.");
++#endif
+         }
+ 
+         bool cs_loaded_any = false;
+diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
+index 0ad3fb68b1e..5af3d03b337 100644
+--- a/src/gallium/drivers/v3d/v3dx_emit.c
++++ b/src/gallium/drivers/v3d/v3dx_emit.c
+@@ -512,6 +512,7 @@ v3dX(emit_state)(struct pipe_context *pctx)
+                         /* Note: EZ state may update based on the compiled FS,
+                          * along with ZSA
+                          */
++#if V3D_VERSION <= 42
+                         config.early_z_updates_enable =
+                                 (job->ez_state != V3D_EZ_DISABLED);
+                         if (v3d->zsa->base.depth_enabled) {
+@@ -524,6 +525,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
+                         } else {
+                                 config.depth_test_function = PIPE_FUNC_ALWAYS;
+                         }
++#endif
++#if V3D_VERSION >= 71
++                        unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+                         config.stencil_enable =
+                                 v3d->zsa->base.stencil[0].enabled;
+@@ -564,12 +569,18 @@ v3dX(emit_state)(struct pipe_context *pctx)
+         }
+ 
+         if (v3d->dirty & V3D_DIRTY_VIEWPORT) {
++#if V3D_VERSION <= 42
+                 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
+                         clip.viewport_half_width_in_1_256th_of_pixel =
+                                 v3d->viewport.scale[0] * 256.0f;
+                         clip.viewport_half_height_in_1_256th_of_pixel =
+                                 v3d->viewport.scale[1] * 256.0f;
+                 }
++#endif
++#if V3D_VERSION >= 71
++                unreachable("HW generation 71 not supported yet.");
++#endif
++
+ 
+                 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
+                         clip.viewport_z_offset_zc_to_zs =
+diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
+index 82547437c25..166cc34e4ee 100644
+--- a/src/gallium/drivers/v3d/v3dx_rcl.c
++++ b/src/gallium/drivers/v3d/v3dx_rcl.c
+@@ -419,10 +419,16 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
+          * clearing Z/S.
+          */
+         if (job->clear) {
++#if V3D_VERSION <= 42
+                 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
+                         clear.clear_z_stencil_buffer = !job->early_zs_clear;
+                         clear.clear_all_render_targets = true;
+                 }
++#endif
++#if V3D_VERSION >= 71
++                unreachable("HW generation 71 not supported yet.");
++#endif
++
+         }
+ #endif /* V3D_VERSION >= 40 */
+ }
+@@ -483,7 +489,7 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
+         }
+ }
+ 
+-#if V3D_VERSION >= 40
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
+ static void
+ v3d_setup_render_target(struct v3d_job *job, int cbuf,
+                         uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
+@@ -507,9 +513,9 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf,
+         else
+                 *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
+ }
++#endif
+ 
+-#else /* V3D_VERSION < 40 */
+-
++#if V3D_VERSION < 40
+ static void
+ v3d_emit_z_stencil_config(struct v3d_job *job, struct v3d_surface *surf,
+                           struct v3d_resource *rsc, bool is_separate_stencil)
+@@ -656,7 +662,8 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+         cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                 store.buffer_to_store = NONE;
+         }
+-#else
++#endif
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
+         for (int i = 0; i < 2; i++) {
+                 if (i > 0)
+                         cl_emit(&job->rcl, TILE_COORDINATES, coords);
+@@ -673,6 +680,10 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+                 cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
+         }
+ #endif
++#if V3D_VERSION >= 71
++        unreachable("HW generation 71 not supported yet.");
++#endif
++
+ 
+         cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+ 
+@@ -775,7 +786,13 @@ v3dX(emit_rcl)(struct v3d_job *job)
+                 config.multisample_mode_4x = job->msaa;
+                 config.double_buffer_in_non_ms_mode = job->double_buffer;
+ 
++#if V3D_VERSION <= 42
+                 config.maximum_bpp_of_all_render_targets = job->internal_bpp;
++#endif
++#if V3D_VERSION >= 71
++                unreachable("HW generation 71 not supported yet.");
++#endif
++
+         }
+ 
+         for (int i = 0; i < job->nr_cbufs; i++) {
+@@ -786,7 +803,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
+                 struct v3d_resource *rsc = v3d_resource(psurf->texture);
+ 
+                 UNUSED uint32_t config_pad = 0;
+-                uint32_t clear_pad = 0;
++                UNUSED uint32_t clear_pad = 0;
+ 
+                 /* XXX: Set the pad for raster. */
+                 if (surf->tiling == V3D_TILING_UIF_NO_XOR ||
+@@ -819,6 +836,7 @@ v3dX(emit_rcl)(struct v3d_job *job)
+                 }
+ #endif /* V3D_VERSION < 40 */
+ 
++#if V3D_VERSION <= 42
+                 cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1,
+                         clear) {
+                         clear.clear_color_low_32_bits = job->clear_color[i][0];
+@@ -847,9 +865,10 @@ v3dX(emit_rcl)(struct v3d_job *job)
+                                 clear.render_target_number = i;
+                         };
+                 }
++#endif
+         }
+ 
+-#if V3D_VERSION >= 40
++#if V3D_VERSION >= 40 && V3D_VERSION <= 42
+         cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
+                 v3d_setup_render_target(job, 0,
+                                         &rt.render_target_0_internal_bpp,
+@@ -870,6 +889,10 @@ v3dX(emit_rcl)(struct v3d_job *job)
+         }
+ #endif
+ 
++#if V3D_VERSION >= 71
++        unreachable("HW generation 71 not supported yet.");
++#endif
++
+ #if V3D_VERSION < 40
+         /* FIXME: Don't bother emitting if we don't load/clear Z/S. */
+         if (job->zsbuf) {
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index 0f1735fee66..a93d5be091e 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -990,7 +990,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
+                                                                    cso->u.buf.size);
+                 }
+ 
++#if V3D_VERSION <= 42
+                 tex.srgb = util_format_is_srgb(cso->format);
++#endif
++
++#if V3D_VERSION >= 71
++                unreachable("HW generation 71 not supported yet.");
++#endif
+ 
+ #if V3D_VERSION >= 40
+                 tex.swizzle_r = v3d_translate_pipe_swizzle(so->swizzle[0]);
+@@ -1040,7 +1046,13 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
+                          * shader code if we wanted to read an MSAA sRGB
+                          * texture without sRGB decode.
+                          */
++#if V3D_VERSION <= 42
+                         tex.srgb = false;
++#endif
++#if V3D_VERSION >= 71
++                        unreachable("HW generation 71 not supported yet.");
++#endif
++
+                 } else {
+                         tex.texture_type = v3d_get_tex_format(&screen->devinfo,
+                                                               cso->format);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0106-v3d-emit-TILE_BINNING_MODE_CFG-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0106-v3d-emit-TILE_BINNING_MODE_CFG-for-v71.patch
new file mode 100644
index 0000000000..dafba1550e
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0106-v3d-emit-TILE_BINNING_MODE_CFG-for-v71.patch
@@ -0,0 +1,39 @@
+From 1ef6241854666a00d43401039809f2470d3a2cc0 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 20 Oct 2021 14:31:10 +0200
+Subject: [PATCH 106/142] v3d: emit TILE_BINNING_MODE_CFG for v71
+
+---
+ src/gallium/drivers/v3d/v3dx_draw.c | 16 +++++++++++++++-
+ 1 file changed, 15 insertions(+), 1 deletion(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
+index 2c74c5973c9..9f38baa0bbf 100644
+--- a/src/gallium/drivers/v3d/v3dx_draw.c
++++ b/src/gallium/drivers/v3d/v3dx_draw.c
+@@ -96,7 +96,21 @@ v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
+ 
+         assert(!job->msaa || !job->double_buffer);
+ #if V3D_VERSION >= 71
+-        unreachable("HW generation 71 not supported yet.");
++        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
++                config.width_in_pixels = job->draw_width;
++                config.height_in_pixels = job->draw_height;
++
++                config.log2_tile_width = log2_tile_size(job->tile_width);
++                config.log2_tile_height = log2_tile_size(job->tile_height);
++
++                /* FIXME: ideally we would like next assert on the packet header (as is
++                 * general, so also applies to GL). We would need to expand
++                 * gen_pack_header for that.
++                 */
++                assert(config.log2_tile_width == config.log2_tile_height ||
++                       config.log2_tile_width == config.log2_tile_height + 1);
++        }
++
+ #endif
+ 
+ #if V3D_VERSION >= 40 && V3D_VERSION <= 42
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0107-v3d-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0107-v3d-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch
new file mode 100644
index 0000000000..f3bfe3eac3
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0107-v3d-emit-TILE_RENDERING_MODE_CFG_COMMON-for-v71.patch
@@ -0,0 +1,44 @@
+From dfdfcf3853d7178acff288a368dfc169018c186a Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 20 Oct 2021 14:42:43 +0200
+Subject: [PATCH 107/142] v3d: emit TILE_RENDERING_MODE_CFG_COMMON for v71
+
+---
+ src/gallium/drivers/v3d/v3dx_rcl.c | 13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
+index 166cc34e4ee..3f5eb293c4e 100644
+--- a/src/gallium/drivers/v3d/v3dx_rcl.c
++++ b/src/gallium/drivers/v3d/v3dx_rcl.c
+@@ -23,8 +23,9 @@
+ 
+ #include "util/format/u_format.h"
+ #include "v3d_context.h"
+-#include "broadcom/common/v3d_tiling.h"
+ #include "broadcom/common/v3d_macros.h"
++#include "broadcom/common/v3d_tiling.h"
++#include "broadcom/common/v3d_util.h"
+ #include "broadcom/cle/v3dx_pack.h"
+ 
+ #define PIPE_CLEAR_COLOR_BUFFERS (PIPE_CLEAR_COLOR0 |                   \
+@@ -790,7 +791,15 @@ v3dX(emit_rcl)(struct v3d_job *job)
+                 config.maximum_bpp_of_all_render_targets = job->internal_bpp;
+ #endif
+ #if V3D_VERSION >= 71
+-                unreachable("HW generation 71 not supported yet.");
++                config.log2_tile_width = log2_tile_size(job->tile_width);
++                config.log2_tile_height = log2_tile_size(job->tile_height);
++
++                /* FIXME: ideally we would like the next assert on the packet header (as
++                 * it is general, so also applies to GL). We would need to expand
++                 * gen_pack_header for that.
++                 */
++                assert(config.log2_tile_width == config.log2_tile_height ||
++                       config.log2_tile_width == config.log2_tile_height + 1);
+ #endif
+ 
+         }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0108-v3d-TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1.patch b/projects/RPi/devices/RPi5/patches/mesa/0108-v3d-TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1.patch
new file mode 100644
index 0000000000..de56d89812
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0108-v3d-TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1.patch
@@ -0,0 +1,186 @@
+From 34b32f1ee504449e39529110631c389fa9e9e409 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 20 Oct 2021 15:12:15 +0200
+Subject: [PATCH 108/142] v3d: TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1
+
+---
+ src/gallium/drivers/v3d/v3dx_rcl.c | 130 +++++++++++++++++++++++++----
+ 1 file changed, 115 insertions(+), 15 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
+index 3f5eb293c4e..815e1098c22 100644
+--- a/src/gallium/drivers/v3d/v3dx_rcl.c
++++ b/src/gallium/drivers/v3d/v3dx_rcl.c
+@@ -490,10 +490,86 @@ v3d_rcl_emit_generic_per_tile_list(struct v3d_job *job, int layer)
+         }
+ }
+ 
++#if V3D_VERSION > 33
++/* Note that for v71, render target cfg packets have just one field that
++ * combines the internal type and clamp mode. For simplicity we keep just one
++ * helper.
++ *
++ * Note: rt_type is in fact a "enum V3DX(Internal_Type)".
++ *
++ */
++static uint32_t
++v3dX(clamp_for_format_and_type)(uint32_t rt_type,
++                                enum pipe_format format)
++{
++#if V3D_VERSION == 42
++        if (util_format_is_pure_integer(format)) {
++                return V3D_RENDER_TARGET_CLAMP_INT;
++        } else if (util_format_is_srgb(format)) {
++                return V3D_RENDER_TARGET_CLAMP_NORM;
++        } else {
++                return V3D_RENDER_TARGET_CLAMP_NONE;
++        }
++#endif
++#if V3D_VERSION >= 71
++        switch (rt_type) {
++        case V3D_INTERNAL_TYPE_8I:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
++        case V3D_INTERNAL_TYPE_8UI:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
++        case V3D_INTERNAL_TYPE_8:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_8;
++        case V3D_INTERNAL_TYPE_16I:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
++        case V3D_INTERNAL_TYPE_16UI:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
++        case V3D_INTERNAL_TYPE_16F:
++                return util_format_is_srgb(format) ?
++                        V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
++                        V3D_RENDER_TARGET_TYPE_CLAMP_16F;
++        case V3D_INTERNAL_TYPE_32I:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
++        case V3D_INTERNAL_TYPE_32UI:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
++        case V3D_INTERNAL_TYPE_32F:
++                return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
++        default:
++                unreachable("Unknown internal render target type");
++        }
++        return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
++#endif
++        return 0;
++}
++#endif
++
++#if V3D_VERSION >= 71
++static void
++v3d_setup_render_target(struct v3d_job *job,
++                        int cbuf,
++                        uint32_t *rt_bpp,
++                        uint32_t *rt_type_clamp)
++{
++        if (!job->cbufs[cbuf])
++                return;
++
++        struct v3d_surface *surf = v3d_surface(job->cbufs[cbuf]);
++        *rt_bpp = surf->internal_bpp;
++        if (job->bbuf) {
++           struct v3d_surface *bsurf = v3d_surface(job->bbuf);
++           *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
++        }
++        *rt_type_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
++                                                         surf->base.format);
++}
++#endif
++
+ #if V3D_VERSION >= 40 && V3D_VERSION <= 42
+ static void
+-v3d_setup_render_target(struct v3d_job *job, int cbuf,
+-                        uint32_t *rt_bpp, uint32_t *rt_type, uint32_t *rt_clamp)
++v3d_setup_render_target(struct v3d_job *job,
++                        int cbuf,
++                        uint32_t *rt_bpp,
++                        uint32_t *rt_type,
++                        uint32_t *rt_clamp)
+ {
+         if (!job->cbufs[cbuf])
+                 return;
+@@ -505,14 +581,8 @@ v3d_setup_render_target(struct v3d_job *job, int cbuf,
+            *rt_bpp = MAX2(*rt_bpp, bsurf->internal_bpp);
+         }
+         *rt_type = surf->internal_type;
+-        if (util_format_is_srgb(surf->base.format))
+-                *rt_clamp = V3D_RENDER_TARGET_CLAMP_NORM;
+-#if V3D_VERSION >= 42
+-        else if (util_format_is_pure_integer(surf->base.format))
+-                *rt_clamp = V3D_RENDER_TARGET_CLAMP_INT;
+-#endif
+-        else
+-                *rt_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
++        *rt_clamp = v3dX(clamp_for_format_and_type)(surf->internal_type,
++                                                    surf->base.format);
+ }
+ #endif
+ 
+@@ -804,10 +874,30 @@ v3dX(emit_rcl)(struct v3d_job *job)
+ 
+         }
+ 
++#if V3D_VERSION >= 71
++        uint32_t base_addr = 0;
++
++        /* If we don't have any color RTs, we still need to emit one and flag
++         * it as not used using stride = 1
++         */
++        if (job->nr_cbufs == 0) {
++           cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++              rt.stride = 1; /* Unused */
++           }
++        }
++#endif
+         for (int i = 0; i < job->nr_cbufs; i++) {
+                 struct pipe_surface *psurf = job->cbufs[i];
+-                if (!psurf)
++                if (!psurf) {
++#if V3D_VERSION >= 71
++                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++                                rt.render_target_number = i;
++                                rt.stride = 1; /* Unused */
++                        }
++#endif
+                         continue;
++                }
++
+                 struct v3d_surface *surf = v3d_surface(psurf);
+                 struct v3d_resource *rsc = v3d_resource(psurf->texture);
+ 
+@@ -874,6 +964,20 @@ v3dX(emit_rcl)(struct v3d_job *job)
+                                 clear.render_target_number = i;
+                         };
+                 }
++#endif
++#if V3D_VERSION >= 71
++                cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
++                        rt.clear_color_low_bits = job->clear_color[i][0];
++                        v3d_setup_render_target(job, i, &rt.internal_bpp,
++                                                &rt.internal_type_and_clamping);
++                        rt.stride =
++                                v3d_compute_rt_row_row_stride_128_bits(job->tile_width,
++                                                                       v3d_internal_bpp_words(rt.internal_bpp));
++                        rt.base_address = base_addr;
++                        rt.render_target_number = i;
++
++                        base_addr += (job->tile_height * rt.stride) / 8;
++                }
+ #endif
+         }
+ 
+@@ -898,10 +1002,6 @@ v3dX(emit_rcl)(struct v3d_job *job)
+         }
+ #endif
+ 
+-#if V3D_VERSION >= 71
+-        unreachable("HW generation 71 not supported yet.");
+-#endif
+-
+ #if V3D_VERSION < 40
+         /* FIXME: Don't bother emitting if we don't load/clear Z/S. */
+         if (job->zsbuf) {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0109-v3d-emit-CLEAR_RENDER_TARGETS-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0109-v3d-emit-CLEAR_RENDER_TARGETS-for-v71.patch
new file mode 100644
index 0000000000..fbb87ab660
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0109-v3d-emit-CLEAR_RENDER_TARGETS-for-v71.patch
@@ -0,0 +1,60 @@
+From 8496282476420e7e5d9d31f6cfd87f3f3b136446 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 01:47:29 +0200
+Subject: [PATCH 109/142] v3d: emit CLEAR_RENDER_TARGETS for v71
+
+---
+ src/gallium/drivers/v3d/v3dx_rcl.c | 14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
+index 815e1098c22..4274be042bd 100644
+--- a/src/gallium/drivers/v3d/v3dx_rcl.c
++++ b/src/gallium/drivers/v3d/v3dx_rcl.c
+@@ -427,7 +427,7 @@ v3d_rcl_emit_stores(struct v3d_job *job, struct v3d_cl *cl, int layer)
+                 }
+ #endif
+ #if V3D_VERSION >= 71
+-                unreachable("HW generation 71 not supported yet.");
++                cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
+ #endif
+ 
+         }
+@@ -734,7 +734,7 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+                 store.buffer_to_store = NONE;
+         }
+ #endif
+-#if V3D_VERSION >= 40 && V3D_VERSION <= 42
++#if V3D_VERSION >= 40
+         for (int i = 0; i < 2; i++) {
+                 if (i > 0)
+                         cl_emit(&job->rcl, TILE_COORDINATES, coords);
+@@ -742,20 +742,20 @@ emit_render_layer(struct v3d_job *job, uint32_t layer)
+                 cl_emit(&job->rcl, STORE_TILE_BUFFER_GENERAL, store) {
+                         store.buffer_to_store = NONE;
+                 }
++
+                 if (i == 0 || do_double_initial_tile_clear(job)) {
++#if V3D_VERSION < 71
+                         cl_emit(&job->rcl, CLEAR_TILE_BUFFERS, clear) {
+                                 clear.clear_z_stencil_buffer = !job->early_zs_clear;
+                                 clear.clear_all_render_targets = true;
+                         }
++#else
++                        cl_emit(&job->rcl, CLEAR_RENDER_TARGETS, clear);
++#endif
+                 }
+                 cl_emit(&job->rcl, END_OF_TILE_MARKER, end);
+         }
+ #endif
+-#if V3D_VERSION >= 71
+-        unreachable("HW generation 71 not supported yet.");
+-#endif
+-
+-
+         cl_emit(&job->rcl, FLUSH_VCD_CACHE, flush);
+ 
+         v3d_rcl_emit_generic_per_tile_list(job, layer);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0110-v3d-just-don-t-fill-up-early-z-fields-for-CFG_BITS-f.patch b/projects/RPi/devices/RPi5/patches/mesa/0110-v3d-just-don-t-fill-up-early-z-fields-for-CFG_BITS-f.patch
new file mode 100644
index 0000000000..e3dbb971af
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0110-v3d-just-don-t-fill-up-early-z-fields-for-CFG_BITS-f.patch
@@ -0,0 +1,43 @@
+From 4de1ace1c7b3b6436a5de8e4c6a2f52d6308ff5c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 13:09:03 +0200
+Subject: [PATCH 110/142] v3d: just don't fill up early-z fields for CFG_BITS
+ for v71
+
+v71 doesn't include early_z_enable/early_z_updates_enable. They are
+configured with packet 121.
+---
+ src/gallium/drivers/v3d/v3dx_emit.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
+index 5af3d03b337..de05ae29d04 100644
+--- a/src/gallium/drivers/v3d/v3dx_emit.c
++++ b/src/gallium/drivers/v3d/v3dx_emit.c
+@@ -515,20 +515,19 @@ v3dX(emit_state)(struct pipe_context *pctx)
+ #if V3D_VERSION <= 42
+                         config.early_z_updates_enable =
+                                 (job->ez_state != V3D_EZ_DISABLED);
++#endif
+                         if (v3d->zsa->base.depth_enabled) {
+                                 config.z_updates_enable =
+                                         v3d->zsa->base.depth_writemask;
++#if V3D_VERSION <= 42
+                                 config.early_z_enable =
+                                         config.early_z_updates_enable;
++#endif
+                                 config.depth_test_function =
+                                         v3d->zsa->base.depth_func;
+                         } else {
+                                 config.depth_test_function = PIPE_FUNC_ALWAYS;
+                         }
+-#endif
+-#if V3D_VERSION >= 71
+-                        unreachable("HW generation 71 not supported yet.");
+-#endif
+ 
+                         config.stencil_enable =
+                                 v3d->zsa->base.stencil[0].enabled;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0111-v3d-emit-CLIPPER_XY_SCALING-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0111-v3d-emit-CLIPPER_XY_SCALING-for-v71.patch
new file mode 100644
index 0000000000..78e45af498
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0111-v3d-emit-CLIPPER_XY_SCALING-for-v71.patch
@@ -0,0 +1,30 @@
+From 0683f6db1cd50659829fe53f49427bfdacb707b6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 13:14:32 +0200
+Subject: [PATCH 111/142] v3d: emit CLIPPER_XY_SCALING for v71
+
+---
+ src/gallium/drivers/v3d/v3dx_emit.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
+index de05ae29d04..58c886bb29e 100644
+--- a/src/gallium/drivers/v3d/v3dx_emit.c
++++ b/src/gallium/drivers/v3d/v3dx_emit.c
+@@ -577,7 +577,12 @@ v3dX(emit_state)(struct pipe_context *pctx)
+                 }
+ #endif
+ #if V3D_VERSION >= 71
+-                unreachable("HW generation 71 not supported yet.");
++                cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
++                        clip.viewport_half_width_in_1_64th_of_pixel =
++                                v3d->viewport.scale[0] * 64.0f;
++                        clip.viewport_half_height_in_1_64th_of_pixel =
++                                v3d->viewport.scale[1] * 64.0f;
++                }
+ #endif
+ 
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0112-v3d-no-specific-separate_segments-flag-for-V3D-7.1.patch b/projects/RPi/devices/RPi5/patches/mesa/0112-v3d-no-specific-separate_segments-flag-for-V3D-7.1.patch
new file mode 100644
index 0000000000..cf420be0f5
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0112-v3d-no-specific-separate_segments-flag-for-V3D-7.1.patch
@@ -0,0 +1,53 @@
+From 1d1aa5ce739644c72b44ffe547b7233ad19e26b5 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 13:19:49 +0200
+Subject: [PATCH 112/142] v3d: no specific separate_segments flag for V3D 7.1
+
+On V3D 7.1 there is not a flag on the Shader State Record to specify
+if we are using shared or separate segments. This is done by setting
+the vpm input size to 0 (so we need to ensure that the output would be
+the max needed for input/output).
+
+We were already doing the latter on the prog_data_vs, so we just need
+to use those values, instead of assigning default values.
+---
+ src/gallium/drivers/v3d/v3dx_draw.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
+index 9f38baa0bbf..dd13e5177fe 100644
+--- a/src/gallium/drivers/v3d/v3dx_draw.c
++++ b/src/gallium/drivers/v3d/v3dx_draw.c
+@@ -679,17 +679,24 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
+                         v3d->prog.cs->prog_data.vs->separate_segments;
+                 shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+                         v3d->prog.vs->prog_data.vs->separate_segments;
+-#endif
+-#if V3D_VERSION >= 71
+-                unreachable("HW generation 71 not supported yet.");
+-#endif
+-
+                 shader.coordinate_shader_input_vpm_segment_size =
+                         v3d->prog.cs->prog_data.vs->separate_segments ?
+                         v3d->prog.cs->prog_data.vs->vpm_input_size : 1;
+                 shader.vertex_shader_input_vpm_segment_size =
+                         v3d->prog.vs->prog_data.vs->separate_segments ?
+                         v3d->prog.vs->prog_data.vs->vpm_input_size : 1;
++#endif
++                /* On V3D 7.1 there isn't a specific flag to set if we are using
++                 * shared/separate segments or not. We just set the value of
++                 * vpm_input_size to 0, and set output to the max needed. That should be
++                 * already properly set on prog_data_vs_bin
++                 */
++#if V3D_VERSION == 71
++                shader.coordinate_shader_input_vpm_segment_size =
++                        v3d->prog.cs->prog_data.vs->vpm_input_size;
++                shader.vertex_shader_input_vpm_segment_size =
++                        v3d->prog.vs->prog_data.vs->vpm_input_size;
++#endif
+ 
+                 shader.coordinate_shader_output_vpm_segment_size =
+                         v3d->prog.cs->prog_data.vs->vpm_output_size;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0113-v3d-default-vertex-attributes-values-are-not-needed-.patch b/projects/RPi/devices/RPi5/patches/mesa/0113-v3d-default-vertex-attributes-values-are-not-needed-.patch
new file mode 100644
index 0000000000..b3e7369ea0
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0113-v3d-default-vertex-attributes-values-are-not-needed-.patch
@@ -0,0 +1,113 @@
+From 3a790ddd27c8406c59426599fb9cadb5de5c024d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 13:37:46 +0200
+Subject: [PATCH 113/142] v3d: default vertex attributes values are not needed
+ for v71
+
+---
+ src/gallium/drivers/v3d/v3d_context.h |  1 +
+ src/gallium/drivers/v3d/v3dx_draw.c   |  3 --
+ src/gallium/drivers/v3d/v3dx_state.c  | 53 ++++++++++++++++++---------
+ 3 files changed, 37 insertions(+), 20 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
+index ad267d5033c..c0aac741fdc 100644
+--- a/src/gallium/drivers/v3d/v3d_context.h
++++ b/src/gallium/drivers/v3d/v3d_context.h
+@@ -265,6 +265,7 @@ struct v3d_vertex_stateobj {
+         unsigned num_elements;
+ 
+         uint8_t attrs[16 * (V3D_MAX_VS_INPUTS / 4)];
++        /* defaults can be NULL for some hw generations */
+         struct pipe_resource *defaults;
+         uint32_t defaults_offset;
+ };
+diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
+index dd13e5177fe..4bff2ea6478 100644
+--- a/src/gallium/drivers/v3d/v3dx_draw.c
++++ b/src/gallium/drivers/v3d/v3dx_draw.c
+@@ -759,9 +759,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
+                 shader.address_of_default_attribute_values =
+                         cl_address(v3d_resource(vtx->defaults)->bo,
+                                    vtx->defaults_offset);
+-#endif
+-#if V3D_VERSION >= 71
+-                unreachable("HW generation 71 not supported yet.");
+ #endif
+         }
+ 
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index a93d5be091e..3d3c4fb0f47 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -337,6 +337,20 @@ v3d_zsa_state_bind(struct pipe_context *pctx, void *hwcso)
+         v3d->dirty |= V3D_DIRTY_ZSA;
+ }
+ 
++
++static bool
++needs_default_attribute_values(void)
++{
++#if V3D_VERSION <= 42
++        /* FIXME: on vulkan we are able to refine even further, as we know in
++         * advance when we create the pipeline if we have an integer vertex
++         * attrib. Pending to check if we could do something similar here.
++         */
++        return true;
++#endif
++        return false;
++}
++
+ static void *
+ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
+                         const struct pipe_vertex_element *elements)
+@@ -414,24 +428,29 @@ v3d_vertex_state_create(struct pipe_context *pctx, unsigned num_elements,
+                 }
+         }
+ 
+-        /* Set up the default attribute values in case any of the vertex
+-         * elements use them.
+-         */
+-        uint32_t *attrs;
+-        u_upload_alloc(v3d->state_uploader, 0,
+-                       V3D_MAX_VS_INPUTS * sizeof(float), 16,
+-                       &so->defaults_offset, &so->defaults, (void **)&attrs);
+-
+-        for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
+-                attrs[i * 4 + 0] = 0;
+-                attrs[i * 4 + 1] = 0;
+-                attrs[i * 4 + 2] = 0;
+-                if (i < so->num_elements &&
+-                    util_format_is_pure_integer(so->pipe[i].src_format)) {
+-                        attrs[i * 4 + 3] = 1;
+-                } else {
+-                        attrs[i * 4 + 3] = fui(1.0);
++        if (needs_default_attribute_values()) {
++                /* Set up the default attribute values in case any of the vertex
++                 * elements use them.
++                 */
++                uint32_t *attrs;
++                u_upload_alloc(v3d->state_uploader, 0,
++                               V3D_MAX_VS_INPUTS * sizeof(float), 16,
++                               &so->defaults_offset, &so->defaults, (void **)&attrs);
++
++                for (int i = 0; i < V3D_MAX_VS_INPUTS / 4; i++) {
++                        attrs[i * 4 + 0] = 0;
++                        attrs[i * 4 + 1] = 0;
++                        attrs[i * 4 + 2] = 0;
++                        if (i < so->num_elements &&
++                            util_format_is_pure_integer(so->pipe[i].src_format)) {
++                                attrs[i * 4 + 3] = 1;
++                        } else {
++                                attrs[i * 4 + 3] = fui(1.0);
++                        }
+                 }
++        } else {
++                so->defaults = NULL;
++                so->defaults_offset = 0;
+         }
+ 
+         u_upload_unmap(v3d->state_uploader);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0114-v3d-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for-.patch b/projects/RPi/devices/RPi5/patches/mesa/0114-v3d-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for-.patch
new file mode 100644
index 0000000000..d197620253
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0114-v3d-uniforms-update-VIEWPORT_X-Y_SCALE-uniforms-for-.patch
@@ -0,0 +1,100 @@
+From 8e3a2a35df5789687993d05436602821186e1cf2 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 13:46:11 +0200
+Subject: [PATCH 114/142] v3d/uniforms: update VIEWPORT_X/Y_SCALE uniforms for
+ v71
+
+As with the CLIPPER_XY_SCALING packet, this needs to be computed in
+1/64ths of a pixel instead of 1/256ths of a pixel.
+
+As these are the kind of values that we usually get from macros, we
+manually add v42 and v71 macros, and define a new helper to get them.
+
+Those granularity values are the same for Vulkan and OpenGL, so
+perhaps we should move them to a common place.
+
+As with v3dv, V3D_X macro name is somewhat confusing. It is
+specifically created to ask for define values that depends on the
+version. But I also felt that V3D_DEFINE_X was too long.
+---
+ src/gallium/drivers/v3d/v3d_context.h  | 28 ++++++++++++++++++++++++++
+ src/gallium/drivers/v3d/v3d_uniforms.c |  8 ++++++--
+ 2 files changed, 34 insertions(+), 2 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
+index c0aac741fdc..21ee10a90cc 100644
+--- a/src/gallium/drivers/v3d/v3d_context.h
++++ b/src/gallium/drivers/v3d/v3d_context.h
+@@ -837,6 +837,34 @@ void v3d_disk_cache_store(struct v3d_context *v3d,
+         v3d_X_thing;                                            \
+ })
+ 
++/* FIXME: The same for vulkan/opengl. Common place? define it at the
++ * v3d_packet files?
++ */
++#define V3D33_CLIPPER_XY_GRANULARITY 256.0f
++#define V3D42_CLIPPER_XY_GRANULARITY 256.0f
++#define V3D71_CLIPPER_XY_GRANULARITY 64.0f
++
++/* Helper to get hw-specific macro values */
++#define V3DV_X(devinfo, thing) ({                               \
++   __typeof(V3D33_##thing) V3D_X_THING;                         \
++   switch (devinfo->ver) {                                      \
++   case 33:                                                     \
++   case 40:                                                     \
++      V3D_X_THING = V3D33_##thing;                              \
++      break;                                                    \
++      case 41:                                                  \
++   case 42:                                                     \
++      V3D_X_THING = V3D42_##thing;                              \
++      break;                                                    \
++   case 71:                                                     \
++      V3D_X_THING = V3D71_##thing;                              \
++      break;                                                    \
++   default:                                                     \
++      unreachable("Unsupported hardware generation");           \
++   }                                                            \
++   V3D_X_THING;                                                 \
++})
++
+ #ifdef v3dX
+ #  include "v3dx_context.h"
+ #else
+diff --git a/src/gallium/drivers/v3d/v3d_uniforms.c b/src/gallium/drivers/v3d/v3d_uniforms.c
+index 95eb838954f..1b8758bae7d 100644
+--- a/src/gallium/drivers/v3d/v3d_uniforms.c
++++ b/src/gallium/drivers/v3d/v3d_uniforms.c
+@@ -261,6 +261,7 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
+                    struct v3d_compiled_shader *shader,
+                    enum pipe_shader_type stage)
+ {
++        struct v3d_device_info *devinfo = &v3d->screen->devinfo;
+         struct v3d_constbuf_stateobj *cb = &v3d->constbuf[stage];
+         struct v3d_texture_stateobj *texstate = &v3d->tex[stage];
+         struct v3d_uniform_list *uinfo = &shader->prog_data.base->uniforms;
+@@ -282,6 +283,9 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
+         struct v3d_cl_out *uniforms =
+                 cl_start(&job->indirect);
+ 
++        float clipper_xy_granularity =
++                V3DV_X(devinfo, CLIPPER_XY_GRANULARITY);
++
+         for (int i = 0; i < uinfo->count; i++) {
+                 uint32_t data = uinfo->data[i];
+ 
+@@ -293,10 +297,10 @@ v3d_write_uniforms(struct v3d_context *v3d, struct v3d_job *job,
+                         cl_aligned_u32(&uniforms, gallium_uniforms[data]);
+                         break;
+                 case QUNIFORM_VIEWPORT_X_SCALE:
+-                        cl_aligned_f(&uniforms, v3d->viewport.scale[0] * 256.0f);
++                        cl_aligned_f(&uniforms, v3d->viewport.scale[0] * clipper_xy_granularity);
+                         break;
+                 case QUNIFORM_VIEWPORT_Y_SCALE:
+-                        cl_aligned_f(&uniforms, v3d->viewport.scale[1] * 256.0f);
++                        cl_aligned_f(&uniforms, v3d->viewport.scale[1] * clipper_xy_granularity);
+                         break;
+ 
+                 case QUNIFORM_VIEWPORT_Z_OFFSET:
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0115-v3d-handle-new-texture-state-transfer-functions-in-v.patch b/projects/RPi/devices/RPi5/patches/mesa/0115-v3d-handle-new-texture-state-transfer-functions-in-v.patch
new file mode 100644
index 0000000000..e9f5e92927
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0115-v3d-handle-new-texture-state-transfer-functions-in-v.patch
@@ -0,0 +1,43 @@
+From aa6f70116d9e7be56cdb52b55d75419bf7209185 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Thu, 21 Oct 2021 23:21:02 +0200
+Subject: [PATCH 115/142] v3d: handle new texture state transfer functions in
+ v71
+
+---
+ src/gallium/drivers/v3d/v3dx_state.c | 9 +++------
+ 1 file changed, 3 insertions(+), 6 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index 3d3c4fb0f47..b5e572b13c5 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -1009,12 +1009,12 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
+                                                                    cso->u.buf.size);
+                 }
+ 
++                bool is_srgb = util_format_is_srgb(cso->format);
+ #if V3D_VERSION <= 42
+-                tex.srgb = util_format_is_srgb(cso->format);
++                tex.srgb = is_srgb;
+ #endif
+-
+ #if V3D_VERSION >= 71
+-                unreachable("HW generation 71 not supported yet.");
++                tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+ #endif
+ 
+ #if V3D_VERSION >= 40
+@@ -1068,9 +1068,6 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
+ #if V3D_VERSION <= 42
+                         tex.srgb = false;
+ #endif
+-#if V3D_VERSION >= 71
+-                        unreachable("HW generation 71 not supported yet.");
+-#endif
+ 
+                 } else {
+                         tex.texture_type = v3d_get_tex_format(&screen->devinfo,
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0116-v3d-handle-new-TEXTURE_SHADER_STATE-v71-YCbCr-fields.patch b/projects/RPi/devices/RPi5/patches/mesa/0116-v3d-handle-new-TEXTURE_SHADER_STATE-v71-YCbCr-fields.patch
new file mode 100644
index 0000000000..2ce6d66bd2
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0116-v3d-handle-new-TEXTURE_SHADER_STATE-v71-YCbCr-fields.patch
@@ -0,0 +1,62 @@
+From aefc98b6aefc38caa6f6efd421db6d02c42596a7 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 22 Oct 2021 10:54:24 +0200
+Subject: [PATCH 116/142] v3d: handle new TEXTURE_SHADER_STATE v71 YCbCr fields
+
+There are some new fields for YCbCr with pointers for the various
+planes in multi-planar formats. These need to match the base address
+pointer in the texture state, or the hardware will assume this is a
+multi-planar texture.
+
+Notice we don't use an address type for these fields in the XML
+description. This is because the addresses are 64-byte aligned (even
+though the PRM doesn't say it), which means the 6 least significant
+bits are implicitly 0, but the fields are encoded before the 6th bit
+of their starting byte. So we can't use the usual trick we do with
+address types, where the first 6 bits in the byte are implicitly
+overwritten by other fields, and we have to encode this manually as a
+uint field. This would mean that if we had an actual BO we would also
+need to add it manually to the job's list, but since we don't have
+one, we don't have to do anything about it.
+---
+ src/gallium/drivers/v3d/v3dx_state.c | 17 +++++++++++++----
+ 1 file changed, 13 insertions(+), 4 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index b5e572b13c5..c08a072157b 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -936,17 +936,26 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
+         }
+ 
+         tex->base_level = base_level;
++
+ #if V3D_VERSION >= 40
+         tex->max_level = last_level;
+         /* Note that we don't have a job to reference the texture's sBO
+          * at state create time, so any time this sampler view is used
+          * we need to add the texture to the job.
+          */
+-        tex->texture_base_pointer =
+-                cl_address(NULL,
+-                           rsc->bo->offset +
+-                           v3d_layer_offset(prsc, 0, first_layer));
++        const uint32_t base_offset = rsc->bo->offset +
++                v3d_layer_offset(prsc, 0, first_layer);
++
++        tex->texture_base_pointer = cl_address(NULL, base_offset);
+ #endif
++#if V3D_VERSION >= 71
++        tex->chroma_offset_x = 1;
++        tex->chroma_offset_y = 1;
++        /* See comment in XML field definition for rationale of the shifts */
++        tex->texture_base_pointer_cb = base_offset >> 6;
++        tex->texture_base_pointer_cr = base_offset >> 6;
++#endif
++
+         tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
+ 
+         /* Since other platform devices may produce UIF images even
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0117-v3d-setup-render-pass-color-clears-for-any-format-bp.patch b/projects/RPi/devices/RPi5/patches/mesa/0117-v3d-setup-render-pass-color-clears-for-any-format-bp.patch
new file mode 100644
index 0000000000..5f7cdbd03f
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0117-v3d-setup-render-pass-color-clears-for-any-format-bp.patch
@@ -0,0 +1,42 @@
+From fcb3fc1ead4344da59c4b26a81878d53f8f4a291 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 22 Oct 2021 11:40:49 +0200
+Subject: [PATCH 117/142] v3d: setup render pass color clears for any format
+ bpp in v71
+
+---
+ src/gallium/drivers/v3d/v3dx_rcl.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_rcl.c b/src/gallium/drivers/v3d/v3dx_rcl.c
+index 4274be042bd..d3fbc9aff5d 100644
+--- a/src/gallium/drivers/v3d/v3dx_rcl.c
++++ b/src/gallium/drivers/v3d/v3dx_rcl.c
+@@ -978,6 +978,24 @@ v3dX(emit_rcl)(struct v3d_job *job)
+ 
+                         base_addr += (job->tile_height * rt.stride) / 8;
+                 }
++
++                if (surf->internal_bpp >= V3D_INTERNAL_BPP_64) {
++                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
++                                rt.clear_color_mid_bits = /* 40 bits (32 + 8)  */
++                                        ((uint64_t) job->clear_color[i][1]) |
++                                        (((uint64_t) (job->clear_color[i][2] & 0xff)) << 32);
++                                rt.render_target_number = i;
++                        }
++                }
++
++                if (surf->internal_bpp >= V3D_INTERNAL_BPP_128) {
++                        cl_emit(&job->rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
++                                rt.clear_color_top_bits = /* 56 bits (24 + 32) */
++                                        (((uint64_t) (job->clear_color[i][2] & 0xffffff00)) >> 8) |
++                                        (((uint64_t) (job->clear_color[i][3])) << 24);
++                                rt.render_target_number = i;
++                        }
++                }
+ #endif
+         }
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0118-v3d-GFX-1461-does-not-affect-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0118-v3d-GFX-1461-does-not-affect-V3D-7.x.patch
new file mode 100644
index 0000000000..56e27cf09c
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0118-v3d-GFX-1461-does-not-affect-V3D-7.x.patch
@@ -0,0 +1,29 @@
+From ceb088c05f351b40df14069bd6e0de777288ece4 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 22 Oct 2021 12:17:45 +0200
+Subject: [PATCH 118/142] v3d: GFXH-1461 does not affect V3D 7.x
+
+---
+ src/gallium/drivers/v3d/v3dx_draw.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
+index 4bff2ea6478..04cc3bc3ae1 100644
+--- a/src/gallium/drivers/v3d/v3dx_draw.c
++++ b/src/gallium/drivers/v3d/v3dx_draw.c
+@@ -1593,9 +1593,10 @@ v3d_tlb_clear(struct v3d_job *job, unsigned buffers,
+         /* GFXH-1461: If we were to emit a load of just depth or just stencil,
+          * then the clear for the other may get lost.  We need to decide now
+          * if it would be possible to need to emit a load of just one after
+-         * we've set up our TLB clears.
++         * we've set up our TLB clears. This issue has been fixed since V3D 4.3.18.
+          */
+-        if (buffers & PIPE_CLEAR_DEPTHSTENCIL &&
++        if (v3d->screen->devinfo.ver <= 42 &&
++            buffers & PIPE_CLEAR_DEPTHSTENCIL &&
+             (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL &&
+             job->zsbuf &&
+             util_format_is_depth_and_stencil(job->zsbuf->texture->format)) {
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0119-v3d-don-t-convert-floating-point-border-colors-in-v7.patch b/projects/RPi/devices/RPi5/patches/mesa/0119-v3d-don-t-convert-floating-point-border-colors-in-v7.patch
new file mode 100644
index 0000000000..c3cdfc0355
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0119-v3d-don-t-convert-floating-point-border-colors-in-v7.patch
@@ -0,0 +1,55 @@
+From b44a8785c5436fb28b6734d3bac806d3a82c828d Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 22 Oct 2021 13:41:09 +0200
+Subject: [PATCH 119/142] v3d: don't convert floating point border colors in
+ v71
+
+The TMU does this for us now.
+---
+ src/gallium/drivers/v3d/v3dx_state.c | 29 ++++++++++++++--------------
+ 1 file changed, 15 insertions(+), 14 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index c08a072157b..348a7bcf3da 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -718,21 +718,22 @@ v3d_upload_sampler_state_variant(void *map,
+                                 break;
+                         }
+ 
+-                        if (variant >= V3D_SAMPLER_STATE_32) {
+-                                sampler.border_color_word_0 = border.ui[0];
+-                                sampler.border_color_word_1 = border.ui[1];
+-                                sampler.border_color_word_2 = border.ui[2];
+-                                sampler.border_color_word_3 = border.ui[3];
+-                        } else {
+-                                sampler.border_color_word_0 =
+-                                        _mesa_float_to_half(border.f[0]);
+-                                sampler.border_color_word_1 =
+-                                        _mesa_float_to_half(border.f[1]);
+-                                sampler.border_color_word_2 =
+-                                        _mesa_float_to_half(border.f[2]);
+-                                sampler.border_color_word_3 =
+-                                        _mesa_float_to_half(border.f[3]);
++#if V3D_VERSION <= 42
++                        /* The TMU in V3D 7.x always takes 32-bit floats and handles conversions
++                         * for us. In V3D 4.x we need to manually convert floating point color
++                         * values to the expected format.
++                         */
++                        if (variant < V3D_SAMPLER_STATE_32) {
++                                border.ui[0] = _mesa_float_to_half(border.f[0]);
++                                border.ui[1] = _mesa_float_to_half(border.f[1]);
++                                border.ui[2] = _mesa_float_to_half(border.f[2]);
++                                border.ui[3] = _mesa_float_to_half(border.f[3]);
+                         }
++#endif
++                        sampler.border_color_word_0 = border.ui[0];
++                        sampler.border_color_word_1 = border.ui[1];
++                        sampler.border_color_word_2 = border.ui[2];
++                        sampler.border_color_word_3 = border.ui[3];
+                 }
+         }
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0120-v3d-handle-Z-clipping-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0120-v3d-handle-Z-clipping-in-v71.patch
new file mode 100644
index 0000000000..ef5d2ade88
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0120-v3d-handle-Z-clipping-in-v71.patch
@@ -0,0 +1,39 @@
+From ecc1a5fa6b09a684a1e831c342121ec417f1a101 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 22 Oct 2021 14:26:29 +0200
+Subject: [PATCH 120/142] v3d: handle Z clipping in v71
+
+---
+ src/gallium/drivers/v3d/v3dx_emit.c | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
+index 58c886bb29e..75751dc9ab6 100644
+--- a/src/gallium/drivers/v3d/v3dx_emit.c
++++ b/src/gallium/drivers/v3d/v3dx_emit.c
+@@ -539,8 +539,21 @@ v3dX(emit_state)(struct pipe_context *pctx)
+                                 v3d_line_smoothing_enabled(v3d) ?
+                                 V3D_LINE_RASTERIZATION_PERP_END_CAPS :
+                                 V3D_LINE_RASTERIZATION_DIAMOND_EXIT;
+-                }
+ 
++#if V3D_VERSION >= 71
++                        /* This follows the logic implemented in v3dv, plus
++                         * the definitions of depth_clip_near/far and
++                         * depth_clamp.
++                         *
++                         * Note: some extensions that would affect this
++                         * (like ARB_depth_clamp) are not supported by v3d,
++                         * but the rasterizer values already take that into
++                         * account.
++                         */
++                        config.z_clipping_mode = v3d->rasterizer->base.depth_clip_near ||
++                           v3d->rasterizer->base.depth_clip_far;
++#endif
++                }
+         }
+ 
+         if (v3d->dirty & V3D_DIRTY_RASTERIZER &&
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0121-v3d-add-support-for-TFU-blit-in-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0121-v3d-add-support-for-TFU-blit-in-v71.patch
new file mode 100644
index 0000000000..8275072cbe
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0121-v3d-add-support-for-TFU-blit-in-v71.patch
@@ -0,0 +1,446 @@
+From ecac3d8441b75011446b566320194df17beba352 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Wed, 27 Oct 2021 02:03:10 +0200
+Subject: [PATCH 121/142] v3d: add support for TFU blit in v71
+
+The TFU has changed in v71, especially in which registers it uses, so
+the support code differs across versions. As part of this commit, the
+TFU copy code is moved to a v3dx file.
+---
+ src/gallium/drivers/v3d/meson.build    |   1 +
+ src/gallium/drivers/v3d/v3d_blit.c     | 164 +++-----------------
+ src/gallium/drivers/v3d/v3dx_context.h |  10 ++
+ src/gallium/drivers/v3d/v3dx_tfu.c     | 202 +++++++++++++++++++++++++
+ 4 files changed, 232 insertions(+), 145 deletions(-)
+ create mode 100644 src/gallium/drivers/v3d/v3dx_tfu.c
+
+diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
+index 526a131ae9b..b2e748573b7 100644
+--- a/src/gallium/drivers/v3d/meson.build
++++ b/src/gallium/drivers/v3d/meson.build
+@@ -49,6 +49,7 @@ files_per_version = files(
+   'v3dx_job.c',
+   'v3dx_rcl.c',
+   'v3dx_state.c',
++  'v3dx_tfu.c',
+ )
+ 
+ v3d_args = ['-DV3D_BUILD_NEON']
+diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
+index 0260bdde6d1..96179f654a4 100644
+--- a/src/gallium/drivers/v3d/v3d_blit.c
++++ b/src/gallium/drivers/v3d/v3d_blit.c
+@@ -210,140 +210,6 @@ v3d_stencil_blit(struct pipe_context *ctx, struct pipe_blit_info *info)
+         info->mask &= ~PIPE_MASK_S;
+ }
+ 
+-static bool
+-v3d_tfu(struct pipe_context *pctx,
+-        struct pipe_resource *pdst,
+-        struct pipe_resource *psrc,
+-        unsigned int src_level,
+-        unsigned int base_level,
+-        unsigned int last_level,
+-        unsigned int src_layer,
+-        unsigned int dst_layer,
+-        bool for_mipmap)
+-{
+-        struct v3d_context *v3d = v3d_context(pctx);
+-        struct v3d_screen *screen = v3d->screen;
+-        struct v3d_resource *src = v3d_resource(psrc);
+-        struct v3d_resource *dst = v3d_resource(pdst);
+-        struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
+-        struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
+-        int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
+-        int width = u_minify(pdst->width0, base_level) * msaa_scale;
+-        int height = u_minify(pdst->height0, base_level) * msaa_scale;
+-        enum pipe_format pformat;
+-
+-        if (psrc->format != pdst->format)
+-                return false;
+-        if (psrc->nr_samples != pdst->nr_samples)
+-                return false;
+-
+-        /* Can't write to raster. */
+-        if (dst_base_slice->tiling == V3D_TILING_RASTER)
+-                return false;
+-
+-        /* When using TFU for blit, we are doing exact copies (both input and
+-         * output format must be the same, no scaling, etc), so there is no
+-         * pixel format conversions. Thus we can rewrite the format to use one
+-         * that is TFU compatible based on its texel size.
+-         */
+-        if (for_mipmap) {
+-                pformat = pdst->format;
+-        } else {
+-                switch (dst->cpp) {
+-                case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT;   break;
+-                case 8:  pformat = PIPE_FORMAT_R16G16B16A16_FLOAT;   break;
+-                case 4:  pformat = PIPE_FORMAT_R32_FLOAT;            break;
+-                case 2:  pformat = PIPE_FORMAT_R16_FLOAT;            break;
+-                case 1:  pformat = PIPE_FORMAT_R8_UNORM;             break;
+-                default: unreachable("unsupported format bit-size"); break;
+-                };
+-        }
+-
+-        uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
+-        struct v3d_device_info *devinfo = &screen->devinfo;
+-
+-        if (!v3d_X(devinfo, tfu_supports_tex_format)(tex_format, for_mipmap)) {
+-                assert(for_mipmap);
+-                return false;
+-        }
+-
+-        v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
+-        v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
+-
+-        struct drm_v3d_submit_tfu tfu = {
+-                .ios = (height << 16) | width,
+-                .bo_handles = {
+-                        dst->bo->handle,
+-                        src != dst ? src->bo->handle : 0
+-                },
+-                .in_sync = v3d->out_sync,
+-                .out_sync = v3d->out_sync,
+-        };
+-        uint32_t src_offset = (src->bo->offset +
+-                               v3d_layer_offset(psrc, src_level, src_layer));
+-        tfu.iia |= src_offset;
+-        if (src_base_slice->tiling == V3D_TILING_RASTER) {
+-                tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
+-                             V3D33_TFU_ICFG_FORMAT_SHIFT);
+-        } else {
+-                tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
+-                              (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+-                             V3D33_TFU_ICFG_FORMAT_SHIFT);
+-        }
+-
+-        uint32_t dst_offset = (dst->bo->offset +
+-                               v3d_layer_offset(pdst, base_level, dst_layer));
+-        tfu.ioa |= dst_offset;
+-        if (last_level != base_level)
+-                tfu.ioa |= V3D33_TFU_IOA_DIMTW;
+-        tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
+-                     (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
+-                    V3D33_TFU_IOA_FORMAT_SHIFT);
+-
+-        tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
+-        tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
+-
+-        switch (src_base_slice->tiling) {
+-        case V3D_TILING_UIF_NO_XOR:
+-        case V3D_TILING_UIF_XOR:
+-                tfu.iis |= (src_base_slice->padded_height /
+-                            (2 * v3d_utile_height(src->cpp)));
+-                break;
+-        case V3D_TILING_RASTER:
+-                tfu.iis |= src_base_slice->stride / src->cpp;
+-                break;
+-        case V3D_TILING_LINEARTILE:
+-        case V3D_TILING_UBLINEAR_1_COLUMN:
+-        case V3D_TILING_UBLINEAR_2_COLUMN:
+-                break;
+-       }
+-
+-        /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
+-         * OPAD field for the destination (how many extra UIF blocks beyond
+-         * those necessary to cover the height).  When filling mipmaps, the
+-         * miplevel 1+ tiling state is inferred.
+-         */
+-        if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
+-            dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
+-                int uif_block_h = 2 * v3d_utile_height(dst->cpp);
+-                int implicit_padded_height = align(height, uif_block_h);
+-
+-                tfu.icfg |= (((dst_base_slice->padded_height -
+-                               implicit_padded_height) / uif_block_h) <<
+-                             V3D33_TFU_ICFG_OPAD_SHIFT);
+-        }
+-
+-        int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
+-        if (ret != 0) {
+-                fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
+-                return false;
+-        }
+-
+-        dst->writes++;
+-
+-        return true;
+-}
+-
+ bool
+ v3d_generate_mipmap(struct pipe_context *pctx,
+                     struct pipe_resource *prsc,
+@@ -362,12 +228,16 @@ v3d_generate_mipmap(struct pipe_context *pctx,
+         if (first_layer != last_layer)
+                 return false;
+ 
+-        return v3d_tfu(pctx,
+-                       prsc, prsc,
+-                       base_level,
+-                       base_level, last_level,
+-                       first_layer, first_layer,
+-                       true);
++        struct v3d_context *v3d = v3d_context(pctx);
++        struct v3d_screen *screen = v3d->screen;
++        struct v3d_device_info *devinfo = &screen->devinfo;
++
++        return v3d_X(devinfo, tfu)(pctx,
++                                   prsc, prsc,
++                                   base_level,
++                                   base_level, last_level,
++                                   first_layer, first_layer,
++                                   true);
+ }
+ 
+ static void
+@@ -396,11 +266,15 @@ v3d_tfu_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
+         if (info->dst.format != info->src.format)
+                 return;
+ 
+-        if (v3d_tfu(pctx, info->dst.resource, info->src.resource,
+-                    info->src.level,
+-                    info->dst.level, info->dst.level,
+-                    info->src.box.z, info->dst.box.z,
+-                    false)) {
++        struct v3d_context *v3d = v3d_context(pctx);
++        struct v3d_screen *screen = v3d->screen;
++        struct v3d_device_info *devinfo = &screen->devinfo;
++
++        if (v3d_X(devinfo, tfu)(pctx, info->dst.resource, info->src.resource,
++                                info->src.level,
++                                info->dst.level, info->dst.level,
++                                info->src.box.z, info->dst.box.z,
++                                false)) {
+                 info->mask &= ~PIPE_MASK_RGBA;
+         }
+ }
+diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h
+index 03d7c244ea2..e0a5cbfb2f3 100644
+--- a/src/gallium/drivers/v3d/v3dx_context.h
++++ b/src/gallium/drivers/v3d/v3dx_context.h
+@@ -51,3 +51,13 @@ void v3dX(get_internal_type_bpp_for_output_format)(uint32_t format,
+  */
+ bool v3dX(tfu_supports_tex_format)(uint32_t tex_format,
+                                    bool for_mipmap);
++
++bool v3dX(tfu)(struct pipe_context *pctx,
++               struct pipe_resource *pdst,
++               struct pipe_resource *psrc,
++               unsigned int src_level,
++               unsigned int base_level,
++               unsigned int last_level,
++               unsigned int src_layer,
++               unsigned int dst_layer,
++               bool for_mipmap);
+diff --git a/src/gallium/drivers/v3d/v3dx_tfu.c b/src/gallium/drivers/v3d/v3dx_tfu.c
+new file mode 100644
+index 00000000000..d6b51390a11
+--- /dev/null
++++ b/src/gallium/drivers/v3d/v3dx_tfu.c
+@@ -0,0 +1,202 @@
++/*
++ * Copyright © 2021 Broadcom
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "v3d_context.h"
++#include "broadcom/common/v3d_tfu.h"
++
++bool
++v3dX(tfu)(struct pipe_context *pctx,
++          struct pipe_resource *pdst,
++          struct pipe_resource *psrc,
++          unsigned int src_level,
++          unsigned int base_level,
++          unsigned int last_level,
++          unsigned int src_layer,
++          unsigned int dst_layer,
++          bool for_mipmap)
++{
++        struct v3d_context *v3d = v3d_context(pctx);
++        struct v3d_screen *screen = v3d->screen;
++        struct v3d_resource *src = v3d_resource(psrc);
++        struct v3d_resource *dst = v3d_resource(pdst);
++        struct v3d_resource_slice *src_base_slice = &src->slices[src_level];
++        struct v3d_resource_slice *dst_base_slice = &dst->slices[base_level];
++        int msaa_scale = pdst->nr_samples > 1 ? 2 : 1;
++        int width = u_minify(pdst->width0, base_level) * msaa_scale;
++        int height = u_minify(pdst->height0, base_level) * msaa_scale;
++        enum pipe_format pformat;
++
++        if (psrc->format != pdst->format)
++                return false;
++        if (psrc->nr_samples != pdst->nr_samples)
++                return false;
++
++        if (pdst->target != PIPE_TEXTURE_2D || psrc->target != PIPE_TEXTURE_2D)
++                return false;
++
++        /* Can't write to raster. */
++        if (dst_base_slice->tiling == V3D_TILING_RASTER)
++                return false;
++
++        /* When using TFU for blit, we are doing exact copies (both input and
++         * output format must be the same, no scaling, etc), so there is no
++         * pixel format conversions. Thus we can rewrite the format to use one
++         * that is TFU compatible based on its texel size.
++         */
++        if (for_mipmap) {
++                pformat = pdst->format;
++        } else {
++                switch (dst->cpp) {
++                case 16: pformat = PIPE_FORMAT_R32G32B32A32_FLOAT;   break;
++                case 8:  pformat = PIPE_FORMAT_R16G16B16A16_FLOAT;   break;
++                case 4:  pformat = PIPE_FORMAT_R32_FLOAT;            break;
++                case 2:  pformat = PIPE_FORMAT_R16_FLOAT;            break;
++                case 1:  pformat = PIPE_FORMAT_R8_UNORM;             break;
++                default: unreachable("unsupported format bit-size"); break;
++                };
++        }
++
++        uint32_t tex_format = v3d_get_tex_format(&screen->devinfo, pformat);
++
++        if (!v3dX(tfu_supports_tex_format)(tex_format, for_mipmap)) {
++                assert(for_mipmap);
++                return false;
++        }
++
++        v3d_flush_jobs_writing_resource(v3d, psrc, V3D_FLUSH_DEFAULT, false);
++        v3d_flush_jobs_reading_resource(v3d, pdst, V3D_FLUSH_DEFAULT, false);
++
++        struct drm_v3d_submit_tfu tfu = {
++                .ios = (height << 16) | width,
++                .bo_handles = {
++                        dst->bo->handle,
++                        src != dst ? src->bo->handle : 0
++                },
++                .in_sync = v3d->out_sync,
++                .out_sync = v3d->out_sync,
++        };
++        uint32_t src_offset = (src->bo->offset +
++                               v3d_layer_offset(psrc, src_level, src_layer));
++        tfu.iia |= src_offset;
++
++        uint32_t dst_offset = (dst->bo->offset +
++                               v3d_layer_offset(pdst, base_level, dst_layer));
++        tfu.ioa |= dst_offset;
++
++        switch (src_base_slice->tiling) {
++        case V3D_TILING_UIF_NO_XOR:
++        case V3D_TILING_UIF_XOR:
++                tfu.iis |= (src_base_slice->padded_height /
++                            (2 * v3d_utile_height(src->cpp)));
++                break;
++        case V3D_TILING_RASTER:
++                tfu.iis |= src_base_slice->stride / src->cpp;
++                break;
++        case V3D_TILING_LINEARTILE:
++        case V3D_TILING_UBLINEAR_1_COLUMN:
++        case V3D_TILING_UBLINEAR_2_COLUMN:
++                break;
++       }
++
++#if V3D_VERSION <= 42
++        if (src_base_slice->tiling == V3D_TILING_RASTER) {
++                tfu.icfg |= (V3D33_TFU_ICFG_FORMAT_RASTER <<
++                             V3D33_TFU_ICFG_FORMAT_SHIFT);
++        } else {
++                tfu.icfg |= ((V3D33_TFU_ICFG_FORMAT_LINEARTILE +
++                              (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
++                             V3D33_TFU_ICFG_FORMAT_SHIFT);
++        }
++        tfu.icfg |= tex_format << V3D33_TFU_ICFG_TTYPE_SHIFT;
++
++        if (last_level != base_level)
++                tfu.ioa |= V3D33_TFU_IOA_DIMTW;
++
++        tfu.ioa |= ((V3D33_TFU_IOA_FORMAT_LINEARTILE +
++                     (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
++                    V3D33_TFU_IOA_FORMAT_SHIFT);
++
++        tfu.icfg |= (last_level - base_level) << V3D33_TFU_ICFG_NUMMM_SHIFT;
++
++        /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
++         * OPAD field for the destination (how many extra UIF blocks beyond
++         * those necessary to cover the height).  When filling mipmaps, the
++         * miplevel 1+ tiling state is inferred.
++         */
++        if (dst_base_slice->tiling == V3D_TILING_UIF_NO_XOR ||
++            dst_base_slice->tiling == V3D_TILING_UIF_XOR) {
++                int uif_block_h = 2 * v3d_utile_height(dst->cpp);
++                int implicit_padded_height = align(height, uif_block_h);
++
++                tfu.icfg |= (((dst_base_slice->padded_height -
++                               implicit_padded_height) / uif_block_h) <<
++                             V3D33_TFU_ICFG_OPAD_SHIFT);
++        }
++#endif /* V3D_VERSION <= 42 */
++
++#if V3D_VERSION >= 71
++        if (src_base_slice->tiling == V3D_TILING_RASTER) {
++                tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
++        } else {
++                tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
++                            (src_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
++                        V3D71_TFU_ICFG_IFORMAT_SHIFT;
++        }
++        tfu.icfg |= tex_format << V3D71_TFU_ICFG_OTYPE_SHIFT;
++
++        if (last_level != base_level)
++                tfu.v71.ioc |= V3D71_TFU_IOC_DIMTW;
++
++        tfu.v71.ioc |= ((V3D71_TFU_IOC_FORMAT_LINEARTILE +
++                         (dst_base_slice->tiling - V3D_TILING_LINEARTILE)) <<
++                        V3D71_TFU_IOC_FORMAT_SHIFT);
++
++        switch (dst_base_slice->tiling) {
++        case V3D_TILING_UIF_NO_XOR:
++        case V3D_TILING_UIF_XOR:
++                tfu.v71.ioc |=
++                        (dst_base_slice->padded_height / (2 * v3d_utile_height(dst->cpp))) <<
++                        V3D71_TFU_IOC_STRIDE_SHIFT;
++                break;
++        case V3D_TILING_RASTER:
++                tfu.v71.ioc |= (dst_base_slice->padded_height / dst->cpp) <<
++                        V3D71_TFU_IOC_STRIDE_SHIFT;
++                break;
++        default:
++                break;
++        }
++
++        tfu.v71.ioc |= (last_level - base_level) << V3D71_TFU_IOC_NUMMM_SHIFT;
++#endif /* V3D_VERSION >= 71*/
++
++        int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_TFU, &tfu);
++        if (ret != 0) {
++                fprintf(stderr, "Failed to submit TFU job: %d\n", ret);
++                return false;
++        }
++
++        dst->writes++;
++
++        return true;
++}
++
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0122-v3d-v3dv-fix-texture-state-array-stride-packing-for-.patch b/projects/RPi/devices/RPi5/patches/mesa/0122-v3d-v3dv-fix-texture-state-array-stride-packing-for-.patch
new file mode 100644
index 0000000000..105a224f18
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0122-v3d-v3dv-fix-texture-state-array-stride-packing-for-.patch
@@ -0,0 +1,91 @@
+From ed7e118a6cc0c9bba9f02929e98bc51252331950 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 16 May 2023 00:28:27 +0200
+Subject: [PATCH 122/142] v3d/v3dv: fix texture state array stride packing for
+ V3D 7.1.5
+
+---
+ src/broadcom/vulkan/v3dvx_image.c    |  7 +++++++
+ src/gallium/drivers/v3d/v3dx_state.c | 20 +++++++++++++++-----
+ 2 files changed, 22 insertions(+), 5 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
+index 437d4588c7e..ae6eaa88d0c 100644
+--- a/src/broadcom/vulkan/v3dvx_image.c
++++ b/src/broadcom/vulkan/v3dvx_image.c
+@@ -118,6 +118,13 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
+ #endif
+ #if V3D_VERSION >= 71
+          tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
++
++         /* V3D 7.1.5 has array stride starting one bit later than previous
++          * V3D versions to make room for the new RB swap bit, but we don't
++          * handle that in the CLE parser.
++          */
++         if (device->devinfo.rev >= 5)
++            tex.array_stride_64_byte_aligned <<= 1;
+ #endif
+ 
+          /* At this point we don't have the job. That's the reason the first
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index 348a7bcf3da..88e57cd072b 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -889,7 +889,8 @@ v3d_setup_texture_shader_state_from_buffer(struct V3DX(TEXTURE_SHADER_STATE) *te
+ }
+ 
+ static void
+-v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
++v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo,
++                               struct V3DX(TEXTURE_SHADER_STATE) *tex,
+                                struct pipe_resource *prsc,
+                                int base_level, int last_level,
+                                int first_layer, int last_layer,
+@@ -949,15 +950,22 @@ v3d_setup_texture_shader_state(struct V3DX(TEXTURE_SHADER_STATE) *tex,
+ 
+         tex->texture_base_pointer = cl_address(NULL, base_offset);
+ #endif
++
++        tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
++
+ #if V3D_VERSION >= 71
+         tex->chroma_offset_x = 1;
+         tex->chroma_offset_y = 1;
+         /* See comment in XML field definition for rationale of the shifts */
+         tex->texture_base_pointer_cb = base_offset >> 6;
+         tex->texture_base_pointer_cr = base_offset >> 6;
+-#endif
+ 
+-        tex->array_stride_64_byte_aligned = rsc->cube_map_stride / 64;
++        /* V3D 7.1.5 has array stride start at bit 33 instead of bit 32 to
++         * make room for the RB swap bit.
++         */
++        if (devinfo->rev >= 5)
++                tex->array_stride_64_byte_aligned <<= 1;
++#endif
+ 
+         /* Since other platform devices may produce UIF images even
+          * when they're not big enough for V3D to assume they're UIF,
+@@ -1006,7 +1014,8 @@ v3dX(create_texture_shader_state_bo)(struct v3d_context *v3d,
+ 
+         v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
+                 if (prsc->target != PIPE_BUFFER) {
+-                        v3d_setup_texture_shader_state(&tex, prsc,
++                        v3d_setup_texture_shader_state(&v3d->screen->devinfo,
++                                                       &tex, prsc,
+                                                        cso->u.tex.first_level,
+                                                        cso->u.tex.last_level,
+                                                        cso->u.tex.first_layer,
+@@ -1442,7 +1451,8 @@ v3d_create_image_view_texture_shader_state(struct v3d_context *v3d,
+ 
+         v3dx_pack(map, TEXTURE_SHADER_STATE, tex) {
+                 if (prsc->target != PIPE_BUFFER) {
+-                        v3d_setup_texture_shader_state(&tex, prsc,
++                        v3d_setup_texture_shader_state(&v3d->screen->devinfo,
++                                                       &tex, prsc,
+                                                        iview->base.u.tex.level,
+                                                        iview->base.u.tex.level,
+                                                        iview->base.u.tex.first_layer,
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0123-v3d-v3dv-support-up-to-8-render-targets-in-v7.1.patch b/projects/RPi/devices/RPi5/patches/mesa/0123-v3d-v3dv-support-up-to-8-render-targets-in-v7.1.patch
new file mode 100644
index 0000000000..19cffa9495
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0123-v3d-v3dv-support-up-to-8-render-targets-in-v7.1.patch
@@ -0,0 +1,499 @@
+From 48893b056a07b7eda4fe3dea7f068c403981b621 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Fri, 12 Nov 2021 10:35:59 +0100
+Subject: [PATCH 123/142] v3d,v3dv: support up to 8 render targets in v7.1+
+
+---
+ src/broadcom/common/v3d_limits.h       |  3 +-
+ src/broadcom/common/v3d_util.c         | 49 ++++++++++++++++++++++++--
+ src/broadcom/common/v3d_util.h         |  6 ++--
+ src/broadcom/compiler/nir_to_vir.c     | 10 +++---
+ src/broadcom/vulkan/v3dv_cmd_buffer.c  |  5 +--
+ src/broadcom/vulkan/v3dv_device.c      |  6 ++--
+ src/broadcom/vulkan/v3dv_limits.h      |  2 --
+ src/broadcom/vulkan/v3dv_meta_clear.c  |  8 +++--
+ src/broadcom/vulkan/v3dv_pass.c        |  6 ++--
+ src/broadcom/vulkan/v3dv_pipeline.c    |  4 ++-
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c |  7 ++--
+ src/broadcom/vulkan/v3dvx_device.c     |  1 -
+ src/gallium/drivers/v3d/v3d_blit.c     |  2 +-
+ src/gallium/drivers/v3d/v3d_context.c  |  5 +--
+ src/gallium/drivers/v3d/v3d_context.h  |  3 +-
+ src/gallium/drivers/v3d/v3d_job.c      |  6 ++--
+ src/gallium/drivers/v3d/v3d_screen.c   |  3 +-
+ src/gallium/drivers/v3d/v3dx_emit.c    | 14 +++++---
+ src/gallium/drivers/v3d/v3dx_state.c   |  5 +--
+ 19 files changed, 104 insertions(+), 41 deletions(-)
+
+diff --git a/src/broadcom/common/v3d_limits.h b/src/broadcom/common/v3d_limits.h
+index 46f38bd7484..354c8784914 100644
+--- a/src/broadcom/common/v3d_limits.h
++++ b/src/broadcom/common/v3d_limits.h
+@@ -42,7 +42,8 @@
+ 
+ #define V3D_MAX_SAMPLES 4
+ 
+-#define V3D_MAX_DRAW_BUFFERS 4
++#define V3D_MAX_DRAW_BUFFERS 8
++#define V3D_MAX_RENDER_TARGETS(ver) (ver < 71 ? 4 : 8)
+ 
+ #define V3D_MAX_POINT_SIZE 512.0f
+ #define V3D_MAX_LINE_WIDTH 32
+diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
+index 26f5c6b336f..209a5eceaa1 100644
+--- a/src/broadcom/common/v3d_util.c
++++ b/src/broadcom/common/v3d_util.c
+@@ -88,8 +88,10 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
+ }
+ 
+ void
+-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
+-                     bool msaa, bool double_buffer,
++v3d_choose_tile_size(const struct v3d_device_info *devinfo,
++                     uint32_t color_attachment_count,
++                     uint32_t max_color_bpp, bool msaa,
++                     bool double_buffer,
+                      uint32_t *width, uint32_t *height)
+ {
+    static const uint8_t tile_sizes[] = {
+@@ -103,7 +105,9 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
+    };
+ 
+    uint32_t idx = 0;
+-   if (color_attachment_count > 2)
++   if (color_attachment_count > 4)
++      idx += 3;
++   else if (color_attachment_count > 2)
+       idx += 2;
+    else if (color_attachment_count > 1)
+       idx += 1;
+@@ -117,6 +121,45 @@ v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
+ 
+    idx += max_color_bpp;
+ 
++   if (devinfo->ver >= 71) {
++      /* In V3D 7.x the TLB has an auxiliary buffer of 8KB that will be
++       * automatically used for depth instead of the main 16KB depth TLB buffer
++       * when the depth tile fits in the auxiliary buffer, allowing the hardware
++       * to allocate the 16KB from the main depth TLB to the color TLB. If
++       * we can do that, then we are effectively doubling the memory we have
++       * for color and we can increase our tile dimensions by a factor of 2
++       * (reduce idx by 1).
++       *
++       * If we have computed a tile size that would be smaller than the minimum
++       * of 8x8, then it is certain that depth will fit in the aux depth TLB
++       * (even in MSAA mode).
++       *
++       * Otherwise, we need check if we can fit depth in the aux TLB buffer
++       * using a larger tile size.
++       *
++       * FIXME: the docs state that depth TLB memory can be used for color
++       * if depth testing is not used by setting the 'depth disable' bit in the
++       * rendering configuration. However, this comes with a requirement that
++       * occlussion queries must not be active. We need to clarify if this means
++       * active at the point at which we emit a tile rendering configuration
++       * item, meaning that the we have a query spanning a full render pass
++       * (this is something we can tell before we emit the rendering
++       * configuration item) or active in the subpass for which we are enabling
++       * the bit (which we can't tell until later, when we record commands for
++       * the subpass). If it is the latter, then we cannot use this feature.
++       */
++      if (idx >= ARRAY_SIZE(tile_sizes) / 2) {
++         idx--;
++      } else if (idx > 0) {
++         /* Depth is always 32bpp (4x32bpp for 4x MSAA) */
++         uint32_t depth_bpp = !msaa ? 4 : 16;
++         uint32_t tile_w = tile_sizes[(idx - 1) * 2];
++         uint32_t tile_h = tile_sizes[(idx - 1) * 2 + 1];
++         if (tile_w * tile_h * depth_bpp <= 8192)
++            idx--;
++      }
++   }
++
+    assert(idx < ARRAY_SIZE(tile_sizes) / 2);
+ 
+    *width = tile_sizes[idx * 2];
+diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
+index 864fc949ffa..5a7e244a0a5 100644
+--- a/src/broadcom/common/v3d_util.h
++++ b/src/broadcom/common/v3d_util.h
+@@ -37,8 +37,10 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
+                                          uint32_t wg_size);
+ 
+ void
+-v3d_choose_tile_size(uint32_t color_attachment_count, uint32_t max_color_bpp,
+-                     bool msaa, bool double_buffer,
++v3d_choose_tile_size(const struct v3d_device_info *devinfo,
++                     uint32_t color_attachment_count,
++                     uint32_t max_color_bpp, bool msaa,
++                     bool double_buffer,
+                      uint32_t *width, uint32_t *height);
+ 
+ uint32_t
+diff --git a/src/broadcom/compiler/nir_to_vir.c b/src/broadcom/compiler/nir_to_vir.c
+index a8cf02dd386..531e85a1212 100644
+--- a/src/broadcom/compiler/nir_to_vir.c
++++ b/src/broadcom/compiler/nir_to_vir.c
+@@ -2483,15 +2483,17 @@ ntq_setup_outputs(struct v3d_compile *c)
+ 
+                 switch (var->data.location) {
+                 case FRAG_RESULT_COLOR:
+-                        c->output_color_var[0] = var;
+-                        c->output_color_var[1] = var;
+-                        c->output_color_var[2] = var;
+-                        c->output_color_var[3] = var;
++                        for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
++                                c->output_color_var[i] = var;
+                         break;
+                 case FRAG_RESULT_DATA0:
+                 case FRAG_RESULT_DATA1:
+                 case FRAG_RESULT_DATA2:
+                 case FRAG_RESULT_DATA3:
++                case FRAG_RESULT_DATA4:
++                case FRAG_RESULT_DATA5:
++                case FRAG_RESULT_DATA6:
++                case FRAG_RESULT_DATA7:
+                         c->output_color_var[var->data.location -
+                                             FRAG_RESULT_DATA0] = var;
+                         break;
+diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+index bda0a614523..11d161b19b7 100644
+--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+@@ -365,7 +365,8 @@ job_compute_frame_tiling(struct v3dv_job *job,
+    /* Double-buffer is incompatible with MSAA */
+    assert(!tiling->msaa || !tiling->double_buffer);
+ 
+-   v3d_choose_tile_size(render_target_count, max_internal_bpp,
++   v3d_choose_tile_size(&job->device->devinfo,
++                        render_target_count, max_internal_bpp,
+                         tiling->msaa, tiling->double_buffer,
+                         &tiling->tile_width, &tiling->tile_height);
+ 
+@@ -1374,7 +1375,7 @@ cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
+    }
+ 
+    uint32_t att_count = 0;
+-   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* 4 color + D/S */
++   VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
+ 
+    /* We only need to emit subpass clears as draw calls for color attachments
+     * if the render area is not aligned to tile boundaries.
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index 01e2dd7ac2d..19e58542414 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -1366,6 +1366,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
+    const VkSampleCountFlags supported_sample_counts =
+       VK_SAMPLE_COUNT_1_BIT | VK_SAMPLE_COUNT_4_BIT;
+ 
++   const uint8_t max_rts = V3D_MAX_RENDER_TARGETS(pdevice->devinfo.ver);
++
+    struct timespec clock_res;
+    clock_getres(CLOCK_MONOTONIC, &clock_res);
+    const float timestamp_period =
+@@ -1436,7 +1438,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
+       .maxFragmentInputComponents               = max_varying_components,
+       .maxFragmentOutputAttachments             = 4,
+       .maxFragmentDualSrcAttachments            = 0,
+-      .maxFragmentCombinedOutputResources       = MAX_RENDER_TARGETS +
++      .maxFragmentCombinedOutputResources       = max_rts +
+                                                   MAX_STORAGE_BUFFERS +
+                                                   MAX_STORAGE_IMAGES,
+ 
+@@ -1476,7 +1478,7 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
+       .framebufferDepthSampleCounts             = supported_sample_counts,
+       .framebufferStencilSampleCounts           = supported_sample_counts,
+       .framebufferNoAttachmentsSampleCounts     = supported_sample_counts,
+-      .maxColorAttachments                      = MAX_RENDER_TARGETS,
++      .maxColorAttachments                      = max_rts,
+       .sampledImageColorSampleCounts            = supported_sample_counts,
+       .sampledImageIntegerSampleCounts          = supported_sample_counts,
+       .sampledImageDepthSampleCounts            = supported_sample_counts,
+diff --git a/src/broadcom/vulkan/v3dv_limits.h b/src/broadcom/vulkan/v3dv_limits.h
+index 9cda9f0d6d2..8ac99724105 100644
+--- a/src/broadcom/vulkan/v3dv_limits.h
++++ b/src/broadcom/vulkan/v3dv_limits.h
+@@ -50,8 +50,6 @@
+ #define MAX_DYNAMIC_BUFFERS (MAX_DYNAMIC_UNIFORM_BUFFERS + \
+                              MAX_DYNAMIC_STORAGE_BUFFERS)
+ 
+-#define MAX_RENDER_TARGETS 4
+-
+ #define MAX_MULTIVIEW_VIEW_COUNT 16
+ 
+ /* These are tunable parameters in the HW design, but all the V3D
+diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c
+index d376c179e1c..0a7905b49d5 100644
+--- a/src/broadcom/vulkan/v3dv_meta_clear.c
++++ b/src/broadcom/vulkan/v3dv_meta_clear.c
+@@ -747,7 +747,7 @@ get_color_clear_pipeline_cache_key(uint32_t rt_idx,
+    uint32_t bit_offset = 0;
+ 
+    key |= rt_idx;
+-   bit_offset += 2;
++   bit_offset += 3;
+ 
+    key |= ((uint64_t) format) << bit_offset;
+    bit_offset += 32;
+@@ -1189,9 +1189,11 @@ v3dv_CmdClearAttachments(VkCommandBuffer commandBuffer,
+ {
+    V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
+ 
+-   /* We can only clear attachments in the current subpass */
+-   assert(attachmentCount <= 5); /* 4 color + D/S */
++   /* We can have at most max_color_RTs + 1 D/S attachments */
++   assert(attachmentCount <=
++          V3D_MAX_RENDER_TARGETS(cmd_buffer->device->devinfo.ver) + 1);
+ 
++   /* We can only clear attachments in the current subpass */
+    struct v3dv_render_pass *pass = cmd_buffer->state.pass;
+ 
+    assert(cmd_buffer->state.subpass_idx < pass->subpass_count);
+diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
+index 3e82c15df88..7f2e2bbc710 100644
+--- a/src/broadcom/vulkan/v3dv_pass.c
++++ b/src/broadcom/vulkan/v3dv_pass.c
+@@ -322,11 +322,11 @@ subpass_get_granularity(struct v3dv_device *device,
+    /* Granularity is defined by the tile size */
+    assert(subpass_idx < pass->subpass_count);
+    struct v3dv_subpass *subpass = &pass->subpasses[subpass_idx];
+-   const uint32_t color_attachment_count = subpass->color_count;
++   const uint32_t color_count = subpass->color_count;
+ 
+    bool msaa = false;
+    uint32_t max_bpp = 0;
+-   for (uint32_t i = 0; i < color_attachment_count; i++) {
++   for (uint32_t i = 0; i < color_count; i++) {
+       uint32_t attachment_idx = subpass->color_attachments[i].attachment;
+       if (attachment_idx == VK_ATTACHMENT_UNUSED)
+          continue;
+@@ -349,7 +349,7 @@ subpass_get_granularity(struct v3dv_device *device,
+     * heuristics so we choose a conservative granularity here, with it disabled.
+     */
+    uint32_t width, height;
+-   v3d_choose_tile_size(color_attachment_count, max_bpp, msaa,
++   v3d_choose_tile_size(&device->devinfo, color_count, max_bpp, msaa,
+                         false /* double-buffer */, &width, &height);
+    *granularity = (VkExtent2D) {
+       .width = width,
+diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
+index 2156176d4cc..3bcdcc9a853 100644
+--- a/src/broadcom/vulkan/v3dv_pipeline.c
++++ b/src/broadcom/vulkan/v3dv_pipeline.c
+@@ -2632,6 +2632,7 @@ pipeline_init_dynamic_state(
+    const VkPipelineColorWriteCreateInfoEXT *pColorWriteState)
+ {
+    /* Initialize to default values */
++   const struct v3d_device_info *devinfo = &pipeline->device->devinfo;
+    struct v3dv_dynamic_state *dynamic = &pipeline->dynamic_state;
+    memset(dynamic, 0, sizeof(*dynamic));
+    dynamic->stencil_compare_mask.front = ~0;
+@@ -2639,7 +2640,8 @@ pipeline_init_dynamic_state(
+    dynamic->stencil_write_mask.front = ~0;
+    dynamic->stencil_write_mask.back = ~0;
+    dynamic->line_width = 1.0f;
+-   dynamic->color_write_enable = (1ull << (4 * V3D_MAX_DRAW_BUFFERS)) - 1;
++   dynamic->color_write_enable =
++      (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1;
+ 
+    /* Create a mask of enabled dynamic states */
+    uint32_t dynamic_states = 0;
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 3566649aafd..bf5e47018e8 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1550,10 +1550,13 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
+    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+    assert(pipeline);
+ 
++   const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
++   const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
++
+    const uint32_t blend_packets_size =
+       cl_packet_length(BLEND_ENABLES) +
+       cl_packet_length(BLEND_CONSTANT_COLOR) +
+-      cl_packet_length(BLEND_CFG) * V3D_MAX_DRAW_BUFFERS;
++      cl_packet_length(BLEND_CFG) * max_color_rts;
+ 
+    v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
+    v3dv_return_if_oom(cmd_buffer, NULL);
+@@ -1565,7 +1568,7 @@ v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
+          }
+       }
+ 
+-      for (uint32_t i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
++      for (uint32_t i = 0; i < max_color_rts; i++) {
+          if (pipeline->blend.enables & (1 << i))
+             cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
+       }
+diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
+index 72daefadb08..4d17a2691a5 100644
+--- a/src/broadcom/vulkan/v3dvx_device.c
++++ b/src/broadcom/vulkan/v3dvx_device.c
+@@ -49,7 +49,6 @@ vk_to_v3d_compare_func[] = {
+    [VK_COMPARE_OP_ALWAYS]                       = V3D_COMPARE_FUNC_ALWAYS,
+ };
+ 
+-
+ static union pipe_color_union encode_border_color(
+    const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
+ {
+diff --git a/src/gallium/drivers/v3d/v3d_blit.c b/src/gallium/drivers/v3d/v3d_blit.c
+index 96179f654a4..51ddc292ff7 100644
+--- a/src/gallium/drivers/v3d/v3d_blit.c
++++ b/src/gallium/drivers/v3d/v3d_blit.c
+@@ -369,7 +369,7 @@ v3d_tlb_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
+         bool double_buffer = V3D_DBG(DOUBLE_BUFFER) && !msaa;
+ 
+         uint32_t tile_width, tile_height, max_bpp;
+-        v3d_get_tile_buffer_size(msaa, double_buffer,
++        v3d_get_tile_buffer_size(devinfo, msaa, double_buffer,
+                                  is_color_blit ? 1 : 0, surfaces, src_surf,
+                                  &tile_width, &tile_height, &max_bpp);
+ 
+diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
+index f12e8c92139..def546e9ef5 100644
+--- a/src/gallium/drivers/v3d/v3d_context.c
++++ b/src/gallium/drivers/v3d/v3d_context.c
+@@ -220,7 +220,8 @@ v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
+ }
+ 
+ void
+-v3d_get_tile_buffer_size(bool is_msaa,
++v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
++                         bool is_msaa,
+                          bool double_buffer,
+                          uint32_t nr_cbufs,
+                          struct pipe_surface **cbufs,
+@@ -247,7 +248,7 @@ v3d_get_tile_buffer_size(bool is_msaa,
+                 *max_bpp = MAX2(*max_bpp, bsurf->internal_bpp);
+         }
+ 
+-        v3d_choose_tile_size(max_cbuf_idx + 1, *max_bpp,
++        v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, *max_bpp,
+                              is_msaa, double_buffer,
+                              tile_width, tile_height);
+ }
+diff --git a/src/gallium/drivers/v3d/v3d_context.h b/src/gallium/drivers/v3d/v3d_context.h
+index 21ee10a90cc..eb184b4b203 100644
+--- a/src/gallium/drivers/v3d/v3d_context.h
++++ b/src/gallium/drivers/v3d/v3d_context.h
+@@ -795,7 +795,8 @@ void v3d_ensure_prim_counts_allocated(struct v3d_context *ctx);
+ void v3d_flag_dirty_sampler_state(struct v3d_context *v3d,
+                                   enum pipe_shader_type shader);
+ 
+-void v3d_get_tile_buffer_size(bool is_msaa,
++void v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
++                              bool is_msaa,
+                               bool double_buffer,
+                               uint32_t nr_cbufs,
+                               struct pipe_surface **cbufs,
+diff --git a/src/gallium/drivers/v3d/v3d_job.c b/src/gallium/drivers/v3d/v3d_job.c
+index b022ed45073..577890a06c3 100644
+--- a/src/gallium/drivers/v3d/v3d_job.c
++++ b/src/gallium/drivers/v3d/v3d_job.c
+@@ -383,9 +383,11 @@ v3d_get_job_for_fbo(struct v3d_context *v3d)
+                 job->double_buffer = false;
+         }
+ 
+-        v3d_get_tile_buffer_size(job->msaa, job->double_buffer,
++        v3d_get_tile_buffer_size(&v3d->screen->devinfo,
++                                 job->msaa, job->double_buffer,
+                                  job->nr_cbufs, job->cbufs, job->bbuf,
+-                                 &job->tile_width, &job->tile_height,
++                                 &job->tile_width,
++                                 &job->tile_height,
+                                  &job->internal_bpp);
+ 
+         /* The dirty flags are tracking what's been updated while v3d->job has
+diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
+index efdb7d615ae..2225edf85bd 100644
+--- a/src/gallium/drivers/v3d/v3d_screen.c
++++ b/src/gallium/drivers/v3d/v3d_screen.c
+@@ -255,9 +255,8 @@ v3d_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
+         case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
+                 return V3D_MAX_ARRAY_LAYERS;
+ 
+-                /* Render targets. */
+         case PIPE_CAP_MAX_RENDER_TARGETS:
+-                return 4;
++                return V3D_MAX_RENDER_TARGETS(screen->devinfo.ver);
+ 
+         case PIPE_CAP_VENDOR_ID:
+                 return 0x14E4;
+diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
+index 75751dc9ab6..87e75281dc9 100644
+--- a/src/gallium/drivers/v3d/v3dx_emit.c
++++ b/src/gallium/drivers/v3d/v3dx_emit.c
+@@ -661,8 +661,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
+                         }
+ #endif
+ 
++                        const uint32_t max_rts =
++                                V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
+                         if (blend->base.independent_blend_enable) {
+-                                for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++)
++                                for (int i = 0; i < max_rts; i++)
+                                         emit_rt_blend(v3d, job, &blend->base, i,
+                                                       (1 << i),
+                                                       v3d->blend_dst_alpha_one & (1 << i));
+@@ -678,16 +680,16 @@ v3dX(emit_state)(struct pipe_context *pctx)
+                                  * RTs without.
+                                  */
+                                 emit_rt_blend(v3d, job, &blend->base, 0,
+-                                              ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
++                                              ((1 << max_rts) - 1) &
+                                                    v3d->blend_dst_alpha_one,
+                                               true);
+                                 emit_rt_blend(v3d, job, &blend->base, 0,
+-                                              ((1 << V3D_MAX_DRAW_BUFFERS) - 1) &
++                                              ((1 << max_rts) - 1) &
+                                                    ~v3d->blend_dst_alpha_one,
+                                               false);
+                         } else {
+                                 emit_rt_blend(v3d, job, &blend->base, 0,
+-                                              (1 << V3D_MAX_DRAW_BUFFERS) - 1,
++                                              (1 << max_rts) - 1,
+                                               v3d->blend_dst_alpha_one);
+                         }
+                 }
+@@ -696,8 +698,10 @@ v3dX(emit_state)(struct pipe_context *pctx)
+         if (v3d->dirty & V3D_DIRTY_BLEND) {
+                 struct pipe_blend_state *blend = &v3d->blend->base;
+ 
++                const uint32_t max_rts =
++                        V3D_MAX_RENDER_TARGETS(v3d->screen->devinfo.ver);
+                 cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
+-                        for (int i = 0; i < 4; i++) {
++                        for (int i = 0; i < max_rts; i++) {
+                                 int rt = blend->independent_blend_enable ? i : 0;
+                                 int rt_mask = blend->rt[rt].colormask;
+ 
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index 88e57cd072b..970a082aa85 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -138,8 +138,9 @@ v3d_create_blend_state(struct pipe_context *pctx,
+ 
+         so->base = *cso;
+ 
++        uint32_t max_rts = V3D_MAX_RENDER_TARGETS(V3D_VERSION);
+         if (cso->independent_blend_enable) {
+-                for (int i = 0; i < V3D_MAX_DRAW_BUFFERS; i++) {
++                for (int i = 0; i < max_rts; i++) {
+                         so->blend_enables |= cso->rt[i].blend_enable << i;
+ 
+                         /* V3D 4.x is when we got independent blend enables. */
+@@ -148,7 +149,7 @@ v3d_create_blend_state(struct pipe_context *pctx,
+                 }
+         } else {
+                 if (cso->rt[0].blend_enable)
+-                        so->blend_enables = (1 << V3D_MAX_DRAW_BUFFERS) - 1;
++                        so->blend_enables = (1 << max_rts) - 1;
+         }
+ 
+         return so;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0124-v3d-v3dv-don-t-use-max-internal-bpp-for-tile-sizing-.patch b/projects/RPi/devices/RPi5/patches/mesa/0124-v3d-v3dv-don-t-use-max-internal-bpp-for-tile-sizing-.patch
new file mode 100644
index 0000000000..2e193e0644
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0124-v3d-v3dv-don-t-use-max-internal-bpp-for-tile-sizing-.patch
@@ -0,0 +1,539 @@
+From cc5afd808039f3e0b81fe0615745b74cbb31d0bf Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 16 Nov 2021 11:26:17 +0100
+Subject: [PATCH 124/142] v3d,v3dv: don't use max internal bpp for tile sizing
+ in V3D 7.x
+
+We can use the actual bpp of each color attachment to compute real
+tile memory requirements, which may allow us to choose a larger tile
+size configuration than in V3D 4.2 in certain scenarios.
+---
+ src/broadcom/common/v3d_util.c          | 112 +++++++++++++++---------
+ src/broadcom/common/v3d_util.h          |   7 +-
+ src/broadcom/vulkan/v3dv_cmd_buffer.c   |  20 +++--
+ src/broadcom/vulkan/v3dv_meta_clear.c   |   1 +
+ src/broadcom/vulkan/v3dv_meta_copy.c    |  19 ++--
+ src/broadcom/vulkan/v3dv_pass.c         |   9 +-
+ src/broadcom/vulkan/v3dv_private.h      |   2 +
+ src/broadcom/vulkan/v3dvx_device.c      |  21 +++--
+ src/broadcom/vulkan/v3dvx_meta_common.c |  10 ++-
+ src/broadcom/vulkan/v3dvx_private.h     |   4 +-
+ src/broadcom/vulkan/v3dvx_queue.c       |   3 +-
+ src/gallium/drivers/v3d/v3d_context.c   |   6 +-
+ 12 files changed, 140 insertions(+), 74 deletions(-)
+
+diff --git a/src/broadcom/common/v3d_util.c b/src/broadcom/common/v3d_util.c
+index 209a5eceaa1..8a50d279985 100644
+--- a/src/broadcom/common/v3d_util.c
++++ b/src/broadcom/common/v3d_util.c
+@@ -87,12 +87,37 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
+    return best_wgs_per_sg;
+ }
+ 
++#define V3D71_TLB_COLOR_SIZE     (16 * 1024)
++#define V3D71_TLB_DETPH_SIZE     (16 * 1024)
++#define V3D71_TLB_AUX_DETPH_SIZE  (8 * 1024)
++
++static bool
++tile_size_valid(uint32_t pixel_count, uint32_t color_bpp, uint32_t depth_bpp)
++{
++   /* First, we check if we can fit this tile size allocating the depth
++    * TLB memory to color.
++    */
++   if (pixel_count * depth_bpp <= V3D71_TLB_AUX_DETPH_SIZE &&
++       pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE + V3D71_TLB_DETPH_SIZE) {
++      return true;
++   }
++
++   /* Otherwise the tile must fit in the main TLB buffers */
++   return pixel_count * depth_bpp <= V3D71_TLB_DETPH_SIZE &&
++          pixel_count * color_bpp <= V3D71_TLB_COLOR_SIZE;
++}
++
+ void
+ v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+                      uint32_t color_attachment_count,
+-                     uint32_t max_color_bpp, bool msaa,
++                     /* V3D 4.x max internal bpp of all RTs */
++                     uint32_t max_internal_bpp,
++                     /* V3D 7.x accumulated bpp for all RTs (in bytes) */
++                     uint32_t total_color_bpp,
++                     bool msaa,
+                      bool double_buffer,
+-                     uint32_t *width, uint32_t *height)
++                     uint32_t *width,
++                     uint32_t *height)
+ {
+    static const uint8_t tile_sizes[] = {
+       64, 64,
+@@ -105,37 +130,19 @@ v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+    };
+ 
+    uint32_t idx = 0;
+-   if (color_attachment_count > 4)
+-      idx += 3;
+-   else if (color_attachment_count > 2)
+-      idx += 2;
+-   else if (color_attachment_count > 1)
+-      idx += 1;
+-
+-   /* MSAA and double-buffer are mutually exclusive */
+-   assert(!msaa || !double_buffer);
+-   if (msaa)
+-      idx += 2;
+-   else if (double_buffer)
+-      idx += 1;
+-
+-   idx += max_color_bpp;
+-
+    if (devinfo->ver >= 71) {
+-      /* In V3D 7.x the TLB has an auxiliary buffer of 8KB that will be
+-       * automatically used for depth instead of the main 16KB depth TLB buffer
+-       * when the depth tile fits in the auxiliary buffer, allowing the hardware
+-       * to allocate the 16KB from the main depth TLB to the color TLB. If
+-       * we can do that, then we are effectively doubling the memory we have
+-       * for color and we can increase our tile dimensions by a factor of 2
+-       * (reduce idx by 1).
++      /* In V3D 7.x, we use the actual bpp used by color attachments to compute
++       * the tile size instead of the maximum bpp. This may allow us to choose a
++       * larger tile size than we would in 4.x in scenarios with multiple RTs
++       * with different bpps.
+        *
+-       * If we have computed a tile size that would be smaller than the minimum
+-       * of 8x8, then it is certain that depth will fit in the aux depth TLB
+-       * (even in MSAA mode).
+-       *
+-       * Otherwise, we need check if we can fit depth in the aux TLB buffer
+-       * using a larger tile size.
++       * Also, the TLB has an auxiliary buffer of 8KB that will be automatically
++       * used for depth instead of the main 16KB depth TLB buffer when the depth
++       * tile fits in the auxiliary buffer, allowing the hardware to allocate
++       * the 16KB from the main depth TLB to the color TLB. If we can do that,
++       * then we are effectively doubling the memory we have for color and we
++       * can also select a larger tile size. This is necessary to support
++       * the most expensive configuration: 8x128bpp RTs + MSAA.
+        *
+        * FIXME: the docs state that depth TLB memory can be used for color
+        * if depth testing is not used by setting the 'depth disable' bit in the
+@@ -147,17 +154,40 @@ v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+        * configuration item) or active in the subpass for which we are enabling
+        * the bit (which we can't tell until later, when we record commands for
+        * the subpass). If it is the latter, then we cannot use this feature.
++       *
++       * FIXME: pending handling double_buffer.
+        */
+-      if (idx >= ARRAY_SIZE(tile_sizes) / 2) {
+-         idx--;
+-      } else if (idx > 0) {
+-         /* Depth is always 32bpp (4x32bpp for 4x MSAA) */
+-         uint32_t depth_bpp = !msaa ? 4 : 16;
+-         uint32_t tile_w = tile_sizes[(idx - 1) * 2];
+-         uint32_t tile_h = tile_sizes[(idx - 1) * 2 + 1];
+-         if (tile_w * tile_h * depth_bpp <= 8192)
+-            idx--;
+-      }
++      const uint32_t color_bpp = total_color_bpp * (msaa ? 4 : 1);
++      const uint32_t depth_bpp = 4 * (msaa ? 4 : 1);
++      do {
++         const uint32_t tile_w = tile_sizes[idx * 2];
++         const uint32_t tile_h = tile_sizes[idx * 2 + 1];
++         if (tile_size_valid(tile_w * tile_h, color_bpp, depth_bpp))
++            break;
++         idx++;
++      } while (idx < ARRAY_SIZE(tile_sizes) / 2);
++
++      /* FIXME: pending handling double_buffer */
++      assert(!double_buffer);
++   } else {
++      /* On V3D 4.x tile size is selected based on the number of RTs, the
++       * maximum bpp across all of them and whether 4x MSAA is used.
++       */
++      if (color_attachment_count > 4)
++         idx += 3;
++      else if (color_attachment_count > 2)
++         idx += 2;
++      else if (color_attachment_count > 1)
++         idx += 1;
++
++      /* MSAA and double-buffer are mutually exclusive */
++      assert(!msaa || !double_buffer);
++      if (msaa)
++         idx += 2;
++      else if (double_buffer)
++         idx += 1;
++
++      idx += max_internal_bpp;
+    }
+ 
+    assert(idx < ARRAY_SIZE(tile_sizes) / 2);
+diff --git a/src/broadcom/common/v3d_util.h b/src/broadcom/common/v3d_util.h
+index 5a7e244a0a5..d02d41dd089 100644
+--- a/src/broadcom/common/v3d_util.h
++++ b/src/broadcom/common/v3d_util.h
+@@ -39,9 +39,12 @@ v3d_csd_choose_workgroups_per_supergroup(struct v3d_device_info *devinfo,
+ void
+ v3d_choose_tile_size(const struct v3d_device_info *devinfo,
+                      uint32_t color_attachment_count,
+-                     uint32_t max_color_bpp, bool msaa,
++                     uint32_t max_internal_bpp,
++                     uint32_t total_color_bpp,
++                     bool msaa,
+                      bool double_buffer,
+-                     uint32_t *width, uint32_t *height);
++                     uint32_t *width,
++                     uint32_t *height);
+ 
+ uint32_t
+ v3d_translate_pipe_swizzle(enum pipe_swizzle swizzle);
+diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+index 11d161b19b7..f65388c10ec 100644
+--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+@@ -348,6 +348,7 @@ job_compute_frame_tiling(struct v3dv_job *job,
+                          uint32_t layers,
+                          uint32_t render_target_count,
+                          uint8_t max_internal_bpp,
++                         uint8_t total_color_bpp,
+                          bool msaa,
+                          bool double_buffer)
+ {
+@@ -360,14 +361,16 @@ job_compute_frame_tiling(struct v3dv_job *job,
+    tiling->render_target_count = render_target_count;
+    tiling->msaa = msaa;
+    tiling->internal_bpp = max_internal_bpp;
++   tiling->total_color_bpp = total_color_bpp;
+    tiling->double_buffer = double_buffer;
+ 
+    /* Double-buffer is incompatible with MSAA */
+    assert(!tiling->msaa || !tiling->double_buffer);
+ 
+    v3d_choose_tile_size(&job->device->devinfo,
+-                        render_target_count, max_internal_bpp,
+-                        tiling->msaa, tiling->double_buffer,
++                        render_target_count,
++                        max_internal_bpp, total_color_bpp, msaa,
++                        tiling->double_buffer,
+                         &tiling->tile_width, &tiling->tile_height);
+ 
+    tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
+@@ -458,6 +461,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
+                      bool allocate_tile_state_now,
+                      uint32_t render_target_count,
+                      uint8_t max_internal_bpp,
++                     uint8_t total_color_bpp,
+                      bool msaa)
+ {
+    assert(job);
+@@ -468,7 +472,7 @@ v3dv_job_start_frame(struct v3dv_job *job,
+    const struct v3dv_frame_tiling *tiling =
+       job_compute_frame_tiling(job, width, height, layers,
+                                render_target_count, max_internal_bpp,
+-                               msaa, false);
++                               total_color_bpp, msaa, false);
+ 
+    v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
+    v3dv_return_if_oom(NULL, job);
+@@ -529,6 +533,7 @@ cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
+                                job->frame_tiling.layers,
+                                job->frame_tiling.render_target_count,
+                                job->frame_tiling.internal_bpp,
++                               job->frame_tiling.total_color_bpp,
+                                job->frame_tiling.msaa,
+                                true);
+ 
+@@ -1673,10 +1678,11 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
+ 
+       const struct v3dv_framebuffer *framebuffer = state->framebuffer;
+ 
+-      uint8_t internal_bpp;
++      uint8_t max_internal_bpp, total_color_bpp;
+       bool msaa;
+       v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
+-         (framebuffer, state->attachments, subpass, &internal_bpp, &msaa);
++         (framebuffer, state->attachments, subpass,
++          &max_internal_bpp, &total_color_bpp, &msaa);
+ 
+       /* From the Vulkan spec:
+        *
+@@ -1700,7 +1706,8 @@ cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
+                            layers,
+                            true, false,
+                            subpass->color_count,
+-                           internal_bpp,
++                           max_internal_bpp,
++                           total_color_bpp,
+                            msaa);
+    }
+ 
+@@ -2668,6 +2675,7 @@ cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
+                         true, false,
+                         old_job->frame_tiling.render_target_count,
+                         old_job->frame_tiling.internal_bpp,
++                        old_job->frame_tiling.total_color_bpp,
+                         true /* msaa */);
+ 
+    v3dv_job_destroy(old_job);
+diff --git a/src/broadcom/vulkan/v3dv_meta_clear.c b/src/broadcom/vulkan/v3dv_meta_clear.c
+index 0a7905b49d5..1c0d66c977c 100644
+--- a/src/broadcom/vulkan/v3dv_meta_clear.c
++++ b/src/broadcom/vulkan/v3dv_meta_clear.c
+@@ -127,6 +127,7 @@ clear_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
+ 
+       v3dv_job_start_frame(job, width, height, max_layer,
+                            false, true, 1, internal_bpp,
++                           4 * v3d_internal_bpp_words(internal_bpp),
+                            image->vk.samples > VK_SAMPLE_COUNT_1_BIT);
+ 
+       struct v3dv_meta_framebuffer framebuffer;
+diff --git a/src/broadcom/vulkan/v3dv_meta_copy.c b/src/broadcom/vulkan/v3dv_meta_copy.c
+index c0ec888b8c7..2d30c611e17 100644
+--- a/src/broadcom/vulkan/v3dv_meta_copy.c
++++ b/src/broadcom/vulkan/v3dv_meta_copy.c
+@@ -453,8 +453,9 @@ copy_image_to_buffer_tlb(struct v3dv_cmd_buffer *cmd_buffer,
+    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
+    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
+ 
+-   v3dv_job_start_frame(job, width, height, num_layers, false, true,
+-                        1, internal_bpp, false);
++   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
++                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
++                        false);
+ 
+    struct v3dv_meta_framebuffer framebuffer;
+    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
+@@ -1323,8 +1324,8 @@ copy_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
+    const uint32_t width = DIV_ROUND_UP(region->extent.width, block_w);
+    const uint32_t height = DIV_ROUND_UP(region->extent.height, block_h);
+ 
+-   v3dv_job_start_frame(job, width, height, num_layers,
+-                        false, true, 1, internal_bpp,
++   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
++                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
+                         src->vk.samples > VK_SAMPLE_COUNT_1_BIT);
+ 
+    struct v3dv_meta_framebuffer framebuffer;
+@@ -1978,8 +1979,9 @@ copy_buffer_to_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
+    const uint32_t width = DIV_ROUND_UP(region->imageExtent.width, block_w);
+    const uint32_t height = DIV_ROUND_UP(region->imageExtent.height, block_h);
+ 
+-   v3dv_job_start_frame(job, width, height, num_layers, false, true,
+-                        1, internal_bpp, false);
++   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
++                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
++                        false);
+ 
+    struct v3dv_meta_framebuffer framebuffer;
+    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
+@@ -4884,8 +4886,9 @@ resolve_image_tlb(struct v3dv_cmd_buffer *cmd_buffer,
+       (fb_format, region->srcSubresource.aspectMask,
+        &internal_type, &internal_bpp);
+ 
+-   v3dv_job_start_frame(job, width, height, num_layers, false, true,
+-                        1, internal_bpp, true);
++   v3dv_job_start_frame(job, width, height, num_layers, false, true, 1,
++                        internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
++                        true);
+ 
+    struct v3dv_meta_framebuffer framebuffer;
+    v3dv_X(job->device, meta_framebuffer_init)(&framebuffer, fb_format,
+diff --git a/src/broadcom/vulkan/v3dv_pass.c b/src/broadcom/vulkan/v3dv_pass.c
+index 7f2e2bbc710..0583faf6f9a 100644
+--- a/src/broadcom/vulkan/v3dv_pass.c
++++ b/src/broadcom/vulkan/v3dv_pass.c
+@@ -325,7 +325,8 @@ subpass_get_granularity(struct v3dv_device *device,
+    const uint32_t color_count = subpass->color_count;
+ 
+    bool msaa = false;
+-   uint32_t max_bpp = 0;
++   uint32_t max_internal_bpp = 0;
++   uint32_t total_color_bpp = 0;
+    for (uint32_t i = 0; i < color_count; i++) {
+       uint32_t attachment_idx = subpass->color_attachments[i].attachment;
+       if (attachment_idx == VK_ATTACHMENT_UNUSED)
+@@ -339,7 +340,8 @@ subpass_get_granularity(struct v3dv_device *device,
+       v3dv_X(device, get_internal_type_bpp_for_output_format)
+          (format->planes[0].rt_type, &internal_type, &internal_bpp);
+ 
+-      max_bpp = MAX2(max_bpp, internal_bpp);
++      max_internal_bpp = MAX2(max_internal_bpp, internal_bpp);
++      total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
+ 
+       if (desc->samples > VK_SAMPLE_COUNT_1_BIT)
+          msaa = true;
+@@ -349,7 +351,8 @@ subpass_get_granularity(struct v3dv_device *device,
+     * heuristics so we choose a conservative granularity here, with it disabled.
+     */
+    uint32_t width, height;
+-   v3d_choose_tile_size(&device->devinfo, color_count, max_bpp, msaa,
++   v3d_choose_tile_size(&device->devinfo, color_count,
++                        max_internal_bpp, total_color_bpp, msaa,
+                         false /* double-buffer */, &width, &height);
+    *granularity = (VkExtent2D) {
+       .width = width,
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index 300a1ec8ae1..9375cdd58c0 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -950,6 +950,7 @@ struct v3dv_frame_tiling {
+    uint32_t layers;
+    uint32_t render_target_count;
+    uint32_t internal_bpp;
++   uint32_t total_color_bpp;
+    bool     msaa;
+    bool     double_buffer;
+    uint32_t tile_width;
+@@ -1373,6 +1374,7 @@ void v3dv_job_start_frame(struct v3dv_job *job,
+                           bool allocate_tile_state_now,
+                           uint32_t render_target_count,
+                           uint8_t max_internal_bpp,
++                          uint8_t total_color_bpp,
+                           bool msaa);
+ 
+ bool v3dv_job_type_is_gpu(struct v3dv_job *job);
+diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
+index 4d17a2691a5..61ad98c1217 100644
+--- a/src/broadcom/vulkan/v3dvx_device.c
++++ b/src/broadcom/vulkan/v3dvx_device.c
+@@ -257,11 +257,13 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
+    const struct v3dv_framebuffer *framebuffer,
+    const struct v3dv_cmd_buffer_attachment_state *attachments,
+    const struct v3dv_subpass *subpass,
+-   uint8_t *max_bpp,
++   uint8_t *max_internal_bpp,
++   uint8_t *total_color_bpp,
+    bool *msaa)
+ {
+    STATIC_ASSERT(V3D_INTERNAL_BPP_32 == 0);
+-   *max_bpp = V3D_INTERNAL_BPP_32;
++   *max_internal_bpp = V3D_INTERNAL_BPP_32;
++   *total_color_bpp = 0;
+    *msaa = false;
+ 
+    if (subpass) {
+@@ -274,8 +276,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
+          assert(att);
+          assert(att->plane_count == 1);
+ 
+-         if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
+-            *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
++         if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
++            const uint32_t internal_bpp = att->planes[0].internal_bpp;
++            *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
++            *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
++         }
+ 
+          if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
+             *msaa = true;
+@@ -289,7 +294,6 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
+          if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
+             *msaa = true;
+       }
+-
+       return;
+    }
+ 
+@@ -299,8 +303,11 @@ v3dX(framebuffer_compute_internal_bpp_msaa)(
+       assert(att);
+       assert(att->plane_count == 1);
+ 
+-      if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT)
+-         *max_bpp = MAX2(*max_bpp, att->planes[0].internal_bpp);
++      if (att->vk.aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
++         const uint32_t internal_bpp = att->planes[0].internal_bpp;
++         *max_internal_bpp = MAX2(*max_internal_bpp, internal_bpp);
++         *total_color_bpp += 4 * v3d_internal_bpp_words(internal_bpp);
++      }
+ 
+       if (att->vk.image->samples > VK_SAMPLE_COUNT_1_BIT)
+          *msaa = true;
+diff --git a/src/broadcom/vulkan/v3dvx_meta_common.c b/src/broadcom/vulkan/v3dvx_meta_common.c
+index b8f3297bc94..858096f9e4b 100644
+--- a/src/broadcom/vulkan/v3dvx_meta_common.c
++++ b/src/broadcom/vulkan/v3dvx_meta_common.c
+@@ -1408,8 +1408,9 @@ v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
+       uint32_t width, height;
+       framebuffer_size_for_pixel_count(num_items, &width, &height);
+ 
+-      v3dv_job_start_frame(job, width, height, 1, true, true,
+-                           1, internal_bpp, false);
++      v3dv_job_start_frame(job, width, height, 1, true, true, 1,
++                           internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
++                           false);
+ 
+       struct v3dv_meta_framebuffer framebuffer;
+       v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
+@@ -1455,8 +1456,9 @@ v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
+       uint32_t width, height;
+       framebuffer_size_for_pixel_count(num_items, &width, &height);
+ 
+-      v3dv_job_start_frame(job, width, height, 1, true, true,
+-                           1, internal_bpp, false);
++      v3dv_job_start_frame(job, width, height, 1, true, true, 1,
++                           internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
++                           false);
+ 
+       struct v3dv_meta_framebuffer framebuffer;
+       v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index 81715520913..709b129926f 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -136,7 +136,9 @@ void
+ v3dX(framebuffer_compute_internal_bpp_msaa)(const struct v3dv_framebuffer *framebuffer,
+                                             const struct v3dv_cmd_buffer_attachment_state *attachments,
+                                             const struct v3dv_subpass *subpass,
+-                                            uint8_t *max_bpp, bool *msaa);
++                                            uint8_t *max_internal_bpp,
++                                            uint8_t *total_color_bpp,
++                                            bool *msaa);
+ 
+ #ifdef DEBUG
+ void
+diff --git a/src/broadcom/vulkan/v3dvx_queue.c b/src/broadcom/vulkan/v3dvx_queue.c
+index f8cee36e3bf..6eed2de9d54 100644
+--- a/src/broadcom/vulkan/v3dvx_queue.c
++++ b/src/broadcom/vulkan/v3dvx_queue.c
+@@ -29,7 +29,8 @@
+ void
+ v3dX(job_emit_noop)(struct v3dv_job *job)
+ {
+-   v3dv_job_start_frame(job, 1, 1, 1, true, true, 1, V3D_INTERNAL_BPP_32, false);
++   v3dv_job_start_frame(job, 1, 1, 1, true, true, 1,
++                        V3D_INTERNAL_BPP_32, 4, false);
+    v3dX(job_emit_binning_flush)(job);
+ 
+    struct v3dv_cl *rcl = &job->rcl;
+diff --git a/src/gallium/drivers/v3d/v3d_context.c b/src/gallium/drivers/v3d/v3d_context.c
+index def546e9ef5..1dc4bd017fe 100644
+--- a/src/gallium/drivers/v3d/v3d_context.c
++++ b/src/gallium/drivers/v3d/v3d_context.c
+@@ -233,11 +233,13 @@ v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+         assert(!is_msaa || !double_buffer);
+ 
+         uint32_t max_cbuf_idx = 0;
++        uint32_t total_bpp = 0;
+         *max_bpp = 0;
+         for (int i = 0; i < nr_cbufs; i++) {
+                 if (cbufs[i]) {
+                         struct v3d_surface *surf = v3d_surface(cbufs[i]);
+                         *max_bpp = MAX2(*max_bpp, surf->internal_bpp);
++                        total_bpp += 4 * v3d_internal_bpp_words(surf->internal_bpp);
+                         max_cbuf_idx = MAX2(i, max_cbuf_idx);
+                 }
+         }
+@@ -246,9 +248,11 @@ v3d_get_tile_buffer_size(const struct v3d_device_info *devinfo,
+                 struct v3d_surface *bsurf = v3d_surface(bbuf);
+                 assert(bbuf->texture->nr_samples <= 1 || is_msaa);
+                 *max_bpp = MAX2(*max_bpp, bsurf->internal_bpp);
++                total_bpp += 4 * v3d_internal_bpp_words(bsurf->internal_bpp);
+         }
+ 
+-        v3d_choose_tile_size(devinfo, max_cbuf_idx + 1, *max_bpp,
++        v3d_choose_tile_size(devinfo, max_cbuf_idx + 1,
++                             *max_bpp, total_bpp,
+                              is_msaa, double_buffer,
+                              tile_width, tile_height);
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0125-v3dv-implement-depthBounds-support-for-v71.patch b/projects/RPi/devices/RPi5/patches/mesa/0125-v3dv-implement-depthBounds-support-for-v71.patch
new file mode 100644
index 0000000000..c03e043b90
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0125-v3dv-implement-depthBounds-support-for-v71.patch
@@ -0,0 +1,241 @@
+From 210338b6b1b030d36acaebad504ed2bec4a2cd74 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Fri, 19 Nov 2021 10:51:37 +0100
+Subject: [PATCH 125/142] v3dv: implement depthBounds support for v71
+
+Just for v71, as that feature is not supported by older hw.
+---
+ src/broadcom/vulkan/v3dv_cmd_buffer.c  | 19 ++++++++++++---
+ src/broadcom/vulkan/v3dv_device.c      |  2 +-
+ src/broadcom/vulkan/v3dv_pipeline.c    | 17 ++++++++------
+ src/broadcom/vulkan/v3dv_private.h     | 12 +++++++++-
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 32 ++++++++++++++++++++++++++
+ src/broadcom/vulkan/v3dvx_pipeline.c   |  3 +++
+ src/broadcom/vulkan/v3dvx_private.h    |  3 +++
+ 7 files changed, 76 insertions(+), 12 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+index f65388c10ec..36bd7960985 100644
+--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+@@ -2070,6 +2070,14 @@ cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
+       }
+    }
+ 
++   if (!(dynamic_mask & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
++      if (memcmp(&dest->depth_bounds, &src->depth_bounds,
++                 sizeof(src->depth_bounds))) {
++         memcpy(&dest->depth_bounds, &src->depth_bounds, sizeof(src->depth_bounds));
++         dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
++      }
++   }
++
+    if (!(dynamic_mask & V3DV_DYNAMIC_LINE_WIDTH)) {
+       if (dest->line_width != src->line_width) {
+          dest->line_width = src->line_width;
+@@ -2940,6 +2948,9 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
+    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_DEPTH_BIAS))
+       v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
+ 
++   if (*dirty & V3DV_CMD_DIRTY_DEPTH_BOUNDS)
++      v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
++
+    if (*dirty & (V3DV_CMD_DIRTY_PIPELINE | V3DV_CMD_DIRTY_BLEND_CONSTANTS))
+       v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
+ 
+@@ -3369,9 +3380,11 @@ v3dv_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
+                        float minDepthBounds,
+                        float maxDepthBounds)
+ {
+-   /* We do not support depth bounds testing so we just ignore this. We are
+-    * already asserting that pipelines don't enable the feature anyway.
+-    */
++   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
++
++   cmd_buffer->state.dynamic.depth_bounds.min = minDepthBounds;
++   cmd_buffer->state.dynamic.depth_bounds.max = maxDepthBounds;
++   cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DEPTH_BOUNDS;
+ }
+ 
+ VKAPI_ATTR void VKAPI_CALL
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index 19e58542414..1de9b5ce683 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -227,7 +227,7 @@ get_features(const struct v3dv_physical_device *physical_device,
+       .depthClamp = false, /* Only available since V3D 4.5.1.1 */
+       .depthBiasClamp = true,
+       .fillModeNonSolid = true,
+-      .depthBounds = false, /* Only available since V3D 4.3.16.2 */
++      .depthBounds = physical_device->devinfo.ver >= 71,
+       .wideLines = true,
+       .largePoints = true,
+       .alphaToOne = true,
+diff --git a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c
+index 3bcdcc9a853..ba782b8268a 100644
+--- a/src/broadcom/vulkan/v3dv_pipeline.c
++++ b/src/broadcom/vulkan/v3dv_pipeline.c
+@@ -2608,13 +2608,8 @@ v3dv_dynamic_state_mask(VkDynamicState state)
+       return V3DV_DYNAMIC_LINE_WIDTH;
+    case VK_DYNAMIC_STATE_COLOR_WRITE_ENABLE_EXT:
+       return V3DV_DYNAMIC_COLOR_WRITE_ENABLE;
+-
+-   /* Depth bounds testing is not available in in V3D 4.2 so here we are just
+-    * ignoring this dynamic state. We are already asserting at pipeline creation
+-    * time that depth bounds testing is not enabled.
+-    */
+    case VK_DYNAMIC_STATE_DEPTH_BOUNDS:
+-      return 0;
++      return V3DV_DYNAMIC_DEPTH_BOUNDS;
+ 
+    default:
+       unreachable("Unhandled dynamic state");
+@@ -2642,6 +2637,7 @@ pipeline_init_dynamic_state(
+    dynamic->line_width = 1.0f;
+    dynamic->color_write_enable =
+       (1ull << (4 * V3D_MAX_RENDER_TARGETS(devinfo->ver))) - 1;
++   dynamic->depth_bounds.max = 1.0f;
+ 
+    /* Create a mask of enabled dynamic states */
+    uint32_t dynamic_states = 0;
+@@ -2694,6 +2690,11 @@ pipeline_init_dynamic_state(
+          dynamic->stencil_reference.front = pDepthStencilState->front.reference;
+          dynamic->stencil_reference.back = pDepthStencilState->back.reference;
+       }
++
++      if (!(dynamic_states & V3DV_DYNAMIC_DEPTH_BOUNDS)) {
++         dynamic->depth_bounds.min = pDepthStencilState->minDepthBounds;
++         dynamic->depth_bounds.max = pDepthStencilState->maxDepthBounds;
++      }
+    }
+ 
+    if (pColorBlendState && !(dynamic_states & V3DV_DYNAMIC_BLEND_CONSTANTS)) {
+@@ -2907,7 +2908,9 @@ pipeline_init(struct v3dv_pipeline *pipeline,
+    /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
+     * feature and it shouldn't be used by any pipeline.
+     */
+-   assert(!ds_info || !ds_info->depthBoundsTestEnable);
++   assert(device->devinfo.ver >= 71 ||
++          !ds_info || !ds_info->depthBoundsTestEnable);
++   pipeline->depth_bounds_test_enabled = ds_info && ds_info->depthBoundsTestEnable;
+ 
+    enable_depth_bias(pipeline, rs_info);
+ 
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index 9375cdd58c0..a074e0a981c 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -1045,7 +1045,8 @@ enum v3dv_dynamic_state_bits {
+    V3DV_DYNAMIC_DEPTH_BIAS                = 1 << 6,
+    V3DV_DYNAMIC_LINE_WIDTH                = 1 << 7,
+    V3DV_DYNAMIC_COLOR_WRITE_ENABLE        = 1 << 8,
+-   V3DV_DYNAMIC_ALL                       = (1 << 9) - 1,
++   V3DV_DYNAMIC_DEPTH_BOUNDS              = 1 << 9,
++   V3DV_DYNAMIC_ALL                       = (1 << 10) - 1,
+ };
+ 
+ /* Flags for dirty pipeline state.
+@@ -1070,6 +1071,7 @@ enum v3dv_cmd_dirty_bits {
+    V3DV_CMD_DIRTY_LINE_WIDTH                = 1 << 16,
+    V3DV_CMD_DIRTY_VIEW_INDEX                = 1 << 17,
+    V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE        = 1 << 18,
++   V3DV_CMD_DIRTY_DEPTH_BOUNDS              = 1 << 19,
+ };
+ 
+ struct v3dv_dynamic_state {
+@@ -1106,6 +1108,11 @@ struct v3dv_dynamic_state {
+       float slope_factor;
+    } depth_bias;
+ 
++   struct {
++      float                                     min;
++      float                                     max;
++   } depth_bounds;
++
+    float line_width;
+ 
+    uint32_t color_write_enable;
+@@ -2333,6 +2340,9 @@ struct v3dv_pipeline {
+       bool is_z16;
+    } depth_bias;
+ 
++   /* Depth bounds */
++   bool depth_bounds_test_enabled;
++
+    struct {
+       void *mem_ctx;
+       struct util_dynarray data; /* Array of v3dv_pipeline_executable_data */
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index bf5e47018e8..9307a6e9d93 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1507,6 +1507,38 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
+    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BIAS;
+ }
+ 
++void
++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
++{
++   /* No depthBounds support for v42, so this method is empty on that case.
++    *
++    * Note that this method is being called as v3dv_job_init flag all state as
++    * dirty. See FIXME note at v3dv_job_init.
++    */
++
++#if V3D_VERSION >= 71
++   struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
++   assert(pipeline);
++
++   if (!pipeline->depth_bounds_test_enabled)
++      return;
++
++   struct v3dv_job *job = cmd_buffer->state.job;
++   assert(job);
++
++   v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
++   v3dv_return_if_oom(cmd_buffer, NULL);
++
++   struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
++   cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
++      bounds.lower_test_limit = dynamic->depth_bounds.min;
++      bounds.upper_test_limit = dynamic->depth_bounds.max;
++   }
++
++   cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DEPTH_BOUNDS;
++#endif
++}
++
+ void
+ v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
+ {
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index 7b1133f8173..83ab2f19e4f 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -259,6 +259,9 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
+       } else {
+          config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
+       }
++
++      config.depth_bounds_test_enable =
++              ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment;
+ #endif
+    };
+ }
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index 709b129926f..1ce4789c5ac 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -54,6 +54,9 @@ v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer);
+ void
+ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer);
+ 
++void
++v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer);
++
+ void
+ v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer);
+ 
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0126-v3d-v3dv-propagate-NaNs-bits-in-shader-state-records.patch b/projects/RPi/devices/RPi5/patches/mesa/0126-v3d-v3dv-propagate-NaNs-bits-in-shader-state-records.patch
new file mode 100644
index 0000000000..e59c0e1890
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0126-v3d-v3dv-propagate-NaNs-bits-in-shader-state-records.patch
@@ -0,0 +1,119 @@
+From be6508ffef8c0e9fbc47175739db80a3eeff2cdb Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Fri, 3 Dec 2021 13:20:22 +0100
+Subject: [PATCH 126/142] v3d,v3dv: propagate NaNs bits in shader state records
+ are reserved in v7.x
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c |  4 ++++
+ src/broadcom/vulkan/v3dvx_pipeline.c   | 10 +++++-----
+ src/gallium/drivers/v3d/v3dx_draw.c    | 14 +++++++++-----
+ 3 files changed, 18 insertions(+), 10 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 9307a6e9d93..580aeb8ba2b 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -2175,7 +2175,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
+          gs_bin->prog_data.gs->base.threads == 4;
+       shader.geometry_bin_mode_shader_start_in_final_thread_section =
+          gs_bin->prog_data.gs->base.single_seg;
++#if V3D_VERSION <= 42
+       shader.geometry_bin_mode_shader_propagate_nans = true;
++#endif
+       shader.geometry_bin_mode_shader_uniforms_address =
+          gs_bin_uniforms;
+ 
+@@ -2185,7 +2187,9 @@ emit_gs_shader_state_record(struct v3dv_job *job,
+          gs->prog_data.gs->base.threads == 4;
+       shader.geometry_render_mode_shader_start_in_final_thread_section =
+          gs->prog_data.gs->base.single_seg;
++#if V3D_VERSION <= 42
+       shader.geometry_render_mode_shader_propagate_nans = true;
++#endif
+       shader.geometry_render_mode_shader_uniforms_address =
+          gs_render_uniforms;
+    }
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index 83ab2f19e4f..c9b537f4b32 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -471,19 +471,19 @@ pack_shader_state_record(struct v3dv_pipeline *pipeline)
+       shader.number_of_varyings_in_fragment_shader =
+          prog_data_fs->num_inputs;
+ 
+-      shader.coordinate_shader_propagate_nans = true;
+-      shader.vertex_shader_propagate_nans = true;
+-      shader.fragment_shader_propagate_nans = true;
+-
+       /* Note: see previous note about addresses */
+       /* shader.coordinate_shader_code_address */
+       /* shader.vertex_shader_code_address */
+       /* shader.fragment_shader_code_address */
+ 
++#if V3D_VERSION == 42
++      shader.coordinate_shader_propagate_nans = true;
++      shader.vertex_shader_propagate_nans = true;
++      shader.fragment_shader_propagate_nans = true;
++
+       /* FIXME: Use combined input/output size flag in the common case (also
+        * on v3d, see v3dx_draw).
+        */
+-#if V3D_VERSION == 42
+       shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
+          prog_data_vs_bin->separate_segments;
+       shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
+index 04cc3bc3ae1..e4b414b0676 100644
+--- a/src/gallium/drivers/v3d/v3dx_draw.c
++++ b/src/gallium/drivers/v3d/v3dx_draw.c
+@@ -396,7 +396,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
+                         gs_bin->prog_data.gs->base.threads == 4;
+                 shader.geometry_bin_mode_shader_start_in_final_thread_section =
+                         gs_bin->prog_data.gs->base.single_seg;
++#if V3D_VERSION <= 42
+                 shader.geometry_bin_mode_shader_propagate_nans = true;
++#endif
+                 shader.geometry_bin_mode_shader_uniforms_address =
+                         gs_bin_uniforms;
+ 
+@@ -406,7 +408,9 @@ v3d_emit_gs_state_record(struct v3d_job *job,
+                         gs->prog_data.gs->base.threads == 4;
+                 shader.geometry_render_mode_shader_start_in_final_thread_section =
+                         gs->prog_data.gs->base.single_seg;
++#if V3D_VERSION <= 42
+                 shader.geometry_render_mode_shader_propagate_nans = true;
++#endif
+                 shader.geometry_render_mode_shader_uniforms_address =
+                         gs_render_uniforms;
+         }
+@@ -657,10 +661,6 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
+                 shader.number_of_varyings_in_fragment_shader =
+                         v3d->prog.fs->prog_data.fs->num_inputs;
+ 
+-                shader.coordinate_shader_propagate_nans = true;
+-                shader.vertex_shader_propagate_nans = true;
+-                shader.fragment_shader_propagate_nans = true;
+-
+                 shader.coordinate_shader_code_address =
+                         cl_address(v3d_resource(v3d->prog.cs->resource)->bo,
+                                    v3d->prog.cs->offset);
+@@ -671,10 +671,14 @@ v3d_emit_gl_shader_state(struct v3d_context *v3d,
+                         cl_address(v3d_resource(v3d->prog.fs->resource)->bo,
+                                    v3d->prog.fs->offset);
+ 
++#if V3D_VERSION <= 42
++                shader.coordinate_shader_propagate_nans = true;
++                shader.vertex_shader_propagate_nans = true;
++                shader.fragment_shader_propagate_nans = true;
++
+                 /* XXX: Use combined input/output size flag in the common
+                  * case.
+                  */
+-#if V3D_VERSION <= 42
+                 shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
+                         v3d->prog.cs->prog_data.vs->separate_segments;
+                 shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0127-v3dv-use-new-texture-shader-state-rb_swap-and-revers.patch b/projects/RPi/devices/RPi5/patches/mesa/0127-v3dv-use-new-texture-shader-state-rb_swap-and-revers.patch
new file mode 100644
index 0000000000..81357ea2f9
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0127-v3dv-use-new-texture-shader-state-rb_swap-and-revers.patch
@@ -0,0 +1,296 @@
+From c74ba2b39e7b9fe6c5415c20c98cd231d2674df6 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Tue, 16 May 2023 00:38:40 +0200
+Subject: [PATCH 127/142] v3dv: use new texture shader state rb_swap and
+ reverse fields in v3d 7.x
+
+In v3d 4.x we handle formats that are reversed or R/B swapped by
+applying a format swizzle. This doesn't work on border colors though,
+and for that there is a specific bit to reverse the border color in
+the texture shader state.
+
+In v3d 7.x we have new reverse and swap R/B bits and we no longer have
+a bit to reverse the border color because the new reverse bit applies
+to border texels too. Because of this, we absolutely need to use these
+new bits in order to get correct border colors in all cases with these
+formats.
+
+When we enable the reverse and/or swap R/B bits, we are effectively
+applying the format swizzle through them, so in these cases we need to
+make sure the swizzle we program in the texture shader state is the
+view swizzle provided by the API and not the composition of the format
+swizzle with the view swizzle like we do in 4.x for all formats. The
+same applies to custom border colors: we must not apply the format
+swizzle to them for formats that are reversed or R/B swapped, because
+again, this format swizzle is already applied through these new bits.
+
+While we are doing this, we also fully adopt the texture shader state
+spec from v3d 7.1.5 for v3d 7.x instead of using a description from
+7.1.2 which is incompatible and required the driver to manually pack
+some of the bits.
+---
+ src/broadcom/vulkan/v3dv_device.c    |  2 +-
+ src/broadcom/vulkan/v3dv_image.c     |  7 ++--
+ src/broadcom/vulkan/v3dv_private.h   | 13 ++++++-
+ src/broadcom/vulkan/v3dvx_device.c   | 24 ++++++++++--
+ src/broadcom/vulkan/v3dvx_image.c    | 56 ++++++++++++++++++----------
+ src/broadcom/vulkan/v3dvx_private.h  |  3 +-
+ src/gallium/drivers/v3d/v3dx_state.c |  6 ---
+ 7 files changed, 75 insertions(+), 36 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index 1de9b5ce683..b520bfa0002 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -2989,7 +2989,7 @@ v3dv_CreateSampler(VkDevice _device,
+       }
+    }
+ 
+-   v3dv_X(device, pack_sampler_state)(sampler, pCreateInfo, bc_info);
++   v3dv_X(device, pack_sampler_state)(device, sampler, pCreateInfo, bc_info);
+ 
+    *pSampler = v3dv_sampler_to_handle(sampler);
+ 
+diff --git a/src/broadcom/vulkan/v3dv_image.c b/src/broadcom/vulkan/v3dv_image.c
+index ebbd60e4c03..e01e2e1bd19 100644
+--- a/src/broadcom/vulkan/v3dv_image.c
++++ b/src/broadcom/vulkan/v3dv_image.c
+@@ -671,7 +671,6 @@ create_image_view(struct v3dv_device *device,
+     * makes sense to implement swizzle composition using VkSwizzle directly.
+     */
+    VkFormat format;
+-   uint8_t image_view_swizzle[4];
+    if (pCreateInfo->format == VK_FORMAT_D24_UNORM_S8_UINT &&
+        range->aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
+       format = VK_FORMAT_R8G8B8A8_UINT;
+@@ -682,11 +681,11 @@ create_image_view(struct v3dv_device *device,
+       vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle, view_swizzle);
+ 
+       util_format_compose_swizzles(stencil_aspect_swizzle, view_swizzle,
+-                                   image_view_swizzle);
++                                   iview->view_swizzle);
+    } else {
+       format = pCreateInfo->format;
+       vk_component_mapping_to_pipe_swizzle(iview->vk.swizzle,
+-                                           image_view_swizzle);
++                                           iview->view_swizzle);
+    }
+ 
+    iview->vk.view_format = format;
+@@ -711,7 +710,7 @@ create_image_view(struct v3dv_device *device,
+ 
+       const uint8_t *format_swizzle =
+          v3dv_get_format_swizzle(device, format, plane);
+-      util_format_compose_swizzles(format_swizzle, image_view_swizzle,
++      util_format_compose_swizzles(format_swizzle, iview->view_swizzle,
+                                    iview->planes[plane].swizzle);
+ 
+       iview->planes[plane].swap_rb = v3dv_format_swizzle_needs_rb_swap(format_swizzle);
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index a074e0a981c..8adb8873efd 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -776,6 +776,8 @@ struct v3dv_image_view {
+ 
+    const struct v3dv_format *format;
+ 
++   uint8_t view_swizzle[4];
++
+    uint8_t plane_count;
+    struct {
+       uint8_t image_plane;
+@@ -786,8 +788,8 @@ struct v3dv_image_view {
+       uint32_t internal_type;
+       uint32_t offset;
+ 
+-      /* Precomputed (composed from createinfo->components and formar swizzle)
+-       * swizzles to pass in to the shader key.
++      /* Precomputed swizzle (composed from the view swizzle and the format
++       * swizzle).
+        *
+        * This could be also included on the descriptor bo, but the shader state
+        * packet doesn't need it on a bo, so we can just avoid a memory copy
+@@ -2358,6 +2360,13 @@ struct v3dv_pipeline {
+    uint8_t stencil_cfg[2][V3DV_STENCIL_CFG_LENGTH];
+ };
+ 
++static inline bool
++v3dv_texture_shader_state_has_rb_swap_reverse_bits(const struct v3dv_device *device)
++{
++   return device->devinfo.ver > 71 ||
++          (device->devinfo.ver == 71 && device->devinfo.rev >= 5);
++}
++
+ static inline VkPipelineBindPoint
+ v3dv_pipeline_get_binding_point(struct v3dv_pipeline *pipeline)
+ {
+diff --git a/src/broadcom/vulkan/v3dvx_device.c b/src/broadcom/vulkan/v3dvx_device.c
+index 61ad98c1217..1b50d51e19f 100644
+--- a/src/broadcom/vulkan/v3dvx_device.c
++++ b/src/broadcom/vulkan/v3dvx_device.c
+@@ -50,6 +50,7 @@ vk_to_v3d_compare_func[] = {
+ };
+ 
+ static union pipe_color_union encode_border_color(
++   const struct v3dv_device *device,
+    const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
+ {
+    const struct util_format_description *desc =
+@@ -76,12 +77,28 @@ static union pipe_color_union encode_border_color(
+     * colors so we need to fix up the swizzle manually for this case.
+     */
+    uint8_t swizzle[4];
+-   if (v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
++   const bool v3d_has_reverse_swap_rb_bits =
++      v3dv_texture_shader_state_has_rb_swap_reverse_bits(device);
++   if (!v3d_has_reverse_swap_rb_bits &&
++       v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle) &&
+        v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle)) {
+       swizzle[0] = PIPE_SWIZZLE_W;
+       swizzle[1] = PIPE_SWIZZLE_X;
+       swizzle[2] = PIPE_SWIZZLE_Y;
+       swizzle[3] = PIPE_SWIZZLE_Z;
++   }
++   /* In v3d 7.x we no longer have a reverse flag for the border color. Instead
++    * we have to use the new reverse and swap_r/b flags in the texture shader
++    * state which will apply the format swizzle automatically when sampling
++    * the border color too and we should not apply it manually here.
++    */
++   else if (v3d_has_reverse_swap_rb_bits &&
++            (v3dv_format_swizzle_needs_rb_swap(format->planes[0].swizzle) ||
++             v3dv_format_swizzle_needs_reverse(format->planes[0].swizzle))) {
++      swizzle[0] = PIPE_SWIZZLE_X;
++      swizzle[1] = PIPE_SWIZZLE_Y;
++      swizzle[2] = PIPE_SWIZZLE_Z;
++      swizzle[3] = PIPE_SWIZZLE_W;
+    } else {
+       memcpy(swizzle, format->planes[0].swizzle, sizeof (swizzle));
+    }
+@@ -179,7 +196,8 @@ static union pipe_color_union encode_border_color(
+ }
+ 
+ void
+-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
++v3dX(pack_sampler_state)(const struct v3dv_device *device,
++                         struct v3dv_sampler *sampler,
+                          const VkSamplerCreateInfo *pCreateInfo,
+                          const VkSamplerCustomBorderColorCreateInfoEXT *bc_info)
+ {
+@@ -221,7 +239,7 @@ v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
+       s.border_color_mode = border_color_mode;
+ 
+       if (s.border_color_mode == V3D_BORDER_COLOR_FOLLOWS) {
+-         union pipe_color_union border = encode_border_color(bc_info);
++         union pipe_color_union border = encode_border_color(device, bc_info);
+ 
+          s.border_color_word_0 = border.ui[0];
+          s.border_color_word_1 = border.ui[1];
+diff --git a/src/broadcom/vulkan/v3dvx_image.c b/src/broadcom/vulkan/v3dvx_image.c
+index ae6eaa88d0c..de984e81220 100644
+--- a/src/broadcom/vulkan/v3dvx_image.c
++++ b/src/broadcom/vulkan/v3dvx_image.c
+@@ -108,25 +108,6 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
+ 
+          tex.array_stride_64_byte_aligned = image->planes[iplane].cube_map_stride / 64;
+ 
+-         bool is_srgb = vk_format_is_srgb(image_view->vk.format);
+-#if V3D_VERSION == 42
+-         tex.reverse_standard_border_color = image_view->planes[plane].channel_reverse;
+-#endif
+-
+-#if V3D_VERSION == 42
+-         tex.srgb = is_srgb;
+-#endif
+-#if V3D_VERSION >= 71
+-         tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
+-
+-         /* V3D 7.1.5 has array stride starting one bit later than previous
+-          * V3D versions to make room for the new RB swap bit, but we don't
+-          * handle that in the CLE parser.
+-          */
+-         if (device->devinfo.rev >= 5)
+-            tex.array_stride_64_byte_aligned <<= 1;
+-#endif
+-
+          /* At this point we don't have the job. That's the reason the first
+           * parameter is NULL, to avoid a crash when cl_pack_emit_reloc tries to
+           * add the bo to the job. This also means that we need to add manually
+@@ -138,7 +119,44 @@ pack_texture_shader_state_helper(struct v3dv_device *device,
+                               iplane);
+          tex.texture_base_pointer = v3dv_cl_address(NULL, base_offset);
+ 
++         bool is_srgb = vk_format_is_srgb(image_view->vk.format);
++
++         /* V3D 4.x doesn't have the reverse and swap_r/b bits, so we compose
++          * the reverse and/or swap_r/b swizzle from the format table with the
++          * image view swizzle. This, however, doesn't work for border colors,
++          * for that there is the reverse_standard_border_color.
++          *
++          * In v3d 7.x, however, there is no reverse_standard_border_color bit,
++          * since the reverse and swap_r/b bits also affect border colors. It is
++          * because of this that we absolutely need to use these bits with
++          * reversed and swapped formats, since that's the only way to ensure
++          * correct border colors. In that case we don't want to program the
++          * swizzle to the composition of the format swizzle and the view
++          * swizzle like we do in v3d 4.x, since the format swizzle is applied
++          * via the reverse and swap_r/b bits.
++          */
++#if V3D_VERSION == 42
++         tex.srgb = is_srgb;
++         tex.reverse_standard_border_color =
++            image_view->planes[plane].channel_reverse;
++#endif
+ #if V3D_VERSION >= 71
++         tex.transfer_func = is_srgb ? TRANSFER_FUNC_SRGB : TRANSFER_FUNC_NONE;
++
++         tex.reverse = image_view->planes[plane].channel_reverse;
++         tex.r_b_swap = image_view->planes[plane].swap_rb;
++
++         if (tex.reverse || tex.r_b_swap) {
++            tex.swizzle_r =
++               v3d_translate_pipe_swizzle(image_view->view_swizzle[0]);
++            tex.swizzle_g =
++               v3d_translate_pipe_swizzle(image_view->view_swizzle[1]);
++            tex.swizzle_b =
++               v3d_translate_pipe_swizzle(image_view->view_swizzle[2]);
++            tex.swizzle_a =
++               v3d_translate_pipe_swizzle(image_view->view_swizzle[3]);
++         }
++
+          tex.chroma_offset_x = 1;
+          tex.chroma_offset_y = 1;
+          /* See comment in XML field definition for rationale of the shifts */
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index 1ce4789c5ac..27d6736c0e3 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -131,7 +131,8 @@ v3dX(get_hw_clear_color)(const VkClearColorValue *color,
+ /* Used at v3dv_device */
+ 
+ void
+-v3dX(pack_sampler_state)(struct v3dv_sampler *sampler,
++v3dX(pack_sampler_state)(const struct v3dv_device *device,
++                         struct v3dv_sampler *sampler,
+                          const VkSamplerCreateInfo *pCreateInfo,
+                          const VkSamplerCustomBorderColorCreateInfoEXT *bc_info);
+ 
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index 970a082aa85..8cca1a5030b 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -960,12 +960,6 @@ v3d_setup_texture_shader_state(const struct v3d_device_info *devinfo,
+         /* See comment in XML field definition for rationale of the shifts */
+         tex->texture_base_pointer_cb = base_offset >> 6;
+         tex->texture_base_pointer_cr = base_offset >> 6;
+-
+-        /* V3D 7.1.5 has array stride start at bit 33 instead of bit 32 to
+-         * make room for the RB swap bit.
+-         */
+-        if (devinfo->rev >= 5)
+-                tex->array_stride_64_byte_aligned <<= 1;
+ #endif
+ 
+         /* Since other platform devices may produce UIF images even
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0128-v3dv-fix-color-write-mask-for-v3d-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0128-v3dv-fix-color-write-mask-for-v3d-7.x.patch
new file mode 100644
index 0000000000..c991d19da5
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0128-v3dv-fix-color-write-mask-for-v3d-7.x.patch
@@ -0,0 +1,34 @@
+From ef1159ad68e4969992a61b1fcdf9103409f689ca Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 8 Feb 2023 08:41:12 +0100
+Subject: [PATCH 128/142] v3dv: fix color write mask for v3d 7.x
+
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 580aeb8ba2b..6827c829934 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1627,9 +1627,15 @@ v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
+ 
+    struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
+    struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
++   uint32_t color_write_mask = ~dynamic->color_write_enable |
++                               pipeline->blend.color_write_masks;
++#if V3D_VERSION <= 42
++   /* Only 4 RTs */
++   color_write_mask &= 0xffff;
++#endif
++
+    cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
+-      mask.mask = (~dynamic->color_write_enable |
+-                   pipeline->blend.color_write_masks) & 0xffff;
++      mask.mask = color_write_mask;
+    }
+ 
+    cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_COLOR_WRITE_ENABLE;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0129-v3d-v3dv-fix-depth-bias-for-v3d-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0129-v3d-v3dv-fix-depth-bias-for-v3d-7.x.patch
new file mode 100644
index 0000000000..61b2e9a859
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0129-v3d-v3dv-fix-depth-bias-for-v3d-7.x.patch
@@ -0,0 +1,68 @@
+From aee0180b79a6a546d1e7263d89ef868016082687 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 8 Feb 2023 09:04:02 +0100
+Subject: [PATCH 129/142] v3d,v3dv: fix depth bias for v3d 7.x
+
+In v3d 7.x we don't need to scale up depth bias for D16 buffers.
+---
+ src/broadcom/vulkan/v3dvx_cmd_buffer.c | 2 ++
+ src/gallium/drivers/v3d/v3dx_emit.c    | 3 ++-
+ src/gallium/drivers/v3d/v3dx_state.c   | 4 +++-
+ 3 files changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dvx_cmd_buffer.c b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+index 6827c829934..1bd634f5027 100644
+--- a/src/broadcom/vulkan/v3dvx_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dvx_cmd_buffer.c
+@@ -1499,8 +1499,10 @@ v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
+    cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
+       bias.depth_offset_factor = dynamic->depth_bias.slope_factor;
+       bias.depth_offset_units = dynamic->depth_bias.constant_factor;
++#if V3D_VERSION <= 42
+       if (pipeline->depth_bias.is_z16)
+          bias.depth_offset_units *= 256.0f;
++#endif
+       bias.limit = dynamic->depth_bias.depth_bias_clamp;
+    }
+ 
+diff --git a/src/gallium/drivers/v3d/v3dx_emit.c b/src/gallium/drivers/v3d/v3dx_emit.c
+index 87e75281dc9..82a45e44f82 100644
+--- a/src/gallium/drivers/v3d/v3dx_emit.c
++++ b/src/gallium/drivers/v3d/v3dx_emit.c
+@@ -558,7 +558,8 @@ v3dX(emit_state)(struct pipe_context *pctx)
+ 
+         if (v3d->dirty & V3D_DIRTY_RASTERIZER &&
+             v3d->rasterizer->base.offset_tri) {
+-                if (job->zsbuf &&
++                if (v3d->screen->devinfo.ver <= 42 &&
++                    job->zsbuf &&
+                     job->zsbuf->format == PIPE_FORMAT_Z16_UNORM) {
+                         cl_emit_prepacked_sized(&job->bcl,
+                                                 v3d->rasterizer->depth_offset_z16,
+diff --git a/src/gallium/drivers/v3d/v3dx_state.c b/src/gallium/drivers/v3d/v3dx_state.c
+index 8cca1a5030b..a7fad572a2d 100644
+--- a/src/gallium/drivers/v3d/v3dx_state.c
++++ b/src/gallium/drivers/v3d/v3dx_state.c
+@@ -111,9 +111,10 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
+ #endif
+         }
+ 
+-        /* The HW treats polygon offset units based on a Z24 buffer, so we
++        /* V3d 4.x treats polygon offset units based on a Z24 buffer, so we
+          * need to scale up offset_units if we're only Z16.
+          */
++#if V3D_VERSION <= 42
+         v3dx_pack(&so->depth_offset_z16, DEPTH_OFFSET, depth) {
+                 depth.depth_offset_factor = cso->offset_scale;
+                 depth.depth_offset_units = cso->offset_units * 256.0;
+@@ -121,6 +122,7 @@ v3d_create_rasterizer_state(struct pipe_context *pctx,
+                 depth.limit = cso->offset_clamp;
+ #endif
+         }
++#endif
+ 
+         return so;
+ }
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0130-v3d-v3dv-fix-compute-for-V3D-7.1.6.patch b/projects/RPi/devices/RPi5/patches/mesa/0130-v3d-v3dv-fix-compute-for-V3D-7.1.6.patch
new file mode 100644
index 0000000000..2d0a54aa83
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0130-v3d-v3dv-fix-compute-for-V3D-7.1.6.patch
@@ -0,0 +1,141 @@
+From 221d4079c616752b249cefb352268fce5758b578 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Thu, 9 Mar 2023 19:05:19 +0100
+Subject: [PATCH 130/142] v3d,v3dv: fix compute for V3D 7.1.6+
+
+---
+ src/broadcom/vulkan/v3dv_cmd_buffer.c | 25 +++++++++++++++++++++----
+ src/broadcom/vulkan/v3dv_private.h    |  3 ++-
+ src/broadcom/vulkan/v3dv_queue.c      |  2 +-
+ src/gallium/drivers/v3d/v3dx_draw.c   | 14 +++++++++++---
+ 4 files changed, 35 insertions(+), 9 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+index 36bd7960985..609c7acfa8f 100644
+--- a/src/broadcom/vulkan/v3dv_cmd_buffer.c
++++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c
+@@ -3816,6 +3816,7 @@ cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
+ 
+ void
+ v3dv_cmd_buffer_rewrite_indirect_csd_job(
++   struct v3dv_device *device,
+    struct v3dv_csd_indirect_cpu_job_info *info,
+    const uint32_t *wg_counts)
+ {
+@@ -3835,8 +3836,15 @@ v3dv_cmd_buffer_rewrite_indirect_csd_job(
+    submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+    submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
+ 
+-   submit->cfg[4] = DIV_ROUND_UP(info->wg_size, 16) *
+-                    (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1;
++   uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
++                          (wg_counts[0] * wg_counts[1] * wg_counts[2]);
++   /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
++   if (device->devinfo.ver < 71 ||
++       (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
++      submit->cfg[4] = num_batches - 1;
++   } else {
++      submit->cfg[4] = num_batches;
++   }
+    assert(submit->cfg[4] != ~0);
+ 
+    if (info->needs_wg_uniform_rewrite) {
+@@ -3869,6 +3877,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
+                           uint32_t **wg_uniform_offsets_out,
+                           uint32_t *wg_size_out)
+ {
++   struct v3dv_device *device = cmd_buffer->device;
+    struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
+    assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
+    struct v3dv_shader_variant *cs_variant =
+@@ -3927,18 +3936,26 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
+    if (wg_size_out)
+       *wg_size_out = wg_size;
+ 
+-   submit->cfg[4] = num_batches - 1;
++   /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
++   if (device->devinfo.ver < 71 ||
++       (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
++      submit->cfg[4] = num_batches - 1;
++   } else {
++      submit->cfg[4] = num_batches;
++   }
+    assert(submit->cfg[4] != ~0);
+ 
+    assert(pipeline->shared_data->assembly_bo);
+    struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
+ 
+    submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
+-   submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
+    if (cs_variant->prog_data.base->single_seg)
+       submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
+    if (cs_variant->prog_data.base->threads == 4)
+       submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
++   /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved  */
++   if (device->devinfo.ver < 71)
++      submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
+ 
+    if (cs_variant->prog_data.cs->shared_size > 0) {
+       job->csd.shared_memory =
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index 8adb8873efd..2f3ef185126 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -1818,7 +1818,8 @@ void v3dv_cmd_buffer_copy_query_results(struct v3dv_cmd_buffer *cmd_buffer,
+ void v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
+                                  struct drm_v3d_submit_tfu *tfu);
+ 
+-void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_csd_indirect_cpu_job_info *info,
++void v3dv_cmd_buffer_rewrite_indirect_csd_job(struct v3dv_device *device,
++                                              struct v3dv_csd_indirect_cpu_job_info *info,
+                                               const uint32_t *wg_counts);
+ 
+ void v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
+diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c
+index b4aae195180..429d14a9196 100644
+--- a/src/broadcom/vulkan/v3dv_queue.c
++++ b/src/broadcom/vulkan/v3dv_queue.c
+@@ -408,7 +408,7 @@ handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
+ 
+    if (memcmp(group_counts, info->csd_job->csd.wg_count,
+               sizeof(info->csd_job->csd.wg_count)) != 0) {
+-      v3dv_cmd_buffer_rewrite_indirect_csd_job(info, group_counts);
++      v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
+    }
+ 
+    return VK_SUCCESS;
+diff --git a/src/gallium/drivers/v3d/v3dx_draw.c b/src/gallium/drivers/v3d/v3dx_draw.c
+index e4b414b0676..4e1af41d50e 100644
+--- a/src/gallium/drivers/v3d/v3dx_draw.c
++++ b/src/gallium/drivers/v3d/v3dx_draw.c
+@@ -1473,8 +1473,15 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
+         submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
+ 
+ 
+-        /* Number of batches the dispatch will invoke (minus 1). */
+-        submit.cfg[4] = num_batches - 1;
++        /* Number of batches the dispatch will invoke.
++         * V3D 7.1.6 and later don't subtract 1 from the number of batches
++         */
++        if (v3d->screen->devinfo.ver < 71 ||
++            (v3d->screen->devinfo.ver == 71 && v3d->screen->devinfo.rev < 6)) {
++                submit.cfg[4] = num_batches - 1;
++        } else {
++                submit.cfg[4] = num_batches;
++        }
+ 
+         /* Make sure we didn't accidentally underflow. */
+         assert(submit.cfg[4] != ~0);
+@@ -1482,7 +1489,8 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
+         v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo);
+         submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset +
+                          v3d->prog.compute->offset);
+-        submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
++        if (v3d->screen->devinfo.ver < 71)
++                submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
+         if (v3d->prog.compute->prog_data.base->single_seg)
+                 submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
+         if (v3d->prog.compute->prog_data.base->threads == 4)
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0131-broadcom-add-performance-counters-for-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0131-broadcom-add-performance-counters-for-V3D-7.x.patch
new file mode 100644
index 0000000000..b4270672ec
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0131-broadcom-add-performance-counters-for-V3D-7.x.patch
@@ -0,0 +1,567 @@
+From be6c7ba62dbdb9c5babd33a518a042dd554679d7 Mon Sep 17 00:00:00 2001
+From: "Juan A. Suarez Romero" <jasuarez@igalia.com>
+Date: Wed, 22 Feb 2023 09:43:40 +0100
+Subject: [PATCH 131/142] broadcom: add performance counters for V3D 7.x
+
+Some of the counters need to be defined correctly.
+
+v2: Remove references to extended performance counters. The hw does
+    not support them.
+
+Signed-off-by: Juan A. Suarez Romero <jasuarez@igalia.com>
+---
+ .../common/v3d_performance_counters.h         | 108 ++++++++++++++++++
+ src/broadcom/simulator/v3d_simulator.c        |   8 +-
+ src/broadcom/simulator/v3dx_simulator.c       |   2 +-
+ src/broadcom/vulkan/meson.build               |   1 +
+ src/broadcom/vulkan/v3dv_private.h            |   7 +-
+ src/broadcom/vulkan/v3dv_query.c              |  43 +------
+ src/broadcom/vulkan/v3dvx_private.h           |   6 +
+ src/broadcom/vulkan/v3dvx_query.c             |  67 +++++++++++
+ src/gallium/drivers/v3d/meson.build           |   2 +-
+ src/gallium/drivers/v3d/v3d_query.c           |  20 +++-
+ src/gallium/drivers/v3d/v3d_query.h           |   6 -
+ src/gallium/drivers/v3d/v3dx_context.h        |  10 ++
+ ...d_query_perfcnt.c => v3dx_query_perfcnt.c} |  12 +-
+ 13 files changed, 233 insertions(+), 59 deletions(-)
+ create mode 100644 src/broadcom/vulkan/v3dvx_query.c
+ rename src/gallium/drivers/v3d/{v3d_query_perfcnt.c => v3dx_query_perfcnt.c} (94%)
+
+diff --git a/src/broadcom/common/v3d_performance_counters.h b/src/broadcom/common/v3d_performance_counters.h
+index 08d750c2cbe..a8f0cff8784 100644
+--- a/src/broadcom/common/v3d_performance_counters.h
++++ b/src/broadcom/common/v3d_performance_counters.h
+@@ -28,6 +28,110 @@
+ #define V3D_PERFCNT_NAME 1
+ #define V3D_PERFCNT_DESCRIPTION 2
+ 
++#ifndef V3D_VERSION
++#  error "The V3D_VERSION macro must be defined"
++#endif
++
++#if (V3D_VERSION >= 71)
++
++static const char *v3d_performance_counters[][3] = {
++   {"CORE", "cycle-count", "[CORE] Cycle counter"},
++   {"CORE", "core-active", "[CORE] Bin/Render/Compute active cycles"},
++   {"CLE", "CLE-bin-thread-active-cycles", "[CLE] Bin thread active cycles"},
++   {"CLE", "CLE-render-thread-active-cycles", "[CLE] Render thread active cycles"},
++   {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
++   {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
++   {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
++   {"FEP", "FEP-clipped-quads", "[FEP] Early-Z/Near/Far clipped quads"},
++   {"FEP", "FEP-valid-quads", "[FEP] Valid quads"},
++   {"TLB", "TLB-quads-not-passing-stencil-test", "[TLB] Quads with no pixels passing the stencil test"},
++   {"TLB", "TLB-quads-not-passing-z-and-stencil-test", "[TLB] Quads with no pixels passing the Z and stencil tests"},
++   {"TLB", "TLB-quads-passing-z-and-stencil-test", "[TLB] Quads with any pixels passing the Z and stencil tests"},
++   {"TLB", "TLB-quads-written-to-color-buffer", "[TLB] Quads with valid pixels written to colour buffer"},
++   {"TLB", "TLB-partial-quads-written-to-color-buffer", "[TLB] Partial quads written to the colour buffer"},
++   {"PTB", "PTB-primitives-need-clipping", "[PTB] Primitives that need clipping"},
++   {"PTB", "PTB-primitives-discarded-outside-viewport", "[PTB] Primitives discarded by being outside the viewport"},
++   {"PTB", "PTB-primitives-binned", "[PTB] Total primitives binned"},
++   {"PTB", "PTB-primitives-discarded-reversed", "[PTB] Primitives that are discarded because they are reversed"},
++   {"QPU", "QPU-total-instr-cache-hit", "[QPU] Total instruction cache hits for all slices"},
++   {"QPU", "QPU-total-instr-cache-miss", "[QPU] Total instruction cache misses for all slices"},
++   {"QPU", "QPU-total-uniform-cache-hit", "[QPU] Total uniforms cache hits for all slices"},
++   {"QPU", "QPU-total-uniform-cache-miss", "[QPU] Total uniforms cache misses for all slices"},
++   {"TMU", "TMU-active-cycles", "[TMU] Active cycles"},
++   {"TMU", "TMU-stalled-cycles", "[TMU] Stalled cycles"},
++   {"TMU", "TMU-total-text-quads-access", "[TMU] Total texture cache accesses"},
++   {"TMU", "TMU-cache-x4-active-cycles", "[TMU] Cache active cycles for x4 access"},
++   {"TMU", "TMU-cache-x4-stalled-cycles", "[TMU] Cache stalled cycles for x4 access"},
++   {"TMU", "TMU-total-text-quads-x4-access", "[TMU] Total texture cache x4 access"},
++   {"L2T", "L2T-total-cache-hit", "[L2T] Total Level 2 cache hits"},
++   {"L2T", "L2T-total-cache-miss", "[L2T] Total Level 2 cache misses"},
++   {"L2T", "L2T-local", "[L2T] Local mode access"},
++   {"L2T", "L2T-writeback", "[L2T] Writeback"},
++   {"L2T", "L2T-zero", "[L2T] Zero"},
++   {"L2T", "L2T-merge", "[L2T] Merge"},
++   {"L2T", "L2T-fill", "[L2T] Fill"},
++   {"L2T", "L2T-stalls-no-wid", "[L2T] Stalls because no WID available"},
++   {"L2T", "L2T-stalls-no-rid", "[L2T] Stalls because no RID available"},
++   {"L2T", "L2T-stalls-queue-full", "[L2T] Stalls because internal queue full"},
++   {"L2T", "L2T-stalls-wrightback", "[L2T] Stalls because writeback in flight"},
++   {"L2T", "L2T-stalls-mem", "[L2T] Stalls because AXI blocks read"},
++   {"L2T", "L2T-stalls-fill", "[L2T] Stalls because fill pending for victim cache-line"},
++   {"L2T", "L2T-hitq", "[L2T] Sent request via hit queue"},
++   {"L2T", "L2T-hitq-full", "[L2T] Sent request via main queue because hit queue is full"},
++   {"L2T", "L2T-stalls-read-data", "[L2T] Stalls because waiting for data from SDRAM"},
++   {"L2T", "L2T-TMU-read-hits", "[L2T] TMU read hits"},
++   {"L2T", "L2T-TMU-read-miss", "[L2T] TMU read misses"},
++   {"L2T", "L2T-VCD-read-hits", "[L2T] VCD read hits"},
++   {"L2T", "L2T-VCD-read-miss", "[L2T] VCD read misses"},
++   {"L2T", "L2T-SLC-read-hits", "[L2T] SLC read hits (all slices)"},
++   {"L2T", "L2T-SLC-read-miss", "[L2T] SLC read misses (all slices)"},
++   {"AXI", "AXI-writes-seen-watch-0", "[AXI] Writes seen by watch 0"},
++   {"AXI", "AXI-reads-seen-watch-0", "[AXI] Reads seen by watch 0"},
++   {"AXI", "AXI-writes-stalled-seen-watch-0", "[AXI] Write stalls seen by watch 0"},
++   {"AXI", "AXI-reads-stalled-seen-watch-0", "[AXI] Read stalls seen by watch 0"},
++   {"AXI", "AXI-write-bytes-seen-watch-0", "[AXI] Total bytes written seen by watch 0"},
++   {"AXI", "AXI-read-bytes-seen-watch-0", "[AXI] Total bytes read seen by watch 0"},
++   {"AXI", "AXI-writes-seen-watch-1", "[AXI] Writes seen by watch 1"},
++   {"AXI", "AXI-reads-seen-watch-1", "[AXI] Reads seen by watch 1"},
++   {"AXI", "AXI-writes-stalled-seen-watch-1", "[AXI] Write stalls seen by watch 1"},
++   {"AXI", "AXI-reads-stalled-seen-watch-1", "[AXI] Read stalls seen by watch 1"},
++   {"AXI", "AXI-write-bytes-seen-watch-1", "[AXI] Total bytes written seen by watch 1"},
++   {"AXI", "AXI-read-bytes-seen-watch-1", "[AXI] Total bytes read seen by watch 1"},
++   {"CORE", "core-memory-writes", "[CORE] Total memory writes"},
++   {"L2T", "L2T-memory-writes", "[L2T] Total memory writes"},
++   {"PTB", "PTB-memory-writes", "[PTB] Total memory writes"},
++   {"TLB", "TLB-memory-writes", "[TLB] Total memory writes"},
++   {"CORE", "core-memory-reads", "[CORE] Total memory reads"},
++   {"L2T", "L2T-memory-reads", "[L2T] Total memory reads"},
++   {"PTB", "PTB-memory-reads", "[PTB] Total memory reads"},
++   {"PSE", "PSE-memory-reads", "[PSE] Total memory reads"},
++   {"TLB", "TLB-memory-reads", "[TLB] Total memory reads"},
++   {"PTB", "PTB-memory-words-writes", "[PTB] Total memory words written"},
++   {"TLB", "TLB-memory-words-writes", "[TLB] Total memory words written"},
++   {"PSE", "PSE-memory-words-reads", "[PSE] Total memory words read"},
++   {"TLB", "TLB-memory-words-reads", "[TLB] Total memory words read"},
++   {"AXI", "AXI-read-trans", "[AXI] Read transaction count"},
++   {"AXI", "AXI-write-trans", "[AXI] Write transaction count"},
++   {"AXI", "AXI-read-wait-cycles", "[AXI] Read total wait cycles"},
++   {"AXI", "AXI-write-wait-cycles", "[AXI] Write total wait cycles"},
++   {"AXI", "AXI-max-outstanding-reads", "[AXI] Maximum outstanding read transactions"},
++   {"AXI", "AXI-max-outstanding-writes", "[AXI] Maximum outstanding write transactions"},
++   {"QPU", "QPU-wait-bubble", "[QPU] Pipeline bubble in qcycles due all threads waiting"},
++   {"QPU", "QPU-ic-miss-bubble", "[QPU] Pipeline bubble in qcycles due instruction-cache miss"},
++   {"QPU", "QPU-active", "[QPU] Executed shader instruction"},
++   {"QPU", "QPU-total-active-clk-cycles-fragment-shading", "[QPU] Total active clock cycles for all QPUs doing fragment shading (counts only when QPU is not stalled)"},
++   {"QPU", "QPU-stalls", "[QPU] Stalled qcycles executing shader instruction"},
++   {"QPU", "QPU-total-clk-cycles-waiting-fragment-shading", "[QPU] Total stalled clock cycles for all QPUs doing fragment shading"},
++   {"QPU", "QPU-stalls-TMU", "[QPU] Stalled qcycles waiting for TMU"},
++   {"QPU", "QPU-stalls-TLB", "[QPU] Stalled qcycles waiting for TLB"},
++   {"QPU", "QPU-stalls-VPM", "[QPU] Stalled qcycles waiting for VPM"},
++   {"QPU", "QPU-stalls-uniforms", "[QPU] Stalled qcycles waiting for uniforms"},
++   {"QPU", "QPU-stalls-SFU", "[QPU] Stalled qcycles waiting for SFU"},
++   {"QPU", "QPU-stalls-other", "[QPU] Stalled qcycles waiting for any other reason (vary/W/Z)"},
++};
++
++#elif (V3D_VERSION >= 41)
++
+ static const char *v3d_performance_counters[][3] = {
+    {"FEP", "FEP-valid-primitives-no-rendered-pixels", "[FEP] Valid primitives that result in no rendered pixels, for all rendered tiles"},
+    {"FEP", "FEP-valid-primitives-rendered-pixels", "[FEP] Valid primitives for all rendered tiles (primitives may be counted in more than one tile)"},
+@@ -118,4 +222,8 @@ static const char *v3d_performance_counters[][3] = {
+    {"CORE", "compute-active-cycles", "[CORE] Compute active cycles"},
+ };
+ 
++#else
++static const char *v3d_performance_counters[][3] = { };
++#endif
++
+ #endif
+diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
+index 5cceb1a82cc..36e719296f4 100644
+--- a/src/broadcom/simulator/v3d_simulator.c
++++ b/src/broadcom/simulator/v3d_simulator.c
+@@ -92,6 +92,9 @@ static struct v3d_simulator_state {
+         /** Last performance monitor ID. */
+         uint32_t last_perfid;
+ 
++        /** Total performance counters */
++        uint32_t perfcnt_total;
++
+         struct util_dynarray bin_oom;
+         int refcount;
+ } sim_state = {
+@@ -751,7 +754,7 @@ v3d_simulator_perfmon_create_ioctl(int fd, struct drm_v3d_perfmon_create *args)
+ 
+         perfmon->ncounters = args->ncounters;
+         for (int i = 0; i < args->ncounters; i++) {
+-                if (args->counters[i] >= V3D_PERFCNT_NUM) {
++                if (args->counters[i] >= sim_state.perfcnt_total) {
+                         ralloc_free(perfmon);
+                         return -EINVAL;
+                 } else {
+@@ -918,13 +921,16 @@ v3d_simulator_init_global()
+         switch(sim_state.ver) {
+         case 33:
+                 v3d33_simulator_init_regs(sim_state.v3d);
++                sim_state.perfcnt_total = 0;
+                 break;
+         case 41:
+         case 42:
+                 v3d41_simulator_init_regs(sim_state.v3d);
++                sim_state.perfcnt_total = 87;
+                 break;
+         case 71:
+                 v3d71_simulator_init_regs(sim_state.v3d);
++                sim_state.perfcnt_total = 93;
+                 break;
+         default:
+                 unreachable("Not supported V3D version\n");
+diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
+index 4ea177c9bb7..4520fe75719 100644
+--- a/src/broadcom/simulator/v3dx_simulator.c
++++ b/src/broadcom/simulator/v3dx_simulator.c
+@@ -50,7 +50,7 @@
+ #include "libs/core/v3d/registers/7.1.5.1/v3d.h"
+ #else
+ #if V3D_VERSION == 41 || V3D_VERSION == 42
+-#include "libs/core/v3d/registers/4.1.35.0/v3d.h"
++#include "libs/core/v3d/registers/4.2.14.0/v3d.h"
+ #else
+ #include "libs/core/v3d/registers/3.3.0.0/v3d.h"
+ #endif
+diff --git a/src/broadcom/vulkan/meson.build b/src/broadcom/vulkan/meson.build
+index 3da7364686f..182388a35b4 100644
+--- a/src/broadcom/vulkan/meson.build
++++ b/src/broadcom/vulkan/meson.build
+@@ -65,6 +65,7 @@ files_per_version = files(
+   'v3dvx_pipeline.c',
+   'v3dvx_meta_common.c',
+   'v3dvx_pipeline.c',
++  'v3dvx_query.c',
+   'v3dvx_queue.c',
+ )
+ 
+diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h
+index 2f3ef185126..89e2f1c7e5c 100644
+--- a/src/broadcom/vulkan/v3dv_private.h
++++ b/src/broadcom/vulkan/v3dv_private.h
+@@ -123,6 +123,9 @@ struct v3d_simulator_file;
+ /* Minimum required by the Vulkan 1.1 spec */
+ #define MAX_MEMORY_ALLOCATION_SIZE (1ull << 30)
+ 
++/* Maximum number of performance counters on any supported V3D version */
++#define V3D_MAX_PERFCNT 93
++
+ struct v3dv_physical_device {
+    struct vk_physical_device vk;
+ 
+@@ -1210,7 +1213,7 @@ struct v3dv_timestamp_query_cpu_job_info {
+ };
+ 
+ /* Number of perfmons required to handle all supported performance counters */
+-#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_PERFCNT_NUM, \
++#define V3DV_MAX_PERFMONS DIV_ROUND_UP(V3D_MAX_PERFCNT, \
+                                        DRM_V3D_MAX_PERF_COUNTERS)
+ 
+ struct v3dv_perf_query {
+@@ -1682,7 +1685,7 @@ struct v3dv_query_pool {
+    /* Only used with performance queries */
+    struct {
+       uint32_t ncounters;
+-      uint8_t counters[V3D_PERFCNT_NUM];
++      uint8_t counters[V3D_MAX_PERFCNT];
+ 
+       /* V3D has a limit on the number of counters we can track in a
+        * single performance monitor, so if too many counters are requested
+diff --git a/src/broadcom/vulkan/v3dv_query.c b/src/broadcom/vulkan/v3dv_query.c
+index 3284c467d74..deb7821f02b 100644
+--- a/src/broadcom/vulkan/v3dv_query.c
++++ b/src/broadcom/vulkan/v3dv_query.c
+@@ -23,7 +23,6 @@
+ 
+ #include "v3dv_private.h"
+ 
+-#include "common/v3d_performance_counters.h"
+ #include "util/timespec.h"
+ #include "compiler/nir/nir_builder.h"
+ 
+@@ -48,7 +47,7 @@ kperfmon_create(struct v3dv_device *device,
+                            DRM_IOCTL_V3D_PERFMON_CREATE,
+                            &req);
+       if (ret)
+-         fprintf(stderr, "Failed to create perfmon: %s\n", strerror(ret));
++         fprintf(stderr, "Failed to create perfmon for query %d: %s\n", query, strerror(ret));
+ 
+       pool->queries[query].perf.kperfmon_ids[i] = req.id;
+    }
+@@ -303,7 +302,6 @@ v3dv_CreateQueryPool(VkDevice _device,
+                               QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
+ 
+       assert(pq_info);
+-      assert(pq_info->counterIndexCount <= V3D_PERFCNT_NUM);
+ 
+       pool->perfmon.ncounters = pq_info->counterIndexCount;
+       for (uint32_t i = 0; i < pq_info->counterIndexCount; i++)
+@@ -592,7 +590,7 @@ write_performance_query_result(struct v3dv_device *device,
+    assert(pool && pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
+ 
+    struct v3dv_query *q = &pool->queries[query];
+-   uint64_t counter_values[V3D_PERFCNT_NUM];
++   uint64_t counter_values[V3D_MAX_PERFCNT];
+ 
+    for (uint32_t i = 0; i < pool->perfmon.nperfmons; i++) {
+       struct drm_v3d_perfmon_get_values req = {
+@@ -1284,40 +1282,11 @@ v3dv_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
+    VkPerformanceCounterKHR *pCounters,
+    VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
+ {
+-   uint32_t desc_count = *pCounterCount;
++   V3DV_FROM_HANDLE(v3dv_physical_device, pDevice, physicalDevice);
+ 
+-   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
+-                          out, pCounters, pCounterCount);
+-   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
+-                          out_desc, pCounterDescriptions, &desc_count);
+-
+-   for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
+-      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
+-         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
+-         counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
+-         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
+-
+-         unsigned char sha1_result[20];
+-         _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
+-                            strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
+-                            sha1_result);
+-
+-         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
+-      }
+-
+-      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
+-                               &out_desc, desc) {
+-         desc->flags = 0;
+-         snprintf(desc->name, sizeof(desc->name), "%s",
+-            v3d_performance_counters[i][V3D_PERFCNT_NAME]);
+-         snprintf(desc->category, sizeof(desc->category), "%s",
+-            v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
+-         snprintf(desc->description, sizeof(desc->description), "%s",
+-            v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
+-      }
+-   }
+-
+-   return vk_outarray_status(&out);
++   return v3dv_X(pDevice, enumerate_performance_query_counters)(pCounterCount,
++                                                                pCounters,
++                                                                pCounterDescriptions);
+ }
+ 
+ VKAPI_ATTR void VKAPI_CALL
+diff --git a/src/broadcom/vulkan/v3dvx_private.h b/src/broadcom/vulkan/v3dvx_private.h
+index 27d6736c0e3..0f5887eab93 100644
+--- a/src/broadcom/vulkan/v3dvx_private.h
++++ b/src/broadcom/vulkan/v3dvx_private.h
+@@ -324,6 +324,12 @@ v3dX(create_default_attribute_values)(struct v3dv_device *device,
+ void
+ v3dX(job_emit_noop)(struct v3dv_job *job);
+ 
++/* Used at v3dv_query */
++VkResult
++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
++                                           VkPerformanceCounterKHR *pCounters,
++                                           VkPerformanceCounterDescriptionKHR *pCounterDescriptions);
++
+ /* Used at v3dv_descriptor_set, and other descriptor set utils */
+ uint32_t v3dX(descriptor_bo_size)(VkDescriptorType type);
+ 
+diff --git a/src/broadcom/vulkan/v3dvx_query.c b/src/broadcom/vulkan/v3dvx_query.c
+new file mode 100644
+index 00000000000..e59a1e84ff6
+--- /dev/null
++++ b/src/broadcom/vulkan/v3dvx_query.c
+@@ -0,0 +1,67 @@
++/*
++ * Copyright © 2023 Raspberry Pi Ltd
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a
++ * copy of this software and associated documentation files (the "Software"),
++ * to deal in the Software without restriction, including without limitation
++ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
++ * and/or sell copies of the Software, and to permit persons to whom the
++ * Software is furnished to do so, subject to the following conditions:
++ *
++ * The above copyright notice and this permission notice (including the next
++ * paragraph) shall be included in all copies or substantial portions of the
++ * Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */
++
++#include "v3dv_private.h"
++
++#include "common/v3d_performance_counters.h"
++
++VkResult
++v3dX(enumerate_performance_query_counters)(uint32_t *pCounterCount,
++                                           VkPerformanceCounterKHR *pCounters,
++                                           VkPerformanceCounterDescriptionKHR *pCounterDescriptions)
++{
++   uint32_t desc_count = *pCounterCount;
++
++   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterKHR,
++                          out, pCounters, pCounterCount);
++   VK_OUTARRAY_MAKE_TYPED(VkPerformanceCounterDescriptionKHR,
++                          out_desc, pCounterDescriptions, &desc_count);
++
++   for (int i = 0; i < ARRAY_SIZE(v3d_performance_counters); i++) {
++      vk_outarray_append_typed(VkPerformanceCounterKHR, &out, counter) {
++         counter->unit = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR;
++         counter->scope = VK_PERFORMANCE_COUNTER_SCOPE_COMMAND_KHR;
++         counter->storage = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR;
++
++         unsigned char sha1_result[20];
++         _mesa_sha1_compute(v3d_performance_counters[i][V3D_PERFCNT_NAME],
++                            strlen(v3d_performance_counters[i][V3D_PERFCNT_NAME]),
++                            sha1_result);
++
++         memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
++      }
++
++      vk_outarray_append_typed(VkPerformanceCounterDescriptionKHR,
++                               &out_desc, desc) {
++         desc->flags = 0;
++         snprintf(desc->name, sizeof(desc->name), "%s",
++            v3d_performance_counters[i][V3D_PERFCNT_NAME]);
++         snprintf(desc->category, sizeof(desc->category), "%s",
++            v3d_performance_counters[i][V3D_PERFCNT_CATEGORY]);
++         snprintf(desc->description, sizeof(desc->description), "%s",
++            v3d_performance_counters[i][V3D_PERFCNT_DESCRIPTION]);
++      }
++   }
++
++   return vk_outarray_status(&out);
++}
+diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
+index b2e748573b7..289473d2ca1 100644
+--- a/src/gallium/drivers/v3d/meson.build
++++ b/src/gallium/drivers/v3d/meson.build
+@@ -34,7 +34,6 @@ files_libv3d = files(
+   'v3d_query.c',
+   'v3d_query.h',
+   'v3d_query_pipe.c',
+-  'v3d_query_perfcnt.c',
+   'v3d_resource.c',
+   'v3d_resource.h',
+   'v3d_screen.c',
+@@ -47,6 +46,7 @@ files_per_version = files(
+   'v3dx_emit.c',
+   'v3dx_format_table.c',
+   'v3dx_job.c',
++  'v3dx_query_perfcnt.c',
+   'v3dx_rcl.c',
+   'v3dx_state.c',
+   'v3dx_tfu.c',
+diff --git a/src/gallium/drivers/v3d/v3d_query.c b/src/gallium/drivers/v3d/v3d_query.c
+index db98c89625f..83f82e44a3d 100644
+--- a/src/gallium/drivers/v3d/v3d_query.c
++++ b/src/gallium/drivers/v3d/v3d_query.c
+@@ -28,8 +28,11 @@ v3d_get_driver_query_group_info(struct pipe_screen *pscreen, unsigned index,
+                                 struct pipe_driver_query_group_info *info)
+ {
+         struct v3d_screen *screen = v3d_screen(pscreen);
++        struct v3d_device_info *devinfo = &screen->devinfo;
+ 
+-        return v3d_get_driver_query_group_info_perfcnt(screen, index, info);
++        return v3d_X(devinfo, get_driver_query_group_info_perfcnt)(screen,
++                                                                   index,
++                                                                   info);
+ }
+ 
+ int
+@@ -37,8 +40,11 @@ v3d_get_driver_query_info(struct pipe_screen *pscreen, unsigned index,
+                           struct pipe_driver_query_info *info)
+ {
+         struct v3d_screen *screen = v3d_screen(pscreen);
++        struct v3d_device_info *devinfo = &screen->devinfo;
+ 
+-        return v3d_get_driver_query_info_perfcnt(screen, index, info);
++        return v3d_X(devinfo, get_driver_query_info_perfcnt)(screen,
++                                                             index,
++                                                             info);
+ }
+ 
+ static struct pipe_query *
+@@ -53,9 +59,13 @@ static struct pipe_query *
+ v3d_create_batch_query(struct pipe_context *pctx, unsigned num_queries,
+                        unsigned *query_types)
+ {
+-        return v3d_create_batch_query_perfcnt(v3d_context(pctx),
+-                                              num_queries,
+-                                              query_types);
++        struct v3d_context *v3d = v3d_context(pctx);
++        struct v3d_screen *screen = v3d->screen;
++        struct v3d_device_info *devinfo = &screen->devinfo;
++
++        return v3d_X(devinfo, create_batch_query_perfcnt)(v3d_context(pctx),
++                                                          num_queries,
++                                                          query_types);
+ }
+ 
+ static void
+diff --git a/src/gallium/drivers/v3d/v3d_query.h b/src/gallium/drivers/v3d/v3d_query.h
+index 3e1426b8d86..605ed1a12f9 100644
+--- a/src/gallium/drivers/v3d/v3d_query.h
++++ b/src/gallium/drivers/v3d/v3d_query.h
+@@ -42,11 +42,5 @@ struct v3d_query
+ };
+ 
+ struct pipe_query *v3d_create_query_pipe(struct v3d_context *v3d, unsigned query_type, unsigned index);
+-struct pipe_query *v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
+-                                                  unsigned *query_types);
+-int v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
+-                                            struct pipe_driver_query_group_info *info);
+-int v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
+-                                      struct pipe_driver_query_info *info);
+ 
+ #endif /* V3D_QUERY_H */
+diff --git a/src/gallium/drivers/v3d/v3dx_context.h b/src/gallium/drivers/v3d/v3dx_context.h
+index e0a5cbfb2f3..c487ac3b996 100644
+--- a/src/gallium/drivers/v3d/v3dx_context.h
++++ b/src/gallium/drivers/v3d/v3dx_context.h
+@@ -61,3 +61,13 @@ bool v3dX(tfu)(struct pipe_context *pctx,
+                unsigned int src_layer,
+                unsigned int dst_layer,
+                bool for_mipmap);
++
++int v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen,
++                                              unsigned index,
++                                              struct pipe_driver_query_group_info *info);
++int v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen,
++                                        unsigned index,
++                                        struct pipe_driver_query_info *info);
++struct pipe_query *v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d,
++                                                    unsigned num_queries,
++                                                    unsigned *query_types);
+diff --git a/src/gallium/drivers/v3d/v3d_query_perfcnt.c b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
+similarity index 94%
+rename from src/gallium/drivers/v3d/v3d_query_perfcnt.c
+rename to src/gallium/drivers/v3d/v3dx_query_perfcnt.c
+index e00d84e375f..431aad14b4f 100644
+--- a/src/gallium/drivers/v3d/v3d_query_perfcnt.c
++++ b/src/gallium/drivers/v3d/v3dx_query_perfcnt.c
+@@ -52,8 +52,8 @@ kperfmon_destroy(struct v3d_context *v3d, struct v3d_perfmon_state *perfmon)
+ }
+ 
+ int
+-v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned index,
+-                                        struct pipe_driver_query_group_info *info)
++v3dX(get_driver_query_group_info_perfcnt)(struct v3d_screen *screen, unsigned index,
++                                          struct pipe_driver_query_group_info *info)
+ {
+         if (!screen->has_perfmon)
+                 return 0;
+@@ -72,8 +72,8 @@ v3d_get_driver_query_group_info_perfcnt(struct v3d_screen *screen, unsigned inde
+ }
+ 
+ int
+-v3d_get_driver_query_info_perfcnt(struct v3d_screen *screen, unsigned index,
+-                                  struct pipe_driver_query_info *info)
++v3dX(get_driver_query_info_perfcnt)(struct v3d_screen *screen, unsigned index,
++                                    struct pipe_driver_query_info *info)
+ {
+         if (!screen->has_perfmon)
+                 return 0;
+@@ -222,8 +222,8 @@ static const struct v3d_query_funcs perfcnt_query_funcs = {
+ };
+ 
+ struct pipe_query *
+-v3d_create_batch_query_perfcnt(struct v3d_context *v3d, unsigned num_queries,
+-                               unsigned *query_types)
++v3dX(create_batch_query_perfcnt)(struct v3d_context *v3d, unsigned num_queries,
++                                 unsigned *query_types)
+ {
+         struct v3d_query_perfcnt *pquery = NULL;
+         struct v3d_query *query;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0132-broadcom-simulator-add-per-hw-version-calls.patch b/projects/RPi/devices/RPi5/patches/mesa/0132-broadcom-simulator-add-per-hw-version-calls.patch
new file mode 100644
index 0000000000..25d4e26ca4
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0132-broadcom-simulator-add-per-hw-version-calls.patch
@@ -0,0 +1,239 @@
+From f7d5b57bca07eb9ba6fb292852e3b5057c0a8b8f Mon Sep 17 00:00:00 2001
+From: "Juan A. Suarez Romero" <jasuarez@igalia.com>
+Date: Mon, 20 Mar 2023 16:48:51 +0100
+Subject: [PATCH 132/142] broadcom/simulator: add per-hw version calls
+
+Add a wrapper to allow calling the right simulator function based on the
+hardware under simulation.
+
+Signed-off-by: Juan A. Suarez Romero <jasuarez@igalia.com>
+---
+ src/broadcom/simulator/v3d_simulator.c  | 86 ++++---------------------
+ src/broadcom/simulator/v3d_simulator.h  | 21 ++++++
+ src/broadcom/simulator/v3dx_simulator.c |  9 ++-
+ 3 files changed, 41 insertions(+), 75 deletions(-)
+
+diff --git a/src/broadcom/simulator/v3d_simulator.c b/src/broadcom/simulator/v3d_simulator.c
+index 36e719296f4..c4bbd61abc2 100644
+--- a/src/broadcom/simulator/v3d_simulator.c
++++ b/src/broadcom/simulator/v3d_simulator.c
+@@ -439,15 +439,15 @@ v3d_simulator_perfmon_switch(int fd, uint32_t perfid)
+ 
+         perfmon = v3d_get_simulator_perfmon(fd, file->active_perfid);
+         if (perfmon)
+-                v3d41_simulator_perfmon_stop(sim_state.v3d,
+-                                             perfmon->ncounters,
+-                                             perfmon->values);
++                v3d_X_simulator(perfmon_stop)(sim_state.v3d,
++                                              perfmon->ncounters,
++                                              perfmon->values);
+ 
+         perfmon = v3d_get_simulator_perfmon(fd, perfid);
+         if (perfmon)
+-                v3d41_simulator_perfmon_start(sim_state.v3d,
+-                                              perfmon->ncounters,
+-                                              perfmon->counters);
++                v3d_X_simulator(perfmon_start)(sim_state.v3d,
++                                               perfmon->ncounters,
++                                               perfmon->counters);
+ 
+         file->active_perfid = perfid;
+ }
+@@ -492,21 +492,7 @@ v3d_simulator_submit_cl_ioctl(int fd, struct drm_v3d_submit_cl *submit)
+         bin_fd = fd;
+ 
+         v3d_simulator_perfmon_switch(fd, submit->perfmon_id);
+-
+-        switch(sim_state.ver) {
+-        case 33:
+-           v3d33_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+-           break;
+-        case 41:
+-        case 42:
+-           v3d41_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+-           break;
+-        case 71:
+-           v3d71_simulator_submit_cl_ioctl(sim_state.v3d, submit, file->gmp->ofs);
+-           break;
+-        default:
+-           unreachable("Unsupported V3D version\n");
+-        }
++        v3d_X_simulator(submit_cl_ioctl)(sim_state.v3d, submit, file->gmp->ofs);
+ 
+         util_dynarray_foreach(&sim_state.bin_oom, struct v3d_simulator_bo *,
+                               sim_bo) {
+@@ -645,22 +631,6 @@ v3d_simulator_gem_close_ioctl(int fd, struct drm_gem_close *args)
+         return drmIoctl(fd, DRM_IOCTL_GEM_CLOSE, args);
+ }
+ 
+-static int
+-v3d_simulator_get_param_ioctl(int fd, struct drm_v3d_get_param *args)
+-{
+-        switch(sim_state.ver) {
+-        case 33:
+-                return v3d33_simulator_get_param_ioctl(sim_state.v3d, args);
+-        case 41:
+-        case 42:
+-                return v3d41_simulator_get_param_ioctl(sim_state.v3d, args);
+-        case 71:
+-                return v3d71_simulator_get_param_ioctl(sim_state.v3d, args);
+-        default:
+-                unreachable("Unsupported V3D version\n");
+-        }
+-}
+-
+ static int
+ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
+ {
+@@ -672,20 +642,7 @@ v3d_simulator_submit_tfu_ioctl(int fd, struct drm_v3d_submit_tfu *args)
+         v3d_simulator_copy_in_handle(file, args->bo_handles[2]);
+         v3d_simulator_copy_in_handle(file, args->bo_handles[3]);
+ 
+-        switch(sim_state.ver) {
+-        case 33:
+-                ret = v3d33_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+-                break;
+-        case 41:
+-        case 42:
+-                ret = v3d41_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+-                break;
+-        case 71:
+-                ret = v3d71_simulator_submit_tfu_ioctl(sim_state.v3d, args);
+-                break;
+-        default:
+-                unreachable("Unsupported V3D version\n");
+-        }
++        ret = v3d_X_simulator(submit_tfu_ioctl)(sim_state.v3d, args);
+ 
+         v3d_simulator_copy_out_handle(file, args->bo_handles[0]);
+ 
+@@ -712,19 +669,8 @@ v3d_simulator_submit_csd_ioctl(int fd, struct drm_v3d_submit_csd *args)
+ 
+         v3d_simulator_perfmon_switch(fd, args->perfmon_id);
+ 
+-        switch(sim_state.ver) {
+-        case 41:
+-        case 42:
+-           ret = v3d41_simulator_submit_csd_ioctl(sim_state.v3d, args,
+-                                                  file->gmp->ofs);
+-           break;
+-        case 71:
+-           ret = v3d71_simulator_submit_csd_ioctl(sim_state.v3d, args,
+-                                                  file->gmp->ofs);
+-           break;
+-        default:
+-           ret = -1;
+-        }
++        ret = v3d_X_simulator(submit_csd_ioctl)(sim_state.v3d, args,
++                                                file->gmp->ofs);
+ 
+         for (int i = 0; i < args->bo_handle_count; i++)
+                 v3d_simulator_copy_out_handle(file, bo_handles[i]);
+@@ -835,7 +781,7 @@ v3d_simulator_ioctl(int fd, unsigned long request, void *args)
+                 return 0;
+ 
+         case DRM_IOCTL_V3D_GET_PARAM:
+-                return v3d_simulator_get_param_ioctl(fd, args);
++                return v3d_X_simulator(get_param_ioctl)(sim_state.v3d, args);
+ 
+         case DRM_IOCTL_GEM_CLOSE:
+                 return v3d_simulator_gem_close_ioctl(fd, args);
+@@ -918,22 +864,18 @@ v3d_simulator_init_global()
+ 
+         util_dynarray_init(&sim_state.bin_oom, NULL);
+ 
++        v3d_X_simulator(init_regs)(sim_state.v3d);
++
+         switch(sim_state.ver) {
+-        case 33:
+-                v3d33_simulator_init_regs(sim_state.v3d);
+-                sim_state.perfcnt_total = 0;
+-                break;
+         case 41:
+         case 42:
+-                v3d41_simulator_init_regs(sim_state.v3d);
+                 sim_state.perfcnt_total = 87;
+                 break;
+         case 71:
+-                v3d71_simulator_init_regs(sim_state.v3d);
+                 sim_state.perfcnt_total = 93;
+                 break;
+         default:
+-                unreachable("Not supported V3D version\n");
++                sim_state.perfcnt_total = 0;
+         }
+ }
+ 
+diff --git a/src/broadcom/simulator/v3d_simulator.h b/src/broadcom/simulator/v3d_simulator.h
+index 1472c313a03..92305634468 100644
+--- a/src/broadcom/simulator/v3d_simulator.h
++++ b/src/broadcom/simulator/v3d_simulator.h
+@@ -59,4 +59,25 @@ uint32_t v3d_simulator_get_mem_free(void);
+ 
+ #endif
+ 
++/* Helper to call simulator ver specific functions */
++#define v3d_X_simulator(thing) ({                     \
++   __typeof(&v3d33_simulator_##thing) v3d_X_sim_thing;\
++   switch (sim_state.ver) {                           \
++   case 33:                                           \
++   case 40:                                           \
++      v3d_X_sim_thing = &v3d33_simulator_##thing;     \
++      break;                                          \
++   case 41:                                           \
++   case 42:                                           \
++      v3d_X_sim_thing = &v3d41_simulator_##thing;     \
++      break;                                          \
++   case 71:                                           \
++      v3d_X_sim_thing = &v3d71_simulator_##thing;     \
++      break;                                          \
++   default:                                           \
++      unreachable("Unsupported hardware generation"); \
++   }                                                  \
++   v3d_X_sim_thing;                                   \
++})
++
+ #endif
+diff --git a/src/broadcom/simulator/v3dx_simulator.c b/src/broadcom/simulator/v3dx_simulator.c
+index 4520fe75719..01cf6b22663 100644
+--- a/src/broadcom/simulator/v3dx_simulator.c
++++ b/src/broadcom/simulator/v3dx_simulator.c
+@@ -218,12 +218,12 @@ v3dX(simulator_submit_tfu_ioctl)(struct v3d_hw *v3d,
+         return 0;
+ }
+ 
+-#if V3D_VERSION >= 41
+ int
+ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
+                                  struct drm_v3d_submit_csd *args,
+                                  uint32_t gmp_ofs)
+ {
++#if V3D_VERSION >= 41
+         int last_completed_jobs = (V3D_READ(V3D_CSD_0_STATUS) &
+                                    V3D_CSD_0_STATUS_NUM_COMPLETED_JOBS_SET);
+         g_gmp_ofs = gmp_ofs;
+@@ -256,8 +256,10 @@ v3dX(simulator_submit_csd_ioctl)(struct v3d_hw *v3d,
+         v3d_flush_caches(v3d);
+ 
+         return 0;
+-}
++#else
++        return -1;
+ #endif
++}
+ 
+ int
+ v3dX(simulator_get_param_ioctl)(struct v3d_hw *v3d,
+@@ -545,7 +547,8 @@ v3dX(simulator_submit_cl_ioctl)(struct v3d_hw *v3d,
+ #define V3D_PCTR_0_SRC_N(x) (V3D_PCTR_0_SRC_0_3 + 4 * (x))
+ #define V3D_PCTR_0_SRC_N_SHIFT(x) ((x) * 8)
+ #define V3D_PCTR_0_SRC_N_MASK(x) (BITFIELD_RANGE(V3D_PCTR_0_SRC_N_SHIFT(x), \
+-                                                 V3D_PCTR_0_SRC_N_SHIFT(x) + 6))
++                                                 V3D_PCTR_0_SRC_N_SHIFT(x) + \
++                                                 V3D_PCTR_0_SRC_0_3_PCTRS0_MSB))
+ #endif
+ 
+ void
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0133-v3dv-expose-fullDrawIndexUint32-in-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0133-v3dv-expose-fullDrawIndexUint32-in-V3D-7.x.patch
new file mode 100644
index 0000000000..8b238d4963
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0133-v3dv-expose-fullDrawIndexUint32-in-V3D-7.x.patch
@@ -0,0 +1,35 @@
+From 151c13365703631f88ad77ba07afbd2ba9fa172c Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 31 May 2023 09:23:51 +0200
+Subject: [PATCH 133/142] v3dv: expose fullDrawIndexUint32 in V3D 7.x
+
+---
+ src/broadcom/vulkan/v3dv_device.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index b520bfa0002..ca5f676b6f7 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -214,7 +214,7 @@ get_features(const struct v3dv_physical_device *physical_device,
+    *features = (struct vk_features) {
+       /* Vulkan 1.0 */
+       .robustBufferAccess = true, /* This feature is mandatory */
+-      .fullDrawIndexUint32 = false, /* Only available since V3D 4.4.9.1 */
++      .fullDrawIndexUint32 = physical_device->devinfo.ver >= 71,
+       .imageCubeArray = true,
+       .independentBlend = true,
+       .geometryShader = true,
+@@ -1451,7 +1451,8 @@ v3dv_GetPhysicalDeviceProperties(VkPhysicalDevice physicalDevice,
+       .subPixelPrecisionBits                    = V3D_COORD_SHIFT,
+       .subTexelPrecisionBits                    = 8,
+       .mipmapPrecisionBits                      = 8,
+-      .maxDrawIndexedIndexValue                 = 0x00ffffff,
++      .maxDrawIndexedIndexValue                 = pdevice->devinfo.ver >= 71 ?
++                                                  0xffffffff : 0x00ffffff,
+       .maxDrawIndirectCount                     = 0x7fffffff,
+       .maxSamplerLodBias                        = 14.0f,
+       .maxSamplerAnisotropy                     = 16.0f,
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0134-v3dv-expose-depthClamp-in-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0134-v3dv-expose-depthClamp-in-V3D-7.x.patch
new file mode 100644
index 0000000000..6f906ff11d
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0134-v3dv-expose-depthClamp-in-V3D-7.x.patch
@@ -0,0 +1,56 @@
+From aec0c613e651984e577f580aedceb3561d6a3b19 Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 31 May 2023 10:38:59 +0200
+Subject: [PATCH 134/142] v3dv: expose depthClamp in V3D 7.x
+
+---
+ src/broadcom/vulkan/v3dv_device.c    | 2 +-
+ src/broadcom/vulkan/v3dvx_pipeline.c | 5 ++++-
+ 2 files changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index ca5f676b6f7..30a9894789b 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -224,7 +224,7 @@ get_features(const struct v3dv_physical_device *physical_device,
+       .logicOp = true,
+       .multiDrawIndirect = false,
+       .drawIndirectFirstInstance = true,
+-      .depthClamp = false, /* Only available since V3D 4.5.1.1 */
++      .depthClamp = physical_device->devinfo.ver >= 71,
+       .depthBiasClamp = true,
+       .fillModeNonSolid = true,
+       .depthBounds = physical_device->devinfo.ver >= 71,
+diff --git a/src/broadcom/vulkan/v3dvx_pipeline.c b/src/broadcom/vulkan/v3dvx_pipeline.c
+index c9b537f4b32..ad22add155d 100644
+--- a/src/broadcom/vulkan/v3dvx_pipeline.c
++++ b/src/broadcom/vulkan/v3dvx_pipeline.c
+@@ -243,6 +243,7 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
+        * supported in the driver yet, so in practice we are always enabling Z
+        * clipping for now.
+        */
++      bool z_clamp_enable = rs_info && rs_info->depthClampEnable;
+       bool z_clip_enable = false;
+       const VkPipelineRasterizationDepthClipStateCreateInfoEXT *clip_info =
+          ds_info ? vk_find_struct_const(ds_info->pNext,
+@@ -250,7 +251,7 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
+                    NULL;
+       if (clip_info)
+          z_clip_enable = clip_info->depthClipEnable;
+-      else if (!(rs_info && rs_info->depthClampEnable))
++      else if (!z_clamp_enable)
+          z_clip_enable = true;
+ 
+       if (z_clip_enable) {
+@@ -260,6 +261,8 @@ pack_cfg_bits(struct v3dv_pipeline *pipeline,
+          config.z_clipping_mode = V3D_Z_CLIP_MODE_NONE;
+       }
+ 
++      config.z_clamp_mode = z_clamp_enable;
++
+       config.depth_bounds_test_enable =
+               ds_info && ds_info->depthBoundsTestEnable && has_ds_attachment;
+ #endif
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0135-v3dv-temporary-disable-EXT_acquire_drm_display.patch b/projects/RPi/devices/RPi5/patches/mesa/0135-v3dv-temporary-disable-EXT_acquire_drm_display.patch
new file mode 100644
index 0000000000..831de83810
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0135-v3dv-temporary-disable-EXT_acquire_drm_display.patch
@@ -0,0 +1,29 @@
+From 6bd92fecf57b5b1ae3f1f665726c4a0c43d3d90e Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
+Date: Tue, 11 Apr 2023 13:11:39 +0200
+Subject: [PATCH 135/142] v3dv/temporary: disable EXT_acquire_drm_display
+
+So we can make a conformance run, without the need to include the
+CTS patch for this issue:
+
+https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/4377
+---
+ src/broadcom/vulkan/v3dv_device.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index 30a9894789b..c0ffc05750f 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -91,7 +91,7 @@ static const struct vk_instance_extension_table instance_extensions = {
+    .KHR_display                         = true,
+    .KHR_get_display_properties2         = true,
+    .EXT_direct_mode_display             = true,
+-   .EXT_acquire_drm_display             = true,
++   .EXT_acquire_drm_display             = false,
+ #endif
+    .KHR_external_fence_capabilities     = true,
+    .KHR_external_memory_capabilities    = true,
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0136-v3dv-expose-scalarBlockLayout-on-V3D-7.x.patch b/projects/RPi/devices/RPi5/patches/mesa/0136-v3dv-expose-scalarBlockLayout-on-V3D-7.x.patch
new file mode 100644
index 0000000000..402eb77074
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0136-v3dv-expose-scalarBlockLayout-on-V3D-7.x.patch
@@ -0,0 +1,27 @@
+From 7960516490008ab42ab31e921369b1ffb8f67bde Mon Sep 17 00:00:00 2001
+From: Iago Toral Quiroga <itoral@igalia.com>
+Date: Wed, 21 Jun 2023 10:29:07 +0200
+Subject: [PATCH 136/142] v3dv: expose scalarBlockLayout on V3D 7.x
+
+This version of V3D doesn't have the restriction that vector accesses
+must not cross 16-byte boundaries.
+---
+ src/broadcom/vulkan/v3dv_device.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c
+index c0ffc05750f..8f8102ae46e 100644
+--- a/src/broadcom/vulkan/v3dv_device.c
++++ b/src/broadcom/vulkan/v3dv_device.c
+@@ -304,7 +304,7 @@ get_features(const struct v3dv_physical_device *physical_device,
+        * problematic, we would always have to scalarize. Overall, this would
+        * not lead to best performance so let's just not support it.
+        */
+-      .scalarBlockLayout = false,
++      .scalarBlockLayout = physical_device->devinfo.ver >= 71,
+       /* This tells applications 2 things:
+        *
+        * 1. If they can select just one aspect for barriers. For us barriers
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0137-dri-Limit-the-max_num_back-to-2-on-COMPLETE_MODE_FLI.patch b/projects/RPi/devices/RPi5/patches/mesa/0137-dri-Limit-the-max_num_back-to-2-on-COMPLETE_MODE_FLI.patch
new file mode 100644
index 0000000000..5ff628c96d
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0137-dri-Limit-the-max_num_back-to-2-on-COMPLETE_MODE_FLI.patch
@@ -0,0 +1,42 @@
+From b58e1d7fd1c315e6ada0ad9ec4961b65c88f0c2a Mon Sep 17 00:00:00 2001
+From: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
+Date: Mon, 4 Oct 2021 14:30:30 +0200
+Subject: [PATCH 137/142] dri: Limit the max_num_back to 2 on
+ COMPLETE_MODE_FLIP present mode
+
+This is limiting the number of back buffers that mesa can allocate, so
+this avoids triple buffering, although that is desirable in some cases.
+
+To get this to upstream, we could convert it to a DRI option
+and enable it only in the case of using mutter.
+It seems to be feasible to limit this to some kind of configuration, as
+we have access to the size of the back-buffer allocated. For example,
+only limit for 4k-dual screen setup.
+
+With this Raspberry OS start-up CMA usage is 210Mb with 4k-dual screen
+setup instead of 276Mb.
+
+The correct approach would be to check if we can make Mutter to wait
+for buffer swaps before starting a new frame.
+
+https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7033
+---
+ src/loader/loader_dri3_helper.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
+index 32135770e9d..2534c817dcc 100644
+--- a/src/loader/loader_dri3_helper.c
++++ b/src/loader/loader_dri3_helper.c
+@@ -275,7 +275,7 @@ dri3_update_max_num_back(struct loader_dri3_drawable *draw)
+       if (draw->swap_interval == 0)
+          draw->max_num_back = 4;
+       else
+-         draw->max_num_back = 3;
++         draw->max_num_back = 2;
+ 
+       assert(draw->max_num_back <= LOADER_DRI3_MAX_BACK);
+       break;
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0138-v3d-Ignore-SCANOUT-usage-flags-when-not-needed-under.patch b/projects/RPi/devices/RPi5/patches/mesa/0138-v3d-Ignore-SCANOUT-usage-flags-when-not-needed-under.patch
new file mode 100644
index 0000000000..d1504ba496
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0138-v3d-Ignore-SCANOUT-usage-flags-when-not-needed-under.patch
@@ -0,0 +1,369 @@
+From d0f2a99045fa9835fea822ada58a344e2fdc1b13 Mon Sep 17 00:00:00 2001
+From: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
+Date: Thu, 21 Oct 2021 22:04:57 +0200
+Subject: [PATCH 138/142] v3d: Ignore SCANOUT usage flags when not needed under
+ X
+
+These downstream patches force the usage of tiled formats
+when possible. They have been tested for the Raspberry Pi OS
+desktop environment using Mutter+Xserver.
+
+It includes the following 3 patches:
+ - v3d: Add driconf options to rewrite SCANOUT usages
+ - v3d: Check if are under X session
+ - v3d: enable options to ignore SCANOUT flag on resource creation
+
+v3d: Add driconf options to rewrite SCANOUT usages
+
+We create a new environment variable V3D_IGNORE_SCANOUT_USAGES
+that will affect v3d_resource_create_with_modifiers so
+SCANOUT usages can be ignored. It can be enabled under X11
+with a compositor so applications are forced to use tiled render
+buffers instead of the default behaviour that uses SCANOUT and
+consumes the limited CMA memory in the RPi4.
+
+The two new driconf options modulate the effect on two applications
+Xorg and mutter.
+
+"v3d_maintain_ignorable_scanout": is enabled in mutter and could be used
+in other compositors. The objective is that, when the environment has enabled
+V3D_IGNORE_SCANOUT_USAGES, SCANOUT usages aren't ignored in the compositor.
+
+"v3d_is_xserver_process": is used to handle a particular case
+to avoid checking if an Xserver connection is available using XCB
+as in some cases the call stalls the Xserver on boot.
+
+Following patches will use this configuration options to ignore or not
+the SCANOUT usage on v3d_resource_allocation with modifiers.
+
+Upstreaming this patch need to review the effects of:
+ ad50b47a14e9 ("gbm: assume USE_SCANOUT in create_with_modifiers")
+
+v2: driconf for v3d_is_xserver_process is needed under XWayland
+    to avoid XCB connections in the XWayland process.
+
+v3d: Check if are under X session
+
+If we are using Wayland + XWayland, this is considered *not* being under
+X session.
+
+v3d: enable options to ignore SCANOUT flag on resource creation
+
+This is a downstream patch for enabling the usage of more tiled
+buffers in Raspberry OS under an environment using mutter and Xorg.
+
+This patch enables the following behaviour in order to reduce the
+number of CMA usage and use tiled layouts because we ignore
+the possible SCANOUT usage of the resource.
+
+This patch makes mutter to not ignore SCANOUT flags because as
+compositor it should allocate linear render buffers suitable for display.
+
+Then if the Xserver has enabled the dmabuf_capable option, the
+buffers backing the windows pixmaps will allocate using modifiers,
+in the patched Xserver downstream making pixmaps exportable will use
+gbm_bo_create_with_modifiers2 that does not add the SCANOUT flag
+for exporting pixmaps. With the Mutter compositor we didn't find a
+situation where these pixmaps needed to be SCANOUT. This is not certain,
+but it allows us to not use CMA for every window opened, and having them
+in tiled format saves all linear->tiled conversion for sampling.
+
+Finally to take advantage of using Tiled render buffers for applications
+we can enable in the environment V3D_IGNORE_SCANOUT_USAGES so all render
+targets use the tiled UIF format without CMA memory instead of a linear one.
+As the compositor mutter will composite the final surface for display we
+aren't going to use the SCANOUT flag. This only applies if we are under
+an X11 session.
+
+v2: v3d: ignore V3D_IGNORE_SCANOUT if only LINEAR modifier available
+    This is a fixup for the behaviour of ignoring SCANOUT flags
+    so we don't allocate CMA memory on V3D for render targets under
+    X11 as UIF isn't included and only LINEAR is a valid modifier
+    when Xserver is using msdri3. So we cannot ignore the SCANOUT flag.
+    As the Xserver in this situation is limiting the available modifiers
+    to linear, we can identify this case just not ignoring the SCANOUT
+    flag when we can only allocate linear resources.
+---
+ src/gallium/drivers/v3d/driinfo_v3d.h  |  2 +
+ src/gallium/drivers/v3d/meson.build    | 17 +++++---
+ src/gallium/drivers/v3d/v3d_resource.c | 31 ++++++++++++--
+ src/gallium/drivers/v3d/v3d_screen.c   | 59 ++++++++++++++++++++++++++
+ src/gallium/drivers/v3d/v3d_screen.h   |  6 +++
+ src/util/00-mesa-defaults.conf         |  3 ++
+ src/util/driconf.h                     |  8 ++++
+ 7 files changed, 117 insertions(+), 9 deletions(-)
+
+diff --git a/src/gallium/drivers/v3d/driinfo_v3d.h b/src/gallium/drivers/v3d/driinfo_v3d.h
+index 147ad0b49bd..8f989e8aa57 100644
+--- a/src/gallium/drivers/v3d/driinfo_v3d.h
++++ b/src/gallium/drivers/v3d/driinfo_v3d.h
+@@ -2,4 +2,6 @@
+ 
+ DRI_CONF_SECTION_MISCELLANEOUS
+    DRI_CONF_V3D_NONMSAA_TEXTURE_SIZE_LIMIT(false)
++   DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(false)
++   DRI_CONF_V3D_IS_XSERVER_PROCESS(false)
+ DRI_CONF_SECTION_END
+diff --git a/src/gallium/drivers/v3d/meson.build b/src/gallium/drivers/v3d/meson.build
+index 289473d2ca1..e47682db1aa 100644
+--- a/src/gallium/drivers/v3d/meson.build
++++ b/src/gallium/drivers/v3d/meson.build
+@@ -61,6 +61,16 @@ endif
+ 
+ v3d_versions = ['33', '42', '71']
+ 
++v3d_deps = [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers]
++
++if with_platform_x11
++  v3d_deps += dep_xcb
++endif
++
++if with_platform_wayland
++  v3d_deps += dep_wayland_client
++endif
++
+ per_version_libs = []
+ foreach ver : v3d_versions
+   per_version_libs += static_library(
+@@ -72,7 +82,7 @@ foreach ver : v3d_versions
+     ],
+     c_args : [v3d_args, '-DV3D_VERSION=' + ver],
+     gnu_symbol_visibility : 'hidden',
+-    dependencies : [dep_v3dv3, dep_libdrm, dep_valgrind, idep_nir_headers],
++    dependencies : v3d_deps,
+ )
+ 
+ endforeach
+@@ -95,10 +105,7 @@ libv3d = static_library(
+   c_args : [v3d_args],
+   cpp_args : [v3d_args],
+   gnu_symbol_visibility : 'hidden',
+-  dependencies : [
+-    dep_v3dv3, dep_libdrm, dep_valgrind,
+-    idep_nir_headers, idep_mesautil,
+-  ],
++  dependencies : v3d_deps + idep_mesautil,
+   link_with: [per_version_libs],
+ )
+ 
+diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
+index a0a210ccad5..46de1b16ae0 100644
+--- a/src/gallium/drivers/v3d/v3d_resource.c
++++ b/src/gallium/drivers/v3d/v3d_resource.c
+@@ -439,7 +439,7 @@ v3d_resource_get_handle(struct pipe_screen *pscreen,
+         case WINSYS_HANDLE_TYPE_SHARED:
+                 return v3d_bo_flink(bo, &whandle->handle);
+         case WINSYS_HANDLE_TYPE_KMS:
+-                if (screen->ro) {
++                if (screen->ro && rsc->scanout) {
+                         if (renderonly_get_handle(rsc->scanout, whandle)) {
+                                 whandle->stride = rsc->slices[0].stride;
+                                 return true;
+@@ -785,6 +785,27 @@ v3d_resource_setup(struct pipe_screen *pscreen,
+         return rsc;
+ }
+ 
++static bool
++v3d_resource_should_scanout(struct pipe_screen *pscreen,
++                            const struct pipe_resource *tmpl,
++                            const uint64_t *modifiers,
++                            int count)
++{
++        struct v3d_screen *screen = v3d_screen(pscreen);
++
++        if (tmpl->bind & PIPE_BIND_SCANOUT) {
++                if (screen->maintain_ignorable_scanout)
++                        return true;
++                if (screen->has_x_session && screen->ignore_scanout_usages) {
++                        if (drm_find_modifier(DRM_FORMAT_MOD_BROADCOM_UIF,
++                                              modifiers, count))
++                                return false;
++                }
++                return true;
++        }
++        return false;
++}
++
+ static struct pipe_resource *
+ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
+                                    const struct pipe_resource *tmpl,
+@@ -798,6 +819,8 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
+         struct pipe_resource *prsc = &rsc->base;
+         /* Use a tiled layout if we can, for better 3D performance. */
+         bool should_tile = true;
++        bool should_scanout = v3d_resource_should_scanout(pscreen, tmpl,
++                                                          modifiers, count);
+ 
+         assert(tmpl->target != PIPE_BUFFER ||
+                (tmpl->format == PIPE_FORMAT_NONE ||
+@@ -827,7 +850,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
+         /* If using the old-school SCANOUT flag, we don't know what the screen
+          * might support other than linear. Just force linear.
+          */
+-        if (tmpl->bind & PIPE_BIND_SCANOUT)
++        if ((tmpl->bind & PIPE_BIND_SCANOUT) && should_scanout)
+                 should_tile = false;
+ 
+         /* No user-specified modifier; determine our own. */
+@@ -849,7 +872,7 @@ v3d_resource_create_with_modifiers(struct pipe_screen *pscreen,
+ 
+         v3d_setup_slices(rsc, 0, tmpl->bind & PIPE_BIND_SHARED);
+ 
+-        if (screen->ro && (tmpl->bind & PIPE_BIND_SCANOUT)) {
++        if (screen->ro && should_scanout) {
+                 struct winsys_handle handle;
+                 struct pipe_resource scanout_tmpl = {
+                         .target = prsc->target,
+@@ -979,7 +1002,7 @@ v3d_resource_from_handle(struct pipe_screen *pscreen,
+                  }
+         }
+ 
+-        if (screen->ro) {
++        if (screen->ro && !rsc->tiled) {
+                 /* Make sure that renderonly has a handle to our buffer in the
+                  * display's fd, so that a later renderonly_get_handle()
+                  * returns correct handles or GEM names.
+diff --git a/src/gallium/drivers/v3d/v3d_screen.c b/src/gallium/drivers/v3d/v3d_screen.c
+index 2225edf85bd..1d4f619d710 100644
+--- a/src/gallium/drivers/v3d/v3d_screen.c
++++ b/src/gallium/drivers/v3d/v3d_screen.c
+@@ -47,6 +47,42 @@
+ #include "compiler/v3d_compiler.h"
+ #include "drm-uapi/drm_fourcc.h"
+ 
++#ifdef HAVE_WAYLAND_PLATFORM
++#include <wayland-client.h>
++#endif
++
++#ifdef HAVE_X11_PLATFORM
++#include <xcb/xcb.h>
++#endif
++
++static bool
++check_x_session()
++{
++        bool xcb_connection = false;
++
++#ifdef HAVE_WAYLAND_PLATFORM
++        struct wl_display *display;
++
++        display = wl_display_connect(NULL);
++
++        if (display) {
++                wl_display_disconnect(display);
++                return xcb_connection;
++        }
++#endif
++
++#ifdef HAVE_X11_PLATFORM
++        xcb_connection_t *conn;
++
++        conn = xcb_connect(NULL, NULL);
++
++        if (!xcb_connection_has_error(conn))
++                xcb_connection = true;
++        xcb_disconnect(conn);
++#endif
++        return xcb_connection;
++}
++
+ static const char *
+ v3d_screen_get_name(struct pipe_screen *pscreen)
+ {
+@@ -945,6 +981,29 @@ v3d_screen_create(int fd, const struct pipe_screen_config *config,
+                 v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH);
+         screen->has_perfmon = v3d_has_feature(screen, DRM_V3D_PARAM_SUPPORTS_PERFMON);
+ 
++        screen->ignore_scanout_usages = getenv("V3D_IGNORE_SCANOUT_USAGES");
++
++        const char *is_xserver_process =
++                "v3d_is_xserver_process";
++        screen->is_xserver_process =
++                driCheckOption(config->options,
++                               is_xserver_process,
++                               DRI_BOOL) &&
++                driQueryOptionb(config->options,
++                                is_xserver_process);
++
++        const char *maintain_ignorable_scanout_name =
++                "v3d_maintain_ignorable_scanout";
++        screen->maintain_ignorable_scanout =
++                driCheckOption(config->options,
++                               maintain_ignorable_scanout_name,
++                               DRI_BOOL) &&
++                driQueryOptionb(config->options,
++                                maintain_ignorable_scanout_name);
++
++        screen->has_x_session = !screen->is_xserver_process &&
++                                check_x_session();
++
+         v3d_fence_init(screen);
+ 
+         v3d_process_debug_variable();
+diff --git a/src/gallium/drivers/v3d/v3d_screen.h b/src/gallium/drivers/v3d/v3d_screen.h
+index 1da9b83c965..c0f22707075 100644
+--- a/src/gallium/drivers/v3d/v3d_screen.h
++++ b/src/gallium/drivers/v3d/v3d_screen.h
+@@ -83,6 +83,12 @@ struct v3d_screen {
+         bool has_cache_flush;
+         bool has_perfmon;
+         bool nonmsaa_texture_size_limit;
++        bool ignore_scanout_usages;
++        bool is_xserver_process;
++        bool maintain_ignorable_scanout;
++
++        /* Are we running in an X session? */
++        bool has_x_session;
+ 
+         struct v3d_simulator_file *sim_file;
+ 
+diff --git a/src/util/00-mesa-defaults.conf b/src/util/00-mesa-defaults.conf
+index 948c1ef78ba..2de7505521c 100644
+--- a/src/util/00-mesa-defaults.conf
++++ b/src/util/00-mesa-defaults.conf
+@@ -77,6 +77,7 @@ TODO: document the other workarounds.
+         <!-- using vulkan wsi for xservers causes deadlocks -->
+         <application name="Xwayland" executable="Xwayland">
+             <option name="disable_xcb_surface" value="true" />
++            <option name="v3d_is_xserver_process" value="true" />
+         </application>
+ 
+         <application name="Unigine Heaven (32-bit)" executable="heaven_x86">
+@@ -767,6 +768,7 @@ TODO: document the other workarounds.
+         <application name="mutter" executable="mutter">
+             <option name="adaptive_sync" value="false" />
+             <option name="v3d_nonmsaa_texture_size_limit" value="true" />
++            <option name="v3d_maintain_ignorable_scanout" value="true" />
+         </application>
+         <application name="muffin" executable="muffin">
+             <option name="adaptive_sync" value="false" />
+@@ -818,6 +820,7 @@ TODO: document the other workarounds.
+         </application>
+         <application name="Xorg" executable="Xorg">
+             <option name="v3d_nonmsaa_texture_size_limit" value="true" />
++            <option name="v3d_is_xserver_process" value="true" />
+         </application>
+ 
+         <application name="gfxbench" executable="testfw_app">
+diff --git a/src/util/driconf.h b/src/util/driconf.h
+index 042ee27d9a3..56511f6615e 100644
+--- a/src/util/driconf.h
++++ b/src/util/driconf.h
+@@ -521,6 +521,14 @@
+    DRI_CONF_OPT_B(v3d_nonmsaa_texture_size_limit, def, \
+                   "Report the non-MSAA-only texture size limit")
+ 
++#define DRI_CONF_V3D_IS_XSERVER_PROCESS(def) \
++   DRI_CONF_OPT_B(v3d_is_xserver_process, def, \
++                  "Identifies if the application is the Xserver.")
++
++#define DRI_CONF_V3D_MAINTAIN_IGNORABLE_SCANOUT(def)   \
++   DRI_CONF_OPT_B(v3d_maintain_ignorable_scanout, def, \
++                  "Maintain SCANOUT usage on resource allocations when the environment allows ignoring SCANOUT usage.")
++
+ /**
+  * \brief virgl specific configuration options
+  */
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0139-Add-a-hack-to-avoid-the-shadow-tex-update-for-import.patch b/projects/RPi/devices/RPi5/patches/mesa/0139-Add-a-hack-to-avoid-the-shadow-tex-update-for-import.patch
new file mode 100644
index 0000000000..a453a83892
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0139-Add-a-hack-to-avoid-the-shadow-tex-update-for-import.patch
@@ -0,0 +1,117 @@
+From fc1fe85f01a67ef6e5758f1022950ad79b1b305a Mon Sep 17 00:00:00 2001
+From: Neil Roberts <nroberts@igalia.com>
+Date: Mon, 5 Jul 2021 20:19:06 +0200
+Subject: [PATCH 139/142] Add a hack to avoid the shadow tex update for
+ imported linear texs
+
+This adds a hacky interface so that an application can override the
+mechanism used to detect when to update the shadow texture which is used
+when importing a linear texture. The application can enable this by
+calling:
+
+glTexParameteri(GL_TEXTURE_2D, GL_SYNC_CONDITION, 1);
+
+And then whenever it determines that the shadow texture should be
+updated it can call:
+
+glTexParameteri(GL_TEXTURE_2D, GL_SYNC_STATUS, 1);
+
+(cherry picked from commit 1269e2cfbfa876fdc85037b9435085174d76ad57)
+---
+ src/gallium/drivers/v3d/v3d_resource.c |  5 ++++-
+ src/gallium/include/pipe/p_state.h     |  4 ++++
+ src/mesa/main/mtypes.h                 |  3 +++
+ src/mesa/main/texparam.c               | 18 ++++++++++++++++++
+ 4 files changed, 29 insertions(+), 1 deletion(-)
+
+diff --git a/src/gallium/drivers/v3d/v3d_resource.c b/src/gallium/drivers/v3d/v3d_resource.c
+index 46de1b16ae0..8e31acb0ff0 100644
+--- a/src/gallium/drivers/v3d/v3d_resource.c
++++ b/src/gallium/drivers/v3d/v3d_resource.c
+@@ -1048,7 +1048,9 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
+ 
+         assert(view->texture != pview->texture);
+ 
+-        if (shadow->writes == orig->writes && orig->bo->private)
++        if (shadow->writes == orig->writes &&
++            orig->base.sync_status == 0 &&
++            (orig->bo->private || orig->base.sync_condition))
+                 return;
+ 
+         perf_debug("Updating %dx%d@%d shadow for linear texture\n",
+@@ -1091,6 +1093,7 @@ v3d_update_shadow_texture(struct pipe_context *pctx,
+         }
+ 
+         shadow->writes = orig->writes;
++        orig->base.sync_status = 0;
+ }
+ 
+ static struct pipe_surface *
+diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
+index 549e4d21c05..abc58552544 100644
+--- a/src/gallium/include/pipe/p_state.h
++++ b/src/gallium/include/pipe/p_state.h
+@@ -610,6 +610,10 @@ struct pipe_resource
+    unsigned bind;            /**< bitmask of PIPE_BIND_x */
+    unsigned flags;           /**< bitmask of PIPE_RESOURCE_FLAG_x */
+ 
++   /* Hack for avoiding sync on v3d */
++   unsigned sync_condition;
++   unsigned sync_status;
++
+    /**
+     * For planar images, ie. YUV EGLImage external, etc, pointer to the
+     * next plane.
+diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
+index 77c38bf48d5..1eb2dac8018 100644
+--- a/src/mesa/main/mtypes.h
++++ b/src/mesa/main/mtypes.h
+@@ -1058,6 +1058,9 @@ struct gl_texture_object
+      * the pipe_resource *pt above.
+      */
+     bool needs_validation;
++
++    /* Hack for avoiding sync on v3d */
++    GLboolean SyncCondition;
+ };
+ 
+ 
+diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
+index 001cc185722..139db3ce3e2 100644
+--- a/src/mesa/main/texparam.c
++++ b/src/mesa/main/texparam.c
+@@ -274,6 +274,13 @@ set_tex_parameteri(struct gl_context *ctx,
+    }
+ 
+    switch (pname) {
++   case GL_SYNC_CONDITION:
++      if (!!texObj->SyncCondition == !!params[0])
++         return GL_FALSE;
++      texObj->SyncCondition = !!params[0];
++      return GL_TRUE;
++   case GL_SYNC_STATUS:
++      return GL_TRUE;
+    case GL_TEXTURE_MIN_FILTER:
+       if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target))
+          goto invalid_dsa;
+@@ -931,6 +938,17 @@ _mesa_texture_parameter_invalidate(struct gl_context *ctx,
+ {
+    if (texparam_invalidates_sampler_views(pname))
+       st_texture_release_all_sampler_views(st_context(ctx), texObj);
++
++   switch (pname) {
++   case GL_SYNC_CONDITION:
++      texObj->pt->sync_condition = texObj->SyncCondition;
++      break;
++   case GL_SYNC_STATUS:
++      texObj->pt->sync_status = 1;
++      break;
++   default:
++      ; /* nothing */
++   }
+ }
+ 
+ void
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0140-vc4-Fix-mask-RGBA-validation-at-YUV-blit.patch b/projects/RPi/devices/RPi5/patches/mesa/0140-vc4-Fix-mask-RGBA-validation-at-YUV-blit.patch
new file mode 100644
index 0000000000..1336841a6a
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0140-vc4-Fix-mask-RGBA-validation-at-YUV-blit.patch
@@ -0,0 +1,29 @@
+From 270deb428f1de371492a5e6185fe410c4329eab4 Mon Sep 17 00:00:00 2001
+From: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
+Date: Mon, 25 Sep 2023 21:16:59 +0200
+Subject: [PATCH 140/142] vc4: Fix mask RGBA validation at YUV blit
+
+Solves regression on video players using GPU for
+video decoding that just displays the video in green.
+
+Fixes: d13da7782cd80 ("vc4: call blit paths in chain")
+---
+ src/gallium/drivers/vc4/vc4_blit.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
+index 2cf65b5f585..87b2369b7ad 100644
+--- a/src/gallium/drivers/vc4/vc4_blit.c
++++ b/src/gallium/drivers/vc4/vc4_blit.c
+@@ -347,7 +347,7 @@ vc4_yuv_blit(struct pipe_context *pctx, struct pipe_blit_info *info)
+         struct vc4_resource *dst = vc4_resource(info->dst.resource);
+         bool ok;
+ 
+-        if (info->mask & PIPE_MASK_RGBA)
++        if (!(info->mask & PIPE_MASK_RGBA))
+                 return;
+ 
+         if (src->tiled)
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0141-vc4-mark-buffers-as-initialized-at-vc4_texture_subda.patch b/projects/RPi/devices/RPi5/patches/mesa/0141-vc4-mark-buffers-as-initialized-at-vc4_texture_subda.patch
new file mode 100644
index 0000000000..e969ec933b
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0141-vc4-mark-buffers-as-initialized-at-vc4_texture_subda.patch
@@ -0,0 +1,175 @@
+From f843fbceb381f8c82074e8b68583fbfe57c48a6e Mon Sep 17 00:00:00 2001
+From: Jose Maria Casanova Crespo <jmcasanova@igalia.com>
+Date: Thu, 8 Jun 2023 00:57:15 +0200
+Subject: [PATCH 141/142] vc4: mark buffers as initialized at
+ vc4_texture_subdata
+
+This fixes several tests when the initially uploaded buffer
+from CPU was being ignored because vc4_texture_subdata was not
+marking the resource as written/initialized.
+
+The usage flags management available at vc4_resource_transfer_map
+is generalized into vc4_map_usage_prep and reused at both
+vc4_resource_transfer_map and vc4_texture_subdata. This makes the vc4
+implementation more similar to v3d.
+
+This fixes 7 tests in the following subgroups:
+  -dEQP-GLES2.functional.fbo.render.texsubimage.*
+  -dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.*
+  -spec@arb_clear_texture@arb_clear_texture-*
+
+Cc: mesa-stable
+Reviewed-by: Juan A. Suarez <jasuarez@igalia.com>
+Reviewed-by: Emma Anholt <emma@anholt.net>
+Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25297>
+---
+ src/broadcom/ci/broadcom-rpi3-fails.txt | 11 ----
+ src/gallium/drivers/vc4/vc4_resource.c  | 71 +++++++++++++++----------
+ 2 files changed, 44 insertions(+), 38 deletions(-)
+
+diff --git a/src/broadcom/ci/broadcom-rpi3-fails.txt b/src/broadcom/ci/broadcom-rpi3-fails.txt
+index 5522310d91a..e49e77b1436 100644
+--- a/src/broadcom/ci/broadcom-rpi3-fails.txt
++++ b/src/broadcom/ci/broadcom-rpi3-fails.txt
+@@ -18,11 +18,6 @@ dEQP-GLES2.functional.clipping.line.wide_line_clip_viewport_corner,Fail
+ 
+ dEQP-GLES2.functional.depth_stencil_clear.depth_stencil_masked,Fail
+ 
+-# A glTexImage, glDraw, glTexSubImage sequence into a texture is missing what looks like the drawing.
+-dEQP-GLES2.functional.fbo.render.texsubimage.after_render_tex2d_rgba,Fail
+-# A glTexImage, glDraw, glTexSubImage, glDraw sequence into a texture is missing what looks like the first drawing.
+-dEQP-GLES2.functional.fbo.render.texsubimage.between_render_tex2d_rgba,Fail
+-
+ # Sampling grid slightly off in test 2?
+ dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_mirror_rgba8888,Fail
+ dEQP-GLES2.functional.texture.filtering.2d.nearest_mipmap_linear_linear_repeat_rgba8888,Fail
+@@ -38,12 +33,6 @@ dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_clamp_non_square,Fa
+ dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_mirror_non_square,Fail
+ dEQP-GLES2.functional.texture.mipmap.2d.basic.nearest_linear_repeat_non_square,Fail
+ 
+-# Sequence of glTexImage, glDraw, glCopyTexSubImage.
+-# background red/green checkerboard on the left side is incorrectly white.
+-dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.2d_rgba,Fail
+-# Maybe it was copied as RGB instead of RGBA?
+-dEQP-GLES2.functional.texture.specification.basic_copytexsubimage2d.cube_rgba,Fail
+-
+ # One of the pixels on the left edge near the bottom is wrong for both min and
+ # mag.  Also a line of pixels through the image in minification.
+ dEQP-GLES2.functional.texture.wrap.clamp_clamp_nearest_npot_etc1,Fail
+diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
+index ad2791aa972..0a3a435a46c 100644
+--- a/src/gallium/drivers/vc4/vc4_resource.c
++++ b/src/gallium/drivers/vc4/vc4_resource.c
+@@ -95,34 +95,13 @@ vc4_resource_transfer_unmap(struct pipe_context *pctx,
+         slab_free(&vc4->transfer_pool, ptrans);
+ }
+ 
+-static void *
+-vc4_resource_transfer_map(struct pipe_context *pctx,
+-                          struct pipe_resource *prsc,
+-                          unsigned level, unsigned usage,
+-                          const struct pipe_box *box,
+-                          struct pipe_transfer **pptrans)
++static void
++vc4_map_usage_prep(struct pipe_context *pctx,
++                   struct pipe_resource *prsc,
++                   unsigned usage)
+ {
+         struct vc4_context *vc4 = vc4_context(pctx);
+         struct vc4_resource *rsc = vc4_resource(prsc);
+-        struct vc4_transfer *trans;
+-        struct pipe_transfer *ptrans;
+-        enum pipe_format format = prsc->format;
+-        char *buf;
+-
+-        /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is
+-         * being mapped.
+-         */
+-        if ((usage & PIPE_MAP_DISCARD_RANGE) &&
+-            !(usage & PIPE_MAP_UNSYNCHRONIZED) &&
+-            !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) &&
+-            prsc->last_level == 0 &&
+-            prsc->width0 == box->width &&
+-            prsc->height0 == box->height &&
+-            prsc->depth0 == box->depth &&
+-            prsc->array_size == 1 &&
+-            rsc->bo->private) {
+-                usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
+-        }
+ 
+         if (usage & PIPE_MAP_DISCARD_WHOLE_RESOURCE) {
+                 if (vc4_resource_bo_alloc(rsc)) {
+@@ -131,6 +110,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
+                          */
+                         if (prsc->bind & PIPE_BIND_VERTEX_BUFFER)
+                                 vc4->dirty |= VC4_DIRTY_VTXBUF;
++                        if (prsc->bind & PIPE_BIND_CONSTANT_BUFFER)
++                                vc4->dirty |= VC4_DIRTY_CONSTBUF;
+                 } else {
+                         /* If we failed to reallocate, flush users so that we
+                          * don't violate any syncing requirements.
+@@ -139,7 +120,7 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
+                 }
+         } else if (!(usage & PIPE_MAP_UNSYNCHRONIZED)) {
+                 /* If we're writing and the buffer is being used by the CL, we
+-                 * have to flush the CL first.  If we're only reading, we need
++                 * have to flush the CL first. If we're only reading, we need
+                  * to flush if the CL has written our buffer.
+                  */
+                 if (usage & PIPE_MAP_WRITE)
+@@ -152,6 +133,38 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
+                 rsc->writes++;
+                 rsc->initialized_buffers = ~0;
+         }
++}
++
++static void *
++vc4_resource_transfer_map(struct pipe_context *pctx,
++                          struct pipe_resource *prsc,
++                          unsigned level, unsigned usage,
++                          const struct pipe_box *box,
++                          struct pipe_transfer **pptrans)
++{
++        struct vc4_context *vc4 = vc4_context(pctx);
++        struct vc4_resource *rsc = vc4_resource(prsc);
++        struct vc4_transfer *trans;
++        struct pipe_transfer *ptrans;
++        enum pipe_format format = prsc->format;
++        char *buf;
++
++        /* Upgrade DISCARD_RANGE to WHOLE_RESOURCE if the whole resource is
++         * being mapped.
++         */
++        if ((usage & PIPE_MAP_DISCARD_RANGE) &&
++            !(usage & PIPE_MAP_UNSYNCHRONIZED) &&
++            !(prsc->flags & PIPE_RESOURCE_FLAG_MAP_PERSISTENT) &&
++            prsc->last_level == 0 &&
++            prsc->width0 == box->width &&
++            prsc->height0 == box->height &&
++            prsc->depth0 == box->depth &&
++            prsc->array_size == 1 &&
++            rsc->bo->private) {
++                usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
++        }
++
++        vc4_map_usage_prep(pctx, prsc, usage);
+ 
+         trans = slab_zalloc(&vc4->transfer_pool);
+         if (!trans)
+@@ -240,8 +253,12 @@ vc4_texture_subdata(struct pipe_context *pctx,
+         }
+ 
+         /* Otherwise, map and store the texture data directly into the tiled
+-         * texture.
++         * texture.  Note that gallium's texture_subdata may be called with
++         * obvious usage flags missing!
+          */
++        vc4_map_usage_prep(pctx, prsc, usage | (PIPE_MAP_WRITE |
++                                                PIPE_MAP_DISCARD_RANGE));
++
+         void *buf;
+         if (usage & PIPE_MAP_UNSYNCHRONIZED)
+                 buf = vc4_bo_map_unsynchronized(rsc->bo);
+-- 
+2.39.2
+
diff --git a/projects/RPi/devices/RPi5/patches/mesa/0142-gallium-Add-kmsro-drivers-for-RP1-DSI-DPI-and-VEC-de.patch b/projects/RPi/devices/RPi5/patches/mesa/0142-gallium-Add-kmsro-drivers-for-RP1-DSI-DPI-and-VEC-de.patch
new file mode 100644
index 0000000000..4055fc4658
--- /dev/null
+++ b/projects/RPi/devices/RPi5/patches/mesa/0142-gallium-Add-kmsro-drivers-for-RP1-DSI-DPI-and-VEC-de.patch
@@ -0,0 +1,43 @@
+From 3322c102282cf726ae575b122358060abd5b24db Mon Sep 17 00:00:00 2001
+From: Dave Stevenson <dave.stevenson@raspberrypi.com>
+Date: Thu, 5 Oct 2023 19:32:10 +0100
+Subject: [PATCH 142/142] gallium: Add kmsro drivers for RP1 DSI, DPI, and VEC
+ devices
+
+Signed-off-by: Dave Stevenson <dave.stevenson@raspberrypi.com>
+---
+ src/gallium/targets/dri/meson.build | 3 +++
+ src/gallium/targets/dri/target.c    | 3 +++
+ 2 files changed, 6 insertions(+)
+
+diff --git a/src/gallium/targets/dri/meson.build b/src/gallium/targets/dri/meson.build
+index fbec1da957b..59daf3b6fb6 100644
+--- a/src/gallium/targets/dri/meson.build
++++ b/src/gallium/targets/dri/meson.build
+@@ -68,6 +68,9 @@ libgallium_dri = shared_library(
+ 
+ foreach d : [[with_gallium_kmsro, [
+                'armada-drm_dri.so',
++               'drm-rp1-dpi_dri.so',
++               'drm-rp1-dsi_dri.so',
++               'drm-rp1-vec_dri.so',
+                'exynos_dri.so',
+                'hx8357d_dri.so',
+                'ili9225_dri.so',
+diff --git a/src/gallium/targets/dri/target.c b/src/gallium/targets/dri/target.c
+index d506869cbb4..ecb25edd03b 100644
+--- a/src/gallium/targets/dri/target.c
++++ b/src/gallium/targets/dri/target.c
+@@ -98,6 +98,9 @@ DEFINE_LOADER_DRM_ENTRYPOINT(tegra);
+ 
+ #if defined(GALLIUM_KMSRO)
+ DEFINE_LOADER_DRM_ENTRYPOINT(armada_drm)
++DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_dpi)
++DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_dsi)
++DEFINE_LOADER_DRM_ENTRYPOINT(drm_rp1_vec)
+ DEFINE_LOADER_DRM_ENTRYPOINT(exynos)
+ DEFINE_LOADER_DRM_ENTRYPOINT(hx8357d)
+ DEFINE_LOADER_DRM_ENTRYPOINT(ili9225)
+-- 
+2.39.2
+
diff --git a/projects/RPi/options b/projects/RPi/options
index 6088a1d474..d12ddc4b2f 100644
--- a/projects/RPi/options
+++ b/projects/RPi/options
@@ -77,6 +77,9 @@
   # default:  default mainline kernel
     LINUX="raspberrypi"
 
+  # use framebuffer console
+    EXTRA_CMDLINE="console=tty0"
+
 ################################################################################
 # setup build defaults
 ################################################################################
diff --git a/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf b/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf
index e7f96e9f50..cd28869656 100644
--- a/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf
+++ b/projects/Rockchip/devices/RK3288/linux/default/linux.arm.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm 6.1.0-rc6 Kernel Configuration
+# Linux/arm 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="armv7ve-libreelec-linux-gnueabihf-gcc-12.2.0 (GCC) 12.2.0"
 CONFIG_CC_IS_GCC=y
@@ -824,6 +824,7 @@ CONFIG_KMAP_LOCAL_NON_LINEAR_PTE_ARRAY=y
 # CONFIG_ANON_VMA_NAME is not set
 # CONFIG_USERFAULTFD is not set
 # CONFIG_LRU_GEN is not set
+CONFIG_LOCK_MM_AND_FIND_VMA=y
 
 #
 # Data Access Monitoring
@@ -1715,7 +1716,7 @@ CONFIG_SCSI_PROC_FS=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
@@ -3638,9 +3639,7 @@ CONFIG_MEDIA_ATTACH=y
 #
 CONFIG_VIDEO_IR_I2C=y
 
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_HI556 is not set
 # CONFIG_VIDEO_HI846 is not set
@@ -3707,7 +3706,6 @@ CONFIG_VIDEO_OV7640=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -6400,7 +6398,7 @@ CONFIG_CIFS_DEBUG=y
 CONFIG_CIFS_FSCACHE=y
 # CONFIG_CIFS_ROOT is not set
 # CONFIG_SMB_SERVER is not set
-CONFIG_SMBFS_COMMON=y
+CONFIG_SMBFS=y
 # CONFIG_CODA_FS is not set
 # CONFIG_AFS_FS is not set
 CONFIG_NLS=y
diff --git a/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf b/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf
index 3357ef60ea..dbbca81906 100644
--- a/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf
+++ b/projects/Rockchip/devices/RK3328/linux/default/linux.aarch64.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm64 6.1.0-rc6 Kernel Configuration
+# Linux/arm64 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="aarch64-none-elf-gcc-12.2.0 (GCC) 12.2.0"
 CONFIG_CC_IS_GCC=y
@@ -330,6 +330,7 @@ CONFIG_ARCH_ROCKCHIP=y
 #
 # ARM errata workarounds via the alternatives framework
 #
+# CONFIG_AMPERE_ERRATUM_AC03_CPU_38 is not set
 CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y
 CONFIG_ARM64_ERRATUM_826319=y
 CONFIG_ARM64_ERRATUM_827319=y
@@ -356,6 +357,7 @@ CONFIG_ARM64_LD_HAS_FIX_ERRATUM_843419=y
 # CONFIG_ARM64_ERRATUM_2054223 is not set
 # CONFIG_ARM64_ERRATUM_2067961 is not set
 # CONFIG_ARM64_ERRATUM_2441009 is not set
+# CONFIG_ARM64_ERRATUM_2966298 is not set
 # CONFIG_CAVIUM_ERRATUM_22375 is not set
 # CONFIG_CAVIUM_ERRATUM_23154 is not set
 # CONFIG_CAVIUM_ERRATUM_27456 is not set
@@ -905,6 +907,7 @@ CONFIG_SECRETMEM=y
 # CONFIG_ANON_VMA_NAME is not set
 # CONFIG_USERFAULTFD is not set
 # CONFIG_LRU_GEN is not set
+CONFIG_LOCK_MM_AND_FIND_VMA=y
 
 #
 # Data Access Monitoring
@@ -1756,7 +1759,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
@@ -3534,9 +3537,7 @@ CONFIG_MEDIA_ATTACH=y
 #
 CONFIG_VIDEO_IR_I2C=y
 
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_HI556 is not set
 # CONFIG_VIDEO_HI846 is not set
@@ -3603,7 +3604,6 @@ CONFIG_VIDEO_OV7640=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -6228,7 +6228,7 @@ CONFIG_CIFS_DEBUG=y
 CONFIG_CIFS_FSCACHE=y
 # CONFIG_CIFS_ROOT is not set
 # CONFIG_SMB_SERVER is not set
-CONFIG_SMBFS_COMMON=y
+CONFIG_SMBFS=y
 # CONFIG_CODA_FS is not set
 # CONFIG_AFS_FS is not set
 CONFIG_NLS=y
diff --git a/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf b/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf
index 1f3b91f485..4b0fcb72a7 100644
--- a/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf
+++ b/projects/Rockchip/devices/RK3399/linux/default/linux.aarch64.conf
@@ -1,6 +1,6 @@
 #
 # Automatically generated file; DO NOT EDIT.
-# Linux/arm64 6.1.0-rc6 Kernel Configuration
+# Linux/arm64 6.1.57 Kernel Configuration
 #
 CONFIG_CC_VERSION_TEXT="aarch64-none-elf-gcc-12.2.0 (GCC) 12.2.0"
 CONFIG_CC_IS_GCC=y
@@ -329,6 +329,7 @@ CONFIG_ARCH_ROCKCHIP=y
 #
 # ARM errata workarounds via the alternatives framework
 #
+# CONFIG_AMPERE_ERRATUM_AC03_CPU_38 is not set
 CONFIG_ARM64_WORKAROUND_CLEAN_CACHE=y
 CONFIG_ARM64_ERRATUM_826319=y
 CONFIG_ARM64_ERRATUM_827319=y
@@ -356,6 +357,7 @@ CONFIG_ARM64_ERRATUM_1319367=y
 # CONFIG_ARM64_ERRATUM_2054223 is not set
 # CONFIG_ARM64_ERRATUM_2067961 is not set
 # CONFIG_ARM64_ERRATUM_2441009 is not set
+# CONFIG_ARM64_ERRATUM_2966298 is not set
 # CONFIG_CAVIUM_ERRATUM_22375 is not set
 # CONFIG_CAVIUM_ERRATUM_23154 is not set
 # CONFIG_CAVIUM_ERRATUM_27456 is not set
@@ -906,6 +908,7 @@ CONFIG_SECRETMEM=y
 # CONFIG_ANON_VMA_NAME is not set
 # CONFIG_USERFAULTFD is not set
 # CONFIG_LRU_GEN is not set
+CONFIG_LOCK_MM_AND_FIND_VMA=y
 
 #
 # Data Access Monitoring
@@ -1862,7 +1865,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
@@ -4070,9 +4073,7 @@ CONFIG_MEDIA_ATTACH=y
 #
 CONFIG_VIDEO_IR_I2C=y
 
-#
-# Camera sensor devices
-#
+CONFIG_VIDEO_CAMERA_SENSOR=y
 # CONFIG_VIDEO_AR0521 is not set
 # CONFIG_VIDEO_HI556 is not set
 # CONFIG_VIDEO_HI846 is not set
@@ -4139,7 +4140,6 @@ CONFIG_VIDEO_OV7640=m
 # CONFIG_VIDEO_CCS is not set
 # CONFIG_VIDEO_ET8EK8 is not set
 # CONFIG_VIDEO_M5MOLS is not set
-# end of Camera sensor devices
 
 #
 # Lens drivers
@@ -7009,7 +7009,7 @@ CONFIG_CIFS_DEBUG=y
 CONFIG_CIFS_FSCACHE=y
 # CONFIG_CIFS_ROOT is not set
 # CONFIG_SMB_SERVER is not set
-CONFIG_SMBFS_COMMON=y
+CONFIG_SMBFS=y
 # CONFIG_CODA_FS is not set
 # CONFIG_AFS_FS is not set
 CONFIG_NLS=y
diff --git a/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch b/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch
index 21da17d29d..de24c472b0 100644
--- a/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch
+++ b/projects/Rockchip/patches/linux/default/linux-0002-rockchip-from-list.patch
@@ -1,78 +1,3 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jonas Karlman <jonas@kwiboo.se>
-Date: Sat, 10 Oct 2020 15:32:18 +0000
-Subject: [PATCH] phy/rockchip: inno-hdmi: use correct vco_div_5 macro on
- rk3328
-
-inno_hdmi_phy_rk3328_clk_set_rate() is using the RK3228 macro
-when configuring vco_div_5 on RK3328.
-
-Fix this by using correct vco_div_5 macro for RK3328.
-
-Fixes: 53706a116863 ("phy: add Rockchip Innosilicon hdmi phy")
-Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
----
- drivers/phy/rockchip/phy-rockchip-inno-hdmi.c | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-index 80acca4e9e14..15339338aae3 100644
---- a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-+++ b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-@@ -790,8 +790,8 @@ static int inno_hdmi_phy_rk3328_clk_set_rate(struct clk_hw *hw,
- 			 RK3328_PRE_PLL_POWER_DOWN);
- 
- 	/* Configure pre-pll */
--	inno_update_bits(inno, 0xa0, RK3228_PCLK_VCO_DIV_5_MASK,
--			 RK3228_PCLK_VCO_DIV_5(cfg->vco_div_5_en));
-+	inno_update_bits(inno, 0xa0, RK3328_PCLK_VCO_DIV_5_MASK,
-+			 RK3328_PCLK_VCO_DIV_5(cfg->vco_div_5_en));
- 	inno_write(inno, 0xa1, RK3328_PRE_PLL_PRE_DIV(cfg->prediv));
- 
- 	val = RK3328_SPREAD_SPECTRUM_MOD_DISABLE;
-
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Zheng Yang <zhengyang@rock-chips.com>
-Date: Sat, 10 Oct 2020 15:32:18 +0000
-Subject: [PATCH] phy/rockchip: inno-hdmi: round fractal pixclock in rk3328
- recalc_rate
-
-inno_hdmi_phy_rk3328_clk_recalc_rate() is returning a rate not found
-in the pre pll config table when the fractal divider is used.
-This can prevent proper power_on because a tmdsclock for the new rate
-is not found in the pre pll config table.
-
-Fix this by saving and returning a rounded pixel rate that exist
-in the pre pll config table.
-
-Fixes: 53706a116863 ("phy: add Rockchip Innosilicon hdmi phy")
-Signed-off-by: Zheng Yang <zhengyang@rock-chips.com>
-Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
----
- drivers/phy/rockchip/phy-rockchip-inno-hdmi.c | 8 +++++---
- 1 file changed, 5 insertions(+), 3 deletions(-)
-
-diff --git a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-index 15339338aae3..15a008a1ac7b 100644
---- a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-+++ b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-@@ -745,10 +745,12 @@ unsigned long inno_hdmi_phy_rk3328_clk_recalc_rate(struct clk_hw *hw,
- 		do_div(vco, (nd * (no_a == 1 ? no_b : no_a) * no_d * 2));
- 	}
- 
--	inno->pixclock = vco;
--	dev_dbg(inno->dev, "%s rate %lu\n", __func__, inno->pixclock);
-+	inno->pixclock = DIV_ROUND_CLOSEST((unsigned long)vco, 1000) * 1000;
- 
--	return vco;
-+	dev_dbg(inno->dev, "%s rate %lu vco %llu\n",
-+		__func__, inno->pixclock, vco);
-+
-+	return inno->pixclock;
- }
- 
- static long inno_hdmi_phy_rk3328_clk_round_rate(struct clk_hw *hw,
-
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Jonas Karlman <jonas@kwiboo.se>
 Date: Sat, 10 Oct 2020 15:32:19 +0000
@@ -110,53 +35,6 @@ index 15a008a1ac7b..4b936ca19920 100644
  
  		do_div(vco, (nd * (no_a == 1 ? no_b : no_a) * no_d * 2));
 
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jonas Karlman <jonas@kwiboo.se>
-Date: Sat, 10 Oct 2020 15:32:19 +0000
-Subject: [PATCH] phy/rockchip: inno-hdmi: do not power on rk3328 post pll on
- reg write
-
-inno_write is used to configure 0xaa reg, that also hold the
-POST_PLL_POWER_DOWN bit.
-When POST_PLL_REFCLK_SEL_TMDS is configured the power down bit is not
-taken into consideration.
-
-Fix this by keeping the power down bit until configuration is complete.
-Also reorder the reg write order for consistency.
-
-Fixes: 53706a116863 ("phy: add Rockchip Innosilicon hdmi phy")
-Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
----
- drivers/phy/rockchip/phy-rockchip-inno-hdmi.c | 6 ++++--
- 1 file changed, 4 insertions(+), 2 deletions(-)
-
-diff --git a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-index 4b936ca19920..620961fcfc1d 100644
---- a/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-+++ b/drivers/phy/rockchip/phy-rockchip-inno-hdmi.c
-@@ -1020,9 +1020,10 @@ inno_hdmi_phy_rk3328_power_on(struct inno_hdmi_phy *inno,
- 
- 	inno_write(inno, 0xac, RK3328_POST_PLL_FB_DIV_7_0(cfg->fbdiv));
- 	if (cfg->postdiv == 1) {
--		inno_write(inno, 0xaa, RK3328_POST_PLL_REFCLK_SEL_TMDS);
- 		inno_write(inno, 0xab, RK3328_POST_PLL_FB_DIV_8(cfg->fbdiv) |
- 			   RK3328_POST_PLL_PRE_DIV(cfg->prediv));
-+		inno_write(inno, 0xaa, RK3328_POST_PLL_REFCLK_SEL_TMDS |
-+			   RK3328_POST_PLL_POWER_DOWN);
- 	} else {
- 		v = (cfg->postdiv / 2) - 1;
- 		v &= RK3328_POST_PLL_POST_DIV_MASK;
-@@ -1030,7 +1031,8 @@ inno_hdmi_phy_rk3328_power_on(struct inno_hdmi_phy *inno,
- 		inno_write(inno, 0xab, RK3328_POST_PLL_FB_DIV_8(cfg->fbdiv) |
- 			   RK3328_POST_PLL_PRE_DIV(cfg->prediv));
- 		inno_write(inno, 0xaa, RK3328_POST_PLL_POST_DIV_ENABLE |
--			   RK3328_POST_PLL_REFCLK_SEL_TMDS);
-+			   RK3328_POST_PLL_REFCLK_SEL_TMDS |
-+			   RK3328_POST_PLL_POWER_DOWN);
- 	}
- 
- 	for (v = 0; v < 14; v++)
-
 From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
 From: Huicong Xu <xhc@rock-chips.com>
 Date: Sat, 10 Oct 2020 15:32:20 +0000
diff --git a/projects/Samsung/linux/linux.arm.conf b/projects/Samsung/linux/linux.arm.conf
index 3f1eb18a31..b4e17e3030 100644
--- a/projects/Samsung/linux/linux.arm.conf
+++ b/projects/Samsung/linux/linux.arm.conf
@@ -1575,7 +1575,7 @@ CONFIG_SCSI_DMA=y
 CONFIG_BLK_DEV_SD=y
 # CONFIG_CHR_DEV_ST is not set
 CONFIG_BLK_DEV_SR=y
-# CONFIG_CHR_DEV_SG is not set
+CONFIG_CHR_DEV_SG=m
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_CHR_DEV_SCH is not set
 # CONFIG_SCSI_CONSTANTS is not set
diff --git a/scripts/image b/scripts/image
index e5a91214c6..a24f892022 100755
--- a/scripts/image
+++ b/scripts/image
@@ -509,6 +509,9 @@ if [ "${1}" = "release" -o "${1}" = "mkimage" -o "${1}" = "noobs" ]; then
         -e "s%@SYSTEM_SIZE@%${SYSTEM_SIZE}%g" \
         -i ${RELEASE_DIR}/${NOOBS_DISTRO}/partitions.json
 
+    sed -e "s%@EXTRA_CMDLINE@%${EXTRA_CMDLINE}%g" \
+        -i ${RELEASE_DIR}/${NOOBS_DISTRO}/partition_setup.sh
+
     # Create System dir
     mkdir -p ${RELEASE_DIR}/${NOOBS_DISTRO}/System
 
@@ -520,6 +523,12 @@ if [ "${1}" = "release" -o "${1}" = "mkimage" -o "${1}" = "noobs" ]; then
       fi
     done
     cp -PR ${INSTALL}/usr/share/bootloader/LICENCE* ${RELEASE_DIR}/${NOOBS_DISTRO}/System/
+    for f in bootcode.bin fixup.dat start.elf ; do
+      if [ -f "${INSTALL}/usr/share/bootloader/$f" ]; then
+        cp -PR "${INSTALL}/usr/share/bootloader/$f" "${RELEASE_DIR}/${NOOBS_DISTRO}/System/"
+      fi
+    done
+
     cp -PR ${INSTALL}/usr/share/bootloader/bootcode.bin ${RELEASE_DIR}/${NOOBS_DISTRO}/System/
     cp -PR ${INSTALL}/usr/share/bootloader/fixup.dat ${RELEASE_DIR}/${NOOBS_DISTRO}/System/
     cp -PR ${INSTALL}/usr/share/bootloader/start.elf ${RELEASE_DIR}/${NOOBS_DISTRO}/System/
diff --git a/scripts/mkimage b/scripts/mkimage
index 0213f19019..ffb2c7c9b7 100755
--- a/scripts/mkimage
+++ b/scripts/mkimage
@@ -284,9 +284,12 @@ EOF
   mcopy "${RELEASE_DIR}/target/KERNEL.md5" "::/${KERNEL_NAME}.md5" >"${SAVE_ERROR}" 2>&1 || show_error
   mcopy "${RELEASE_DIR}/target/SYSTEM.md5" ::/SYSTEM.md5 >"${SAVE_ERROR}" 2>&1 || show_error
 
-  mcopy "${RELEASE_DIR}/3rdparty/bootloader/bootcode.bin" :: >"${SAVE_ERROR}" 2>&1 || show_error
-  mcopy "${RELEASE_DIR}/3rdparty/bootloader/fixup.dat" :: >"${SAVE_ERROR}" 2>&1 || show_error
-  mcopy "${RELEASE_DIR}/3rdparty/bootloader/start.elf" :: >"${SAVE_ERROR}" 2>&1 || show_error
+  for f in bootcode.bin fixup.dat start.elf ; do
+    if [ -f "${RELEASE_DIR}/3rdparty/bootloader/$f" ]; then
+      mcopy "${RELEASE_DIR}/3rdparty/bootloader/$f" :: >"${SAVE_ERROR}" 2>&1 || show_error
+    fi
+  done
+
   mcopy "${RELEASE_DIR}/3rdparty/bootloader/config.txt" :: >"${SAVE_ERROR}" 2>&1 || show_error
   for distro in "${RELEASE_DIR}/3rdparty/bootloader/distroconfig"*.txt ; do
     if [ -f "${distro}" ]; then