From cdb1665f70def544ddab3e3ed3763ef99c8b3873 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin@martin.st>
Date: Fri, 25 Mar 2016 23:44:10 +0200
Subject: [PATCH] aarch64: Make transpose_4x4H do a regular transpose
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, ff_h264_idct_add_neon (originally in the arm version) used
a non-regular transpose in order to be able to use more instructions
that deal with registers as 128-bit register pairs. The aarch64
translation doesn't use such instructions to the same extent, but it
carried over the same structure since it was a straight translation.

This reshuffles ff_h264_idct_add_neon, bringing it closer to
the C implementation, and makes the transpose_4x4H macro do a regular
transpose, so that it is usable for other algorithms as well.

Previously, the third and fourth outputs from transpose_4x4H were
swapped, and prior to cc29d96d5a, the third and fourth inputs were
swapped as well. In addition to swapping the outputs, this also
renumbers the intermediate registers for better readability (making
the register order match transpose_4x8B).

This runs with the same number of cycles as before.

Signed-off-by: Martin Storsjö <martin@martin.st>
---
 libavcodec/aarch64/h264idct_neon.S | 24 ++++++++++++------------
 libavcodec/aarch64/neon.S          | 12 ++++++------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/libavcodec/aarch64/h264idct_neon.S b/libavcodec/aarch64/h264idct_neon.S
index 78f780a632..5395e146ef 100644
--- a/libavcodec/aarch64/h264idct_neon.S
+++ b/libavcodec/aarch64/h264idct_neon.S
@@ -33,25 +33,25 @@ function ff_h264_idct_add_neon, export=1
         sshr            v17.4H, v3.4H,  #1
         st1             {v30.8H},    [x1], #16
         sub             v5.4H,  v0.4H,  v2.4H
-        add             v6.4H,  v1.4H,  v17.4H
-        sub             v7.4H,  v16.4H, v3.4H
-        add             v0.4H,  v4.4H,  v6.4H
-        add             v1.4H,  v5.4H,  v7.4H
-        sub             v3.4H,  v4.4H,  v6.4H
-        sub             v2.4H,  v5.4H,  v7.4H
+        sub             v6.4H,  v16.4H, v3.4H
+        add             v7.4H,  v1.4H,  v17.4H
+        add             v0.4H,  v4.4H,  v7.4H
+        add             v1.4H,  v5.4H,  v6.4H
+        sub             v2.4H,  v5.4H,  v6.4H
+        sub             v3.4H,  v4.4H,  v7.4H
 
         transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7
 
-        add             v4.4H,  v0.4H,  v3.4H
+        add             v4.4H,  v0.4H,  v2.4H
         ld1             {v18.S}[0], [x0], x2
-        sshr            v16.4H,  v2.4H,  #1
+        sshr            v16.4H,  v3.4H,  #1
         sshr            v17.4H,  v1.4H,  #1
-        ld1             {v19.S}[1], [x0], x2
-        sub             v5.4H,  v0.4H,  v3.4H
         ld1             {v18.S}[1], [x0], x2
+        sub             v5.4H,  v0.4H,  v2.4H
+        ld1             {v19.S}[1], [x0], x2
         add             v6.4H,  v16.4H, v1.4H
         ins             v4.D[1],  v5.D[0]
-        sub             v7.4H,  v2.4H,  v17.4H
+        sub             v7.4H,  v17.4H, v3.4H
         ld1             {v19.S}[0], [x0], x2
         ins             v6.D[1],  v7.D[0]
         sub             x0,  x0,  x2, lsl #2
@@ -68,8 +68,8 @@ function ff_h264_idct_add_neon, export=1
         sqxtun          v1.8B, v1.8H
 
         st1             {v0.S}[0],  [x0], x2
-        st1             {v1.S}[1],  [x0], x2
         st1             {v0.S}[1],  [x0], x2
+        st1             {v1.S}[1],  [x0], x2
         st1             {v1.S}[0],  [x0], x2
 
         sub             x1,  x1,  #32
diff --git a/libavcodec/aarch64/neon.S b/libavcodec/aarch64/neon.S
index 767bc9d455..377009e244 100644
--- a/libavcodec/aarch64/neon.S
+++ b/libavcodec/aarch64/neon.S
@@ -107,12 +107,12 @@
 .macro  transpose_4x4H  r0, r1, r2, r3, r4, r5, r6, r7
         trn1            \r4\().4H,  \r0\().4H,  \r1\().4H
         trn2            \r5\().4H,  \r0\().4H,  \r1\().4H
-        trn1            \r7\().4H,  \r2\().4H,  \r3\().4H
-        trn2            \r6\().4H,  \r2\().4H,  \r3\().4H
-        trn1            \r0\().2S,  \r4\().2S,  \r7\().2S
-        trn2            \r3\().2S,  \r4\().2S,  \r7\().2S
-        trn1            \r1\().2S,  \r5\().2S,  \r6\().2S
-        trn2            \r2\().2S,  \r5\().2S,  \r6\().2S
+        trn1            \r6\().4H,  \r2\().4H,  \r3\().4H
+        trn2            \r7\().4H,  \r2\().4H,  \r3\().4H
+        trn1            \r0\().2S,  \r4\().2S,  \r6\().2S
+        trn2            \r2\().2S,  \r4\().2S,  \r6\().2S
+        trn1            \r1\().2S,  \r5\().2S,  \r7\().2S
+        trn2            \r3\().2S,  \r5\().2S,  \r7\().2S
 .endm
 
 .macro  transpose_8x8H  r0, r1, r2, r3, r4, r5, r6, r7, r8, r9