x86: dsputil: port to cpuflags

2025-02-18 19:10:21 +00:00 · 2012-07-15 15:45:11 +02:00 · 2012-07-15 15:45:11 +02:00 · 8c3849bc76
commit 8c3849bc76
parent 8ff0f7763d
2 changed files with 120 additions and 128 deletions
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@ -33,9 +33,9 @@ pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

 SECTION_TEXT

-%macro SCALARPRODUCT 1
+%macro SCALARPRODUCT 0
 ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
-cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
+cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
@ -62,7 +62,7 @@ cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
    RET

 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
+cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
 %if mmsize == 16
@ -107,10 +107,10 @@ cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    RET
 %endmacro

-INIT_MMX
-SCALARPRODUCT mmxext
-INIT_XMM
-SCALARPRODUCT sse2
+INIT_MMX mmxext
+SCALARPRODUCT
+INIT_XMM sse2
+SCALARPRODUCT

 %macro SCALARPRODUCT_LOOP 1
 align 16
@ -158,7 +158,8 @@ align 16
 %endmacro

 ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
-cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
+INIT_XMM ssse3
+cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
@ -207,48 +208,60 @@ SCALARPRODUCT_LOOP 0
 ;                            const int16_t *window, unsigned int len)
 ;-----------------------------------------------------------------------------

-%macro REVERSE_WORDS_MMXEXT 1-2
-    pshufw   %1, %1, 0x1B
-%endmacro
-
-%macro REVERSE_WORDS_SSE2 1-2
+%macro REVERSE_WORDS 1-2
+%if cpuflag(ssse3) && notcpuflag(atom)
+    pshufb  %1, %2
+%elif cpuflag(sse2)
    pshuflw  %1, %1, 0x1B
    pshufhw  %1, %1, 0x1B
    pshufd   %1, %1, 0x4E
+%elif cpuflag(mmxext)
+    pshufw   %1, %1, 0x1B
+%endif
 %endmacro

-%macro REVERSE_WORDS_SSSE3 2
-    pshufb  %1, %2
-%endmacro
-
+%macro MUL16FIXED 3
+%if cpuflag(ssse3) ; dst, src, unused
+; dst = ((dst * src) + (1<<14)) >> 15
+    pmulhrsw   %1, %2
+%elif cpuflag(mmxext) ; dst, src, temp
 ; dst = (dst * src) >> 15
 ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
 ; in from the pmullw result.
-%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova    %3, %1
    pmulhw  %1, %2
    pmullw  %3, %2
    psrlw   %3, 15
    psllw   %1, 1
    por     %1, %3
+%endif
 %endmacro

-; dst = ((dst * src) + (1<<14)) >> 15
-%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
-    pmulhrsw   %1, %2
-%endmacro
-
-%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
-cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
+%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
+%if %1
+cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
+%else
+cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
+%endif
    lea     offset2q, [offsetq-mmsize]
-%if %2
-    mova          m5, [pd_16384]
-%elifidn %1, ssse3
+%if cpuflag(ssse3) && notcpuflag(atom)
    mova          m5, [pb_revwords]
    ALIGN 16
+%elif %1
+    mova          m5, [pd_16384]
 %endif
 .loop:
-%if %2
+%if cpuflag(ssse3)
+    ; This version does the 16x16->16 multiplication in-place without expanding
+    ; to 32-bit. The ssse3 version is bit-identical.
+    mova          m0, [windowq+offset2q]
+    mova          m1, [ inputq+offset2q]
+    pmulhrsw      m1, m0
+    REVERSE_WORDS m0, m5
+    pmulhrsw      m0, [ inputq+offsetq ]
+    mova  [outputq+offset2q], m1
+    mova  [outputq+offsetq ], m0
+%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
@ -284,16 +297,6 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    psrad         m2, 15
    packssdw      m0, m2
    mova  [outputq+offsetq], m0
-%elif %3
-    ; This version does the 16x16->16 multiplication in-place without expanding
-    ; to 32-bit. The ssse3 version is bit-identical.
-    mova          m0, [windowq+offset2q]
-    mova          m1, [ inputq+offset2q]
-    pmulhrsw      m1, m0
-    REVERSE_WORDS m0, m5
-    pmulhrsw      m0, [ inputq+offsetq ]
-    mova  [outputq+offset2q], m1
-    mova  [outputq+offsetq ], m0
 %else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
@ -313,22 +316,24 @@ cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    REP_RET
 %endmacro

-INIT_MMX
-%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
-%define MUL16FIXED MUL16FIXED_MMXEXT
-APPLY_WINDOW_INT16 mmxext,     0, 0
-APPLY_WINDOW_INT16 mmxext_ba,  1, 0
-INIT_XMM
-%define REVERSE_WORDS REVERSE_WORDS_SSE2
-APPLY_WINDOW_INT16 sse2,       0, 0
-APPLY_WINDOW_INT16 sse2_ba,    1, 0
-APPLY_WINDOW_INT16 ssse3_atom, 0, 1
-%define REVERSE_WORDS REVERSE_WORDS_SSSE3
-APPLY_WINDOW_INT16 ssse3,      0, 1
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 0
+INIT_XMM sse2
+APPLY_WINDOW_INT16 0
+
+INIT_MMX mmxext
+APPLY_WINDOW_INT16 1
+INIT_XMM sse2
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3
+APPLY_WINDOW_INT16 1
+INIT_XMM ssse3, atom
+APPLY_WINDOW_INT16 1


 ; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
-cglobal add_hfyu_median_prediction_mmxext, 6,6,0, dst, top, diff, w, left, left_top
+INIT_MMX mmxext
+cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
@ -430,8 +435,8 @@ cglobal add_hfyu_median_prediction_mmxext, 6,6,0, dst, top, diff, w, left, left_
 %endmacro

 ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
-INIT_MMX
-cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
+INIT_MMX ssse3
+cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
 .skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
@ -440,8 +445,8 @@ cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

-INIT_XMM
-cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
+INIT_XMM sse4
+cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
@ -460,7 +465,8 @@ cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left


 ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
-cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
+INIT_XMM sse
+cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
@ -1243,15 +1249,20 @@ BUTTERFLIES_FLOAT_INTERLEAVE
 INIT_YMM avx
 BUTTERFLIES_FLOAT_INTERLEAVE

-INIT_XMM sse2
 ; %1 = aligned/unaligned
-%macro BSWAP_LOOPS_SSE2  1
+%macro BSWAP_LOOPS  1
    mov      r3, r2
    sar      r2, 3
    jz       .left4_%1
 .loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
+%if cpuflag(ssse3)
+    pshufb   m0, m2
+    pshufb   m1, m2
+    mova     [r0 +  0], m0
+    mova     [r0 + 16], m1
+%else
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
@ -1266,8 +1277,9 @@ INIT_XMM sse2
    por      m3, m1
    mova     [r0 +  0], m2
    mova     [r0 + 16], m3
-    add      r1, 32
+%endif
    add      r0, 32
+    add      r1, 32
    dec      r2
    jnz      .loop8_%1
 .left4_%1:
@ -1275,6 +1287,10 @@ INIT_XMM sse2
    and      r3, 4
    jz       .left
    mov%1    m0, [r1]
+%if cpuflag(ssse3)
+    pshufb   m0, m2
+    mova     [r0], m0
+%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
@ -1282,72 +1298,29 @@ INIT_XMM sse2
    psrlw    m2, 8
    por      m2, m0
    mova     [r0], m2
+%endif
    add      r1, 16
    add      r0, 16
 %endmacro

 ; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
-cglobal bswap32_buf, 3,4,5
-    mov      r3, r1
-    and      r3, 15
-    jz       .start_align
-    BSWAP_LOOPS_SSE2  u
-    jmp      .left
-.start_align:
-    BSWAP_LOOPS_SSE2  a
-.left:
-    and      r2, 3
-    jz       .end
-.loop2:
-    mov      r3d, [r1]
-    bswap    r3d
-    mov      [r0], r3d
-    add      r1, 4
-    add      r0, 4
-    dec      r2
-    jnz      .loop2
-.end:
-    RET
-
-; %1 = aligned/unaligned
-%macro BSWAP_LOOPS_SSSE3  1
-    mov      r3, r2
-    sar      r2, 3
-    jz       .left4_%1
-.loop8_%1:
-    mov%1    m0, [r1 +  0]
-    mov%1    m1, [r1 + 16]
-    pshufb   m0, m2
-    pshufb   m1, m2
-    mova     [r0 +  0], m0
-    mova     [r0 + 16], m1
-    add      r0, 32
-    add      r1, 32
-    dec      r2
-    jnz      .loop8_%1
-.left4_%1:
-    mov      r2, r3
-    and      r3, 4
-    jz       .left2
-    mov%1    m0, [r1]
-    pshufb   m0, m2
-    mova     [r0], m0
-    add      r1, 16
-    add      r0, 16
-%endmacro
-
-INIT_XMM ssse3
-; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
+%macro BSWAP32_BUF 0
+%if cpuflag(ssse3)
 cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]
+%else
+cglobal bswap32_buf, 3,4,5
+    mov      r3, r1
+%endif
    and      r3, 15
    jz       .start_align
-    BSWAP_LOOPS_SSSE3  u
-    jmp      .left2
+    BSWAP_LOOPS  u
+    jmp      .left
 .start_align:
-    BSWAP_LOOPS_SSSE3  a
-.left2:
+    BSWAP_LOOPS  a
+.left:
+%if cpuflag(ssse3)
    mov      r3, r2
    and      r2, 2
    jz       .left1
@ -1362,5 +1335,24 @@ cglobal bswap32_buf, 3,4,3
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
+%else
+    and      r2, 3
+    jz       .end
+.loop2:
+    mov      r3d, [r1]
+    bswap    r3d
+    mov      [r0], r3d
+    add      r1, 4
+    add      r0, 4
+    dec      r2
+    jnz      .loop2
+%endif
 .end:
    RET
+%endmacro
+
+INIT_XMM sse2
+BSWAP32_BUF
+
+INIT_XMM ssse3
+BSWAP32_BUF
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@ -2297,16 +2297,16 @@ int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

-void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
-                                      const int16_t *window, unsigned int len);
-void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
-                                      const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
-                                      const int16_t *window, unsigned int len);
-void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
-                                      const int16_t *window, unsigned int len);
-void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
+void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
+                                        const int16_t *window, unsigned int len);
+void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
+void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
+                                  const int16_t *window, unsigned int len);
+void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
+                                const int16_t *window, unsigned int len);
+void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
+                                 const int16_t *window, unsigned int len);
 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

@ -2568,9 +2568,9 @@ static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
-        c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
-    } else {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
+    } else {
+        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
    }
 #endif /* HAVE_YASM */
 }
@ -2758,9 +2758,9 @@ static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    }
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
-        c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
-    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
+    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
+        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    }
    c->bswap_buf = ff_bswap32_buf_sse2;
 #endif /* HAVE_YASM */