dos/exe improvements, not enabled yet.

2024-11-30 16:11:08 +00:00 · 2007-03-06 15:08:24 +01:00 · 2007-03-06 15:08:24 +01:00 · d29c8df45e
commit d29c8df45e
parent d9722ad6b3
5 changed files with 695 additions and 692 deletions
--- a/src/stub/src/arch/i086/cc_test.c
+++ b/src/stub/src/arch/i086/cc_test.c
@ -103,9 +103,6 @@ void __pascal p4fshlv_v(unsigned char v, uint32_t __far *a) { *a <<= v; }
 uint32_t __pascal p4nshlv(unsigned char v, uint32_t __near *a) { return *a <<= v; }
 uint32_t __pascal p4fshlv(unsigned char v, uint32_t __far *a) { return *a <<= v; }

-uint32_t __cdecl shlv_2(uint16_t h, uint16_t l, unsigned v)
-{ uint16_t x = l >> (16 - v); l <<= v; h <<= v; h |= x; return h * 65536ul + l; }
-
 hptrdiff_t __cdecl hptr2int(hptr a) { return (hptrdiff_t) a; }
 hptr __cdecl int2hptr(hptrdiff_t a) { return (hptr) a; }

--- a/src/stub/src/arch/i086/cleanasm.py
+++ b/src/stub/src/arch/i086/cleanasm.py
@ -195,29 +195,42 @@ def main(argv):
        if opts.call_rewrite and inst in ["call"]:
            k, v = parse_label(inst, args)
            if v[:2] == [1, 2]:     # external 2-byte
+                if k == "__aNahdiff":
+                    s = [
+                        ["push", "word ptr [bp+8]"],
+                        ["push", "word ptr [bp+6]"],
+                        ["push", r"word ptr \[bp([+-]\d+)\]$"],
+                        ["push", r"word ptr \[bp([+-]\d+)\]$"],
+                    ]
+                    dpos = omatch(i-1, -4, s)
+                    if dpos:
+                        orewrite_inst(i, "*DEL*", "", dpos)
+                        continue
                if k in ["__LMUL", "__U4M",]:
+                    s1 = [
+                        ["mov",  "bx,0x300"],
+                        ["xor",  "cx,cx"],
+                    ]
+                    s2 = [
+                        ["shl",  "ax,1"],
+                        ["rcl",  "dx,1"],
+                    ]
+                    dpos1 = omatch(i-1, -2, s1)
+                    dpos2 = omatch(i+1,  2, s2)
+                    if dpos1 and dpos2:
+                        orewrite_inst(i, "M_U4M_dxax_0x0600", "", dpos1 + dpos2)
+                        continue
                    s = [
                        ["mov",  "bx,word ptr [bx]"],
                        ["xor",  "cx,cx"],
                    ]
                    dpos = omatch(i-1, -2, s, debug=0)
                    if 0 and dpos:
-                        orewrite_inst(i, "M_LMUL_dxax_00bx_ptr", "", dpos)
+                        orewrite_inst(i, "M_U4M_dxax_00bx_ptr", "", dpos)
                        continue
                    dpos = omatch(i-1, -1, s)
                    if dpos:
-                        orewrite_inst(i, "M_LMUL_dxax_00bx", "", dpos)
-                        continue
-                if k == "__aNahdiff":
-                    s = [
-                        ["push", "word ptr [bp+8]"],
-                        ["push", "word ptr [bp+6]"],
-                        ["push", "word ptr [bp-66]"],
-                        ["push", "word ptr [bp-68]"],
-                    ]
-                    dpos = omatch(i-1, -4, s)
-                    if dpos:
-                        orewrite_inst(i, "*DEL*", "", dpos)
+                        orewrite_inst(i, "M_U4M_dxax_00bx", "", dpos)
                        continue
                if k == "__PIA":
                    s = [
@ -257,6 +270,36 @@ def main(argv):
            if dpos:
                orewrite_inst(i, "M_shld_8", "", dpos)
                continue
+            s1 = [
+                ["mov",  r"^c[lx],0x8$"],
+                ["shl",  "si,1"],
+                ["rcl",  "di,1"],
+            ]
+            s2 = [
+                ["les",  r"^bx,dword ptr \[bp([+-]\d+)\]$"],
+            ]
+            dpos1 = omatch(i-1, -3, s1)
+            dpos2 = omatch(i+1,  1, s2)
+            if 1 and dpos1 and dpos2:
+                # bx and cx are free for use
+                orewrite_inst(i, "M_shld_disi_8_bxcx", "", dpos1)
+                continue
+            s1 = [
+                ["mov",  "ax,si"],
+                ["mov",  r"^c[lx],0x8$"],
+                ["shl",  "ax,1"],
+                ["rcl",  "di,1"],
+            ]
+            s2 = [
+                ["mov",  "si,ax"],
+                ["les",  r"^bx,dword ptr \[bp([+-]\d+)\]$"],
+            ]
+            dpos1 = omatch(i-1, -4, s1)
+            dpos2 = omatch(i+1,  2, s2)
+            if 1 and dpos1 and dpos2:
+                # bx and cx are free for use
+                orewrite_inst(i, "M_shld_diax_8_bxcx", "", dpos1[-3:])
+                continue
            s1 = [
                ["mov",  r"^c[lx],0x8$"],
                ["shl",  r"^word ptr \[bp([+-]\d+)\],1$"],
--- a/src/stub/src/arch/i086/lzma_d_cf.S
+++ b/src/stub/src/arch/i086/lzma_d_cf.S
--- a/src/stub/src/arch/i086/lzma_d_cs.S
+++ b/src/stub/src/arch/i086/lzma_d_cs.S
--- a/src/stub/src/arch/i086/lzma_m.h
+++ b/src/stub/src/arch/i086/lzma_m.h
@ -144,11 +144,11 @@ L1:


 /*************************************************************************
-// support macros: LMUL, shld, shrd
+// support macros: U4M, shld, shrd
 **************************************************************************/

 // umul32: dx:ax = dx:ax * 00:bx
-.macro  M_LMUL_dxax_00bx
+.macro  M_U4M_dxax_00bx
        // mult high-word
        mov     cx, ax      // save ax
        mov     ax, dx
@ -162,7 +162,7 @@ L1:


 // umul32: dx:ax = dx:ax * word ptr [bx]
-.macro  M_LMUL_dxax_00bx_ptr
+.macro  M_U4M_dxax_00bx_ptr
        // mult high-word
        mov     cx, ax      // save ax
        mov     ax, dx
@ -176,7 +176,7 @@ L1:


 // umul32: dx:ax = ax:cx * 00:bx
-.macro  M_LMUL_axcx_00bx
+.macro  M_U4M_axcx_00bx
        // mult high-word
        mul     bx
        xchg    ax, cx      // save high-word result, get low
@ -187,14 +187,95 @@ L1:
 .endm


+// umul32: dx:ax = dx:ax * 0x0600; every variant below overwrites bx and cx
+.macro  M_U4M_dxax_0x0600
+    // FIXME: compute clocks and optimize this
+#if 1
+        // (selected variant) multiply by 0x300, then double; code size: 18 bytes
+        // i086: > 140 clocks (mul needs 70 clocks)
+        // i286: >  26 clocks (mul needs 13 clocks)
+        mov     bx, 0x300
+        M_U4M_dxax_00bx     // dx:ax *= bx      v * 0x300 (macro uses cx as scratch)
+        shl     ax
+        rcl     dx          // dx:ax <<= 1      v * 0x600
+#elif 1
+        // (inactive while the "#if 1" above is in effect) single multiply; code size: 14 bytes
+        // i086: > 140 clocks (mul needs 70 clocks)
+        // i286: >  26 clocks (mul needs 13 clocks)
+        mov     bx, 0x600
+        M_U4M_dxax_00bx     // dx:ax *= bx      v * 0x600
+#else
+        // (inactive) shift-and-add variant without mul; code size: 16+8 == 24 bytes
+        // i086: 18+9 == 27 clocks
+        // i286: 16+8 == 24 clocks
+    // FIXME: can we further optimize this?
+        shl     ax
+        rcl     dx          // dx:ax <<= 1      v * 2
+        mov     cx, dx
+        mov     bx, ax      // cx:bx = dx:ax    v * 2
+        shl     ax
+        rcl     dx          // dx:ax <<= 1      v * 4
+        add     ax, bx
+        adc     dx, cx      // dx:ax += cx:bx   v * 6
+        M_shld_8            // dx:ax <<= 8      v * 0x600
+#endif
+.endm
+
+
 // shld: dx:ax <<= 8
 .macro  M_shld_8
+        // shifts by moving each byte up one position; code size: 8 bytes
+        // i086: 9 clocks (total for the four instructions below)
+        // i286: 8 clocks (total for the four instructions below)
         mov     dh, dl
         mov     dl, ah
         mov     ah, al
         xor     al, al
 .endm

+
+// shld: di:si <<= 8; bx and cx are free (the selected variant overwrites only cx)
+.macro  M_shld_disi_8_bxcx
+    // FIXME: compute clocks and optimize this
+#if 1
+        local   L1          // NOTE(review): presumably gives each expansion its own L1 - confirm assembler support
+        mov     cx, 8       // loop counter: eight 1-bit shifts
+L1:     shl     si          // low word <<= 1, top bit goes to carry
+        rcl     di          // high word <<= 1, carry comes in at bit 0
+        loop    L1          // repeat until cx reaches 0
+#else
+        mov     bx, di      // (inactive variant) byte shuffle via bx:cx
+        mov     cx, si      // bx:cx = di:si
+        mov     bh, bl
+        mov     bl, ch
+        mov     ch, cl
+        xor     cl, cl      // bx:cx <<= 8
+        mov     di, bx
+        mov     si, cx      // di:si = bx:cx
+#endif
+.endm
+
+
+// shld: di:ax <<= 8; bx and cx are free (the selected variant overwrites only cx)
+.macro  M_shld_diax_8_bxcx
+    // FIXME: compute clocks and optimize this
+#if 1
+        local   L1          // NOTE(review): presumably gives each expansion its own L1 - confirm assembler support
+        mov     cx, 8       // loop counter: eight 1-bit shifts
+L1:     shl     ax          // low word <<= 1, top bit goes to carry
+        rcl     di          // high word <<= 1, carry comes in at bit 0
+        loop    L1          // repeat until cx reaches 0
+#else
+        mov     bx, di      // (inactive variant) byte shuffle, high word via bx
+        mov     bh, bl
+        mov     bl, ah      // bx = new high word: (di << 8) | ah
+        mov     ah, al
+        xor     al, al      // ax <<= 8
+        mov     di, bx      // di:ax now holds the shifted value
+#endif
+.endm
+
+
 .macro  M_shld_8_bp h l
        mov     dx, word ptr[bp+h]
        mov     ax, word ptr[bp+l]