mirror of
https://github.com/RPCSX/llvm.git
synced 2025-01-25 05:34:59 +00:00
87c6c9abb3
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@35480 91177308-0d34-0410-b5e6-96231b3b80d8
175 lines
4.9 KiB
Plaintext
175 lines
4.9 KiB
Plaintext
//===---------------------------------------------------------------------===//
|
|
// Random ideas for the ARM backend (Thumb specific).
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
* Add support for compiling functions in both ARM and Thumb mode, then taking
|
|
the smallest.
|
|
* Add support for compiling individual basic blocks in thumb mode, when in a
|
|
larger ARM function. This can be used for presumed cold code, like paths
|
|
to abort (failure path of asserts), EH handling code, etc.
|
|
|
|
* Thumb doesn't have normal pre/post increment addressing modes, but you can
|
|
load/store 32-bit integers with pre/postinc by using load/store multiple
|
|
instrs with a single register.
|
|
|
|
* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
|
|
and cmp instructions can use high registers. Also, we can use them as
|
|
temporaries to spill values into.
|
|
|
|
* In thumb mode, short, byte, and bool preferred alignments are currently set
|
|
to 4 to accommodate ISA restriction (i.e. add sp, #imm, imm must be multiple
|
|
of 4).
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
Potential jumptable improvements:
|
|
|
|
* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
|
|
jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
|
|
function is even smaller. This also applies to ARM.
|
|
|
|
* Thumb jumptable codegen can improve given some help from the assembler. This
|
|
is what we generate right now:
|
|
|
|
.set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
|
|
LPCRELL0:
|
|
mov r1, #PCRELV0
|
|
add r1, pc
|
|
ldr r0, [r0, r1]
|
|
cpy pc, r0
|
|
.align 2
|
|
LJTI1_0_0:
|
|
.long LBB1_3
|
|
...
|
|
|
|
Note there is another pc relative add that we can take advantage of.
|
|
add r1, pc, #imm_8 * 4
|
|
|
|
We should be able to generate:
|
|
|
|
LPCRELL0:
|
|
add r1, LJTI1_0_0
|
|
ldr r0, [r0, r1]
|
|
cpy pc, r0
|
|
.align 2
|
|
LJTI1_0_0:
|
|
.long LBB1_3
|
|
|
|
if the assembler can translate the add to:
|
|
add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
|
|
|
|
Note the assembler also does something similar to constpool load:
|
|
LPCRELL0:
|
|
ldr r0, LCPI1_0
|
|
=>
|
|
ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
|
|
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
We compiles the following using a jump table.
|
|
|
|
define i16 @func_entry_2E_ce(i32 %i) {
|
|
newFuncRoot:
|
|
br label %entry.ce
|
|
|
|
bb12.exitStub: ; preds = %entry.ce
|
|
ret i16 0
|
|
|
|
bb4.exitStub: ; preds = %entry.ce, %entry.ce, %entry.ce
|
|
ret i16 1
|
|
|
|
bb9.exitStub: ; preds = %entry.ce, %entry.ce, %entry.ce
|
|
ret i16 2
|
|
|
|
bb.exitStub: ; preds = %entry.ce
|
|
ret i16 3
|
|
|
|
entry.ce: ; preds = %newFuncRoot
|
|
switch i32 %i, label %bb12.exitStub [
|
|
i32 0, label %bb4.exitStub
|
|
i32 1, label %bb9.exitStub
|
|
i32 2, label %bb4.exitStub
|
|
i32 3, label %bb4.exitStub
|
|
i32 7, label %bb9.exitStub
|
|
i32 8, label %bb.exitStub
|
|
i32 9, label %bb9.exitStub
|
|
]
|
|
}
|
|
|
|
gcc compiles to:
|
|
|
|
cmp r0, #9
|
|
@ lr needed for prologue
|
|
bhi L2
|
|
ldr r3, L11
|
|
mov r2, #1
|
|
mov r1, r2, asl r0
|
|
ands r0, r3, r2, asl r0
|
|
movne r0, #2
|
|
bxne lr
|
|
tst r1, #13
|
|
beq L9
|
|
L3:
|
|
mov r0, r2
|
|
bx lr
|
|
L9:
|
|
tst r1, #256
|
|
movne r0, #3
|
|
bxne lr
|
|
L2:
|
|
mov r0, #0
|
|
bx lr
|
|
L12:
|
|
.align 2
|
|
L11:
|
|
.long 642
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
When spilling in thumb mode and the sp offset is too large to fit in the ldr /
|
|
str offset field, we load the offset from a constpool entry and add it to sp:
|
|
|
|
ldr r2, LCPI
|
|
add r2, sp
|
|
ldr r2, [r2]
|
|
|
|
These instructions preserve the condition code which is important if the spill
|
|
is between a cmp and a bcc instruction. However, we can use the (potentially)
|
|
cheaper sequnce if we know it's ok to clobber the condition register.
|
|
|
|
add r2, sp, #255 * 4
|
|
add r2, #132
|
|
ldr r2, [r2, #7 * 4]
|
|
|
|
This is especially bad when dynamic alloca is used. The all fixed size stack
|
|
objects are referenced off the frame pointer with negative offsets. See
|
|
oggenc for an example.
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
We are reserving R3 as a scratch register under thumb mode. So if it is live in
|
|
to the function, we save / restore R3 to / from R12. Until register scavenging
|
|
is done, we should save R3 to a high callee saved reg at emitPrologue time
|
|
(when hasFP is true or stack size is large) and restore R3 from that register
|
|
instead. This allows us to at least get rid of the save to r12 everytime it is
|
|
used.
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
Poor codegen test/CodeGen/ARM/select.ll f7:
|
|
|
|
ldr r5, LCPI1_0
|
|
LPC0:
|
|
add r5, pc
|
|
ldr r6, LCPI1_1
|
|
ldr r2, LCPI1_2
|
|
cpy r3, r6
|
|
cpy lr, pc
|
|
bx r5
|
|
|
|
//===---------------------------------------------------------------------===//
|
|
|
|
Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
|
|
etc. Almost all Thumb instructions clobber condition code.
|