mirror of
https://github.com/RPCS3/llvm.git
synced 2024-11-25 12:49:50 +00:00
avoid going through a stack slot to convert from fpstack to xmm reg
if we are just going to store it back anyway. This improves things like: double foo(); void bar(double *P) { *P = foo(); } git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@45399 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
3c87285af6
commit
112dedc520
@ -1636,24 +1636,3 @@ a stride-4 IV, which would allow all the scales in the loop to go away.
|
||||
This would result in smaller code and more efficient microops.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
||||
We should be smarter about conversion from fpstack to XMM regs.
|
||||
|
||||
double foo();
|
||||
void bar(double *P) { *P = foo(); }
|
||||
|
||||
We compile that to:
|
||||
|
||||
_bar:
|
||||
subl $12, %esp
|
||||
call L_foo$stub
|
||||
fstpl (%esp)
|
||||
movl 16(%esp), %eax
|
||||
movsd (%esp), %xmm0
|
||||
movsd %xmm0, (%eax)
|
||||
addl $12, %esp
|
||||
ret
|
||||
|
||||
for example. The round trip through the stack slot (the fstpl to (%esp) followed by the movsd reload from (%esp)) is unneeded.
|
||||
|
||||
//===---------------------------------------------------------------------===//
|
||||
|
@ -33,7 +33,6 @@
|
||||
#include "llvm/CodeGen/SelectionDAG.h"
|
||||
#include "llvm/CodeGen/SSARegMap.h"
|
||||
#include "llvm/Support/MathExtras.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/ADT/SmallSet.h"
|
||||
@ -812,7 +811,6 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
|
||||
CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
|
||||
CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);
|
||||
|
||||
|
||||
SmallVector<SDOperand, 8> ResultVals;
|
||||
|
||||
// Copy all of the result registers out of their specified physreg.
|
||||
@ -838,17 +836,50 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
|
||||
// an XMM register.
|
||||
if ((X86ScalarSSEf32 && RVLocs[0].getValVT() == MVT::f32) ||
|
||||
(X86ScalarSSEf64 && RVLocs[0].getValVT() == MVT::f64)) {
|
||||
SDOperand StoreLoc;
|
||||
const Value *SrcVal = 0;
|
||||
int SrcValOffset = 0;
|
||||
|
||||
// Determine where to store the value. If the call result is directly
|
||||
// used by a store, see if we can store directly into the location. In
|
||||
// this case, we'll end up producing a fst + movss[load] + movss[store] to
|
||||
// the same location, and the two movss's will be nuked as dead. This
|
||||
// optimizes common things like "*D = atof(..)" to not need an
|
||||
// intermediate stack slot.
|
||||
if (SDOperand(TheCall, 0).hasOneUse() &&
|
||||
SDOperand(TheCall, 1).hasOneUse()) {
|
||||
// Ok, we have one use of the value and one use of the chain. See if
|
||||
// they are the same node: a store.
|
||||
if (StoreSDNode *N = dyn_cast<StoreSDNode>(*TheCall->use_begin())) {
|
||||
if (N->getChain().Val == TheCall && N->getValue().Val == TheCall &&
|
||||
!N->isVolatile() && !N->isTruncatingStore() &&
|
||||
N->getAddressingMode() == ISD::UNINDEXED) {
|
||||
StoreLoc = N->getBasePtr();
|
||||
SrcVal = N->getSrcValue();
|
||||
SrcValOffset = N->getSrcValueOffset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we weren't able to optimize the result, just create a temporary
|
||||
// stack slot.
|
||||
if (StoreLoc.Val == 0) {
|
||||
MachineFunction &MF = DAG.getMachineFunction();
|
||||
int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
|
||||
StoreLoc = DAG.getFrameIndex(SSFI, getPointerTy());
|
||||
}
|
||||
|
||||
// FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
|
||||
// shouldn't be necessary except that RFP cannot be live across
|
||||
// multiple blocks. When stackifier is fixed, they can be uncoupled.
|
||||
MachineFunction &MF = DAG.getMachineFunction();
|
||||
int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
|
||||
SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
|
||||
// multiple blocks (which could happen if a select gets lowered into
|
||||
// multiple blocks and scheduled in between them). When stackifier is
|
||||
// fixed, they can be uncoupled.
|
||||
SDOperand Ops[] = {
|
||||
Chain, RetVal, StackSlot, DAG.getValueType(RVLocs[0].getValVT()), InFlag
|
||||
Chain, RetVal, StoreLoc, DAG.getValueType(RVLocs[0].getValVT()), InFlag
|
||||
};
|
||||
Chain = DAG.getNode(X86ISD::FST, MVT::Other, Ops, 5);
|
||||
RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain, StackSlot, NULL, 0);
|
||||
RetVal = DAG.getLoad(RVLocs[0].getValVT(), Chain,
|
||||
StoreLoc, SrcVal, SrcValOffset);
|
||||
Chain = RetVal.getValue(1);
|
||||
}
|
||||
ResultVals.push_back(RetVal);
|
||||
|
15
test/CodeGen/X86/fp-stack-ret-store.ll
Normal file
15
test/CodeGen/X86/fp-stack-ret-store.ll
Normal file
@ -0,0 +1,15 @@
|
||||
; RUN: llvm-as < %s | llc | not grep movss
|
||||
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
|
||||
target triple = "i686-apple-darwin8"
|
||||
|
||||
; This should store directly into P from the FP stack. It should not
|
||||
; go through a stack slot to get there.
|
||||
|
||||
define void @bar(double* %P) {
|
||||
entry:
|
||||
%tmp = tail call double (...)* @foo( ) ; <double> [#uses=1]
|
||||
store double %tmp, double* %P, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
declare double @foo(...)
|
Loading…
Reference in New Issue
Block a user